In [1]:
# Based on the Hugging Face fine-tuning guide: https://huggingface.co/docs/transformers/training

In [2]:
# Start from a clean checkout: delete any previous clone, then fetch the project repository.
!rm -rf ojt_bert
!git clone https://github.com/wzwzeyal/ojt_bert

Cloning into 'ojt_bert'...
remote: Enumerating objects: 251, done.[K
remote: Counting objects: 100% (251/251), done.[K
remote: Compressing objects: 100% (177/177), done.[K
remote: Total 251 (delta 118), reused 198 (delta 68), pack-reused 0[K
Receiving objects: 100% (251/251), 9.34 MiB | 15.50 MiB/s, done.
Resolving deltas: 100% (118/118), done.


In [3]:
!pip install datasets transformers 



In [4]:
from datasets import load_dataset, DatasetDict, Dataset

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
import matplotlib.pyplot as plt
import torch
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer


2022-01-25 14:16:11.756070: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


# Configuration

In [5]:
# Checkpoint name handed to `from_pretrained` for both tokenizer and model.
# Alternative checkpoint: "avichr/heBERT_sentiment_analysis"
MODEL_CKPT = "onlplab/alephbert-base"

# Columns of the input frames that hold the text and its class label.
TEXT_COLUMN_NAME, LABEL_COLUMN_NAME = "comment", "label"

# Size of the classification head and number of fine-tuning epochs.
NUM_LABELS = 3
NUM_EPOCS = 4

# Preparing the datasets


In [6]:


# Only the first rows of each file are used (presumably to keep this run short);
# the caps are named so they are easy to find and change.
TRAIN_ROWS, TEST_ROWS = 512, 64

train_df = pd.read_csv('../data/for_sentiment/train_token_df.gz').head(TRAIN_ROWS)
# The held-out frame is read from the validation file.
test_df = pd.read_csv('../data/for_sentiment/val_token_df.gz').head(TEST_ROWS)


In [7]:
# raw_datasets = load_dataset("imdb")

In [8]:
# f1 = f1_score(y_true, y_pred, average='weighted') * 100
# f1

# # accuracy_score, precision_score, recall_score
# val_acc = accuracy_score(y_true, y_pred) * 100
# val_acc

# precision = precision_score(y_true, y_pred, average='weighted') * 100
# precision

# recall = recall_score(y_true, y_pred, average='weighted') * 100
# recall

# target_names = ['pos', 'neg', 'nut']
# print(classification_report(y_true, y_pred, target_names=target_names))

# # model.to('cuda')
# # trainer.predict(test_dataset = tokenized_datasets['test'])

In [9]:
class hf_trainer_wrapper():
    """Fine-tune and evaluate a Hugging Face sequence-classification model.

    The constructor turns the train/test DataFrames into a tokenized
    `DatasetDict`, loads the tokenizer and model for `model_ckpt`, and builds a
    `Trainer`. `train()` runs the fine-tuning and `evaluate()` prints a
    per-class classification report for the test frame.
    """

    # Report names used when the model has exactly three classes.
    # NOTE(review): assumes label ids 0/1/2 mean pos/neg/neutral - confirm against the data.
    DEFAULT_TARGET_NAMES = ['pos', 'neg', 'nut']

    def __init__(self,
                 model_ckpt,
                 train_df,
                 test_df,
                 val_df,
                 text_col_name,
                 label_col_name,
                 num_train_epochs,
                 num_labels=3,
                 output_dir=None):
        """Prepare datasets, tokenizer, model and Trainer.

        Args:
            model_ckpt: checkpoint name/path given to `from_pretrained`.
            train_df: DataFrame used for training.
            test_df: DataFrame used as the Trainer's eval set and by `evaluate()`.
            val_df: stored on the instance; not used by this class otherwise.
            text_col_name: column holding the input text.
            label_col_name: column holding the class label.
            num_train_epochs: number of fine-tuning epochs.
            num_labels: number of output classes of the classification head.
            output_dir: directory for Trainer outputs. Defaults to
                "<last path component of model_ckpt>-finetuned".
        """
        self.train_df = train_df
        self.test_df = test_df
        self.val_df = val_df

        self.text_col_name = text_col_name
        self.label_col_name = label_col_name
        self.num_labels = num_labels

        # Never use `model_ckpt` itself as the output directory: the Trainer
        # creates that directory locally, and a later `from_pretrained(model_ckpt)`
        # then resolves to the local folder instead of the Hub and fails looking
        # for config.json (presumably the cause of "Can't load config" on re-run).
        # If such a folder is left over from an earlier run, delete it.
        if output_dir is None:
            output_dir = f"{model_ckpt.rstrip('/').split('/')[-1]}-finetuned"
        self.output_dir = output_dir

        raw_datasets = DatasetDict()
        raw_datasets["train"] = Dataset.from_pandas(train_df[[text_col_name, label_col_name]])
        raw_datasets["test"] = Dataset.from_pandas(test_df[[text_col_name, label_col_name]])

        self.tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
        self.tokenized_datasets = raw_datasets.map(self.tokenize_function, batched=True)
        self.model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels)

        training_args = TrainingArguments(
            output_dir=output_dir,
            num_train_epochs=num_train_epochs,
        )
        self.trainer = Trainer(
            model=self.model,
            args=training_args,
            train_dataset=self.tokenized_datasets["train"],
            eval_dataset=self.tokenized_datasets["test"],
        )
        # Filled by `evaluate()` with the columns 'y_true' and 'y_pred'.
        self.df_res = pd.DataFrame()

    def tokenize_function(self, sentences):
        """Tokenize a batch of rows, padding/truncating to the model's max length."""
        return self.tokenizer(sentences[self.text_col_name], padding="max_length", truncation=True)

    def train(self):
        """Run the Trainer's fine-tuning loop."""
        self.trainer.train()

    def evaluate(self):
        """Predict every row of the test frame and print a classification report.

        The true and predicted labels are kept in `self.df_res`.
        """
        # Use the frame given to the constructor, not a notebook-level global.
        frame = self.test_df
        self.df_res = pd.DataFrame({
            'y_true': frame[self.label_col_name],
            'y_pred': frame[self.text_col_name].apply(self.sentiment_score),
        })

        # Passing `labels` explicitly keeps the report working when a class is
        # missing from a small test sample.
        labels = list(range(self.num_labels))
        if len(self.DEFAULT_TARGET_NAMES) == self.num_labels:
            target_names = self.DEFAULT_TARGET_NAMES
        else:
            target_names = [str(label) for label in labels]
        print(classification_report(self.df_res['y_true'], self.df_res['y_pred'],
                                    labels=labels, target_names=target_names))

    def sentiment_score(self, review):
        """Return the predicted class id (int) for a single text.

        Args:
            review: the text to classify.
        """
        # Inference mode: disable dropout (the model may still be in train mode
        # after `train()`) and skip building autograd graphs.
        self.model.eval()
        # Truncate so long texts cannot exceed the model's maximum input length.
        tokens = self.tokenizer.encode(review, return_tensors='pt', truncation=True)
        # Follow the model to whatever device it lives on instead of assuming CUDA.
        tokens = tokens.to(self.model.device)
        with torch.no_grad():
            result = self.model(tokens)
        return int(torch.argmax(result.logits))
        
    

In [10]:
# Build the wrapper: loads tokenizer and model, tokenizes both splits and creates the Trainer.
# The same frame is passed for `test_df` and `val_df`.
# NUM_LABELS is passed explicitly so the configured value is actually used
# instead of the constructor's default.
hf_trainer = hf_trainer_wrapper(
    model_ckpt=MODEL_CKPT,
    train_df=train_df,
    test_df=test_df,
    val_df=test_df,
    text_col_name=TEXT_COLUMN_NAME,
    label_col_name=LABEL_COLUMN_NAME,
    num_train_epochs=NUM_EPOCS,
    num_labels=NUM_LABELS,
)

file onlplab/alephbert-base/config.json not found


OSError: Can't load config for 'onlplab/alephbert-base'. Make sure that:

- 'onlplab/alephbert-base' is a correct model identifier listed on 'https://huggingface.co/models'
  (make sure 'onlplab/alephbert-base' is not a path to a local directory with something else, in that case)

- or 'onlplab/alephbert-base' is the correct path to a directory containing a config.json file



In [None]:
# Fine-tune the model: delegates to the wrapped Trainer's train().
hf_trainer.train()

In [None]:
# Predict each test row and print the per-class classification report.
hf_trainer.evaluate()