In [None]:
# base on https://huggingface.co/docs/transformers/training

In [None]:
!rm -rf ojt_bert
!git clone https://github.com/wzwzeyal/ojt_bert

In [3]:
!pip install datasets transformers 



In [4]:
from datasets import load_dataset, DatasetDict, Dataset

import pandas as pd
import numpy as np
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, classification_report
import matplotlib.pyplot as plt
import torch
from torch.utils.tensorboard import SummaryWriter


# Configuration

In [5]:
MODEL_CKPT = "onlplab/alephbert-base"
TEXT_COLUMN_NAME = "comment"
LABEL_COLUMN_NAME = "label"
SENTIMENT_COLUMN_NAME = "sentiment"
HUGGINGFACE_LABEL_COLUMN_NAME = "labels"
NUM_LABELS = 3
TRAIN_BATCH_SIZE = 8
EVAL_BATCH_SIZE = 8
NUM_EPOCS = 1

In [6]:
writer = SummaryWriter(f'../runs/{MODEL_CKPT}')

# Preparing the datasets


In [7]:


train_df = pd.read_csv('../data/for_sentiment/train_token_df.gz').head(1024)
test_df = pd.read_csv('../data/for_sentiment/val_token_df.gz').head(128)
raw_datasets = DatasetDict()
raw_datasets["train"] = Dataset.from_pandas(train_df[[TEXT_COLUMN_NAME, LABEL_COLUMN_NAME]])
raw_datasets["test"] = Dataset.from_pandas(test_df[[TEXT_COLUMN_NAME, LABEL_COLUMN_NAME]])
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['comment', 'label'],
        num_rows: 1024
    })
    test: Dataset({
        features: ['comment', 'label'],
        num_rows: 128
    })
})

In [8]:
# raw_datasets = load_dataset("imdb")

In [9]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_CKPT)

In [10]:
def tokenize_function(examples):
    return tokenizer(examples[TEXT_COLUMN_NAME], padding="max_length", truncation=True)

In [11]:
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
tokenized_datasets

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['comment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1024
    })
    test: Dataset({
        features: ['comment', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 128
    })
})

In [12]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(MODEL_CKPT, num_labels=NUM_LABELS)

  dtype = data_type(0).dtype
Some weights of the model checkpoint at onlplab/alephbert-base were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpo

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments("test_trainer_2")

In [14]:
from transformers import Trainer


trainer = Trainer(
    model=model, args=training_args, train_dataset=tokenized_datasets["train"], eval_dataset=tokenized_datasets["test"]
)

In [15]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: comment.
***** Running training *****
  Num examples = 1024
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 48


Step,Training Loss




Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=48, training_loss=0.28185691436131793, metrics={'train_runtime': 57.5286, 'train_samples_per_second': 53.4, 'train_steps_per_second': 0.834, 'total_flos': 808284419260416.0, 'train_loss': 0.28185691436131793, 'epoch': 3.0})

In [16]:
def sentiment_score(review):
    tokens = tokenizer.encode(review, return_tensors='pt')
    result = model.to('cpu')(tokens)
    return int(torch.argmax(result.logits))

In [17]:
test_df[SENTIMENT_COLUMN_NAME] = test_df[TEXT_COLUMN_NAME].apply(lambda x: sentiment_score(x))

In [18]:
test_df.head()

Unnamed: 0.1,Unnamed: 0,comment,label,comment_clean,sentiment,comment_clean_len
0,0,מתבייש בך שאתה הנשיא שלי . חשבתי שיש בך קצת יו...,1,מתבייש בך שאתה הנשיא שלי חשבתי שיש בך קצת יותר...,1,72
1,1,מזל טוב לעם ישראל שהנשיא העשירי שנבחר הוא ראוב...,0,מזל טוב לעם ישראל שהנשיא העשירי שנבחר הוא ראוב...,0,94
2,2,מקסים 😊,0,מקסים,0,5
3,4,דניאל כך הכבוד,2,דניאל כך הכבוד,0,14
4,5,אחלה נשיא רובי ....,0,אחלה נשיא רובי,0,14


In [19]:
y_true = test_df[LABEL_COLUMN_NAME]
y_pred = test_df[SENTIMENT_COLUMN_NAME]

In [20]:
f1 = f1_score(y_true, y_pred, average='weighted') * 100
f1

83.04629741240758

In [21]:
# accuracy_score, precision_score, recall_score
val_acc = accuracy_score(y_true, y_pred) * 100
val_acc

83.59375

In [22]:
precision = precision_score(y_true, y_pred, average='weighted') * 100
precision

83.08092474866943

In [23]:
recall = recall_score(y_true, y_pred, average='weighted') * 100
recall

83.59375

In [24]:
target_names = ['pos', 'neg', 'nut']
print(classification_report(y_true, y_pred, target_names=target_names))

              precision    recall  f1-score   support

         pos       0.92      0.87      0.90        94
         neg       0.66      0.83      0.74        30
         nut       0.00      0.00      0.00         4

    accuracy                           0.84       128
   macro avg       0.53      0.57      0.54       128
weighted avg       0.83      0.84      0.83       128



In [25]:
# model.to('cuda')
# trainer.predict(test_dataset = tokenized_datasets['test'])

In [26]:
writer.add_scalars(
                    'metrics',
                    {
                        'val_accuracy' : val_acc,
                        'f1'          : f1,
                        'precision' : precision,
                        'recall' : recall,
                    },
                    )

In [27]:
writer.add_text(tag='classification_report', text_string=classification_report(y_true, y_pred, target_names=target_names))

In [28]:
writer.flush()

In [31]:
writer.add_scalars(
                    'metrics_2',
                    {
                        'val_accuracy' : val_acc * 0.5,
                        'f1'          : f1 * 0.5,
                        'precision' : precision * 0.5,
                        'recall' : recall * 0.5,
                    },
                    )

In [32]:
writer.flush()