In [1]:
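# One-time conversion of the TensorFlow BERT checkpoint into PyTorch weights (pytorch_model.bin).
# Left commented out; uncomment and run only when the PyTorch weights need to be regenerated.
# BERT_BASE_DIR='bert_roman_urdu'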

# !transformers-cli convert --model_type bert \
#   --tf_checkpoint $BERT_BASE_DIR/model.ckpt-100000 \
#   --config $BERT_BASE_DIR/config.json \
#   --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin

In [26]:
import os

import pandas as pd
import torch
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import AdamW
from transformers import BertForSequenceClassification, BertTokenizer, Trainer, TrainingArguments
from transformers.trainer_utils import EvaluationStrategy

In [27]:
# Device handle kept for ad-hoc use; the Trainer places the model on its own device.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Directory holding a previously fine-tuned classifier, and the base pre-trained
# Roman-Urdu BERT directory (which also provides the tokenizer vocabulary).
FINE_TUNED_CHECKPOINT = 'results_senti_mix/best-checkpoint_f1'
BASE_MODEL_DIR = 'bert_roman_urdu'

# Resume from the fine-tuned checkpoint when it exists; otherwise start from the base
# weights so the notebook still runs top-to-bottom on a fresh workspace (the checkpoint
# is only produced by the save cell further down).
model_source = FINE_TUNED_CHECKPOINT if os.path.isdir(FINE_TUNED_CHECKPOINT) else BASE_MODEL_DIR
print(f'Loading classifier weights from: {model_source}')

model = BertForSequenceClassification.from_pretrained(model_source, num_labels=3)
tokenizer = BertTokenizer.from_pretrained(BASE_MODEL_DIR)

In [28]:
# Fixed seed so the train/validation split (and every metric derived from it) is reproducible.
SPLIT_SEED = 42

# Tokenizer settings shared by the train, validation and test splits.
TOKENIZER_KWARGS = dict(
    truncation=True,
    padding=True,
    add_special_tokens=True,
    return_attention_mask=True,
    return_tensors="pt",
    max_length=128,
)

senti_mix_train = pd.read_csv('fine_tuning_data/SentiMix_train_ru.csv')
senti_mix_test = pd.read_csv('fine_tuning_data/SentiMix_test_ru.csv')

# The training set defines the label vocabulary: class_names[i] is the label for code i.
sentiment_categorical = senti_mix_train['sentiment'].astype('category').cat
class_names = list(sentiment_categorical.categories)

sentences_train = list(senti_mix_train.sentence)
sentiment_train = sentiment_categorical.codes.tolist()  # plain Python ints

X_test = list(senti_mix_test.sentence)
# Encode the test labels with the TRAIN category order. Encoding the test column on its
# own would assign different integer codes whenever its set of labels differs from train.
y_test = pd.Categorical(senti_mix_test['sentiment'], categories=class_names).codes.tolist()
# pandas uses code -1 for values outside `categories` (including missing values).
assert -1 not in y_test, 'test set contains sentiment labels that are missing or absent from the training set'

X_train, X_val, y_train, y_val = train_test_split(
    sentences_train, sentiment_train, test_size=0.1, random_state=SPLIT_SEED
)

train_encodings = tokenizer(X_train, **TOKENIZER_KWARGS)
val_encodings = tokenizer(X_val, **TOKENIZER_KWARGS)
test_encodings = tokenizer(X_test, **TOKENIZER_KWARGS)

In [29]:
class SentiMixDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: mapping from feature name (e.g. ``input_ids``) to an indexable
            container holding one row per example; rows may be tensors or lists.
        labels: sequence with one integer class id per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict with one tensor per encoding key plus a ``labels`` tensor."""
        item = {}
        for key, val in self.encodings.items():
            row = val[idx]
            if isinstance(row, torch.Tensor):
                # Copy without torch.tensor(tensor), which emits a copy-construct UserWarning.
                item[key] = row.clone().detach()
            else:
                item[key] = torch.tensor(row)
        # Force int64: labels derived from pandas category codes are int8.
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)


# Wrap each (encodings, labels) pair in a SentiMixDataset, in train / validation / test order.
train_dataset, val_dataset, test_dataset = (
    SentiMixDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, y_train),
        (val_encodings, y_val),
        (test_encodings, y_test),
    )
)


In [30]:
def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for one evaluation pass.

    Args:
        pred: object exposing ``label_ids`` (reference labels) and ``predictions``
            (per-class scores; the predicted class is the argmax over the last axis).

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)

    # 'weighted' averages the per-class scores by each class's support.
    weighted_precision, weighted_recall, weighted_f1, _support = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )

    return dict(
        accuracy=accuracy_score(y_true, y_pred),
        f1=weighted_f1,
        precision=weighted_precision,
        recall=weighted_recall,
    )

# Evaluate and checkpoint on the same cadence. `load_best_model_at_end` can only restore
# a model that was actually saved; with the default save_steps (500) a short run may
# finish without ever writing a checkpoint, so no "best" model would be reloaded.
# NOTE(review): this writes a checkpoint every 10 steps; consider `save_total_limit`
# to cap disk usage once the installed Transformers version is confirmed to keep the
# best checkpoint during rotation.
EVAL_EVERY_N_STEPS = 10

training_args = TrainingArguments(
    output_dir='/media/usama/48E0E582E0E5769A/Users/AIM-LAB-SERVER/Desktop/results_senti_mix',  # output directory
    overwrite_output_dir=True,
    num_train_epochs=3,  # total number of training epochs
    per_device_train_batch_size=64,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    warmup_steps=60,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    logging_steps=1,  # log the training loss at every optimisation step
    evaluation_strategy=EvaluationStrategy.STEPS,
    eval_steps=EVAL_EVERY_N_STEPS,  # run validation every N steps
    save_steps=EVAL_EVERY_N_STEPS,  # checkpoint on the same steps that are evaluated
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1'  # weighted F1 reported by compute_metrics
)

# Collect the Trainer inputs in one place, then build the Trainer from them.
trainer_kwargs = dict(
    model=model,                      # the instantiated Transformers model to be trained
    args=training_args,               # training arguments, defined above
    train_dataset=train_dataset,      # training split
    eval_dataset=val_dataset,         # validation split used for periodic evaluation
    compute_metrics=compute_metrics,  # accuracy / F1 / precision / recall
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_kwargs)


In [7]:
# Run fine-tuning with the Trainer built above; the returned TrainOutput is shown as the cell result.
# NOTE(review): the recorded output below reports 'epoch': 5.0 and global_step=600, while
# training_args sets num_train_epochs=3 - the saved output appears to come from an earlier
# configuration; re-run to refresh it.
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
10,1.1377,1.10379,0.352941,0.318199,0.364762,0.352941,2.4932,681.844
20,1.0736,1.070576,0.414706,0.384199,0.424678,0.414706,2.5201,674.584
30,1.0453,1.031316,0.474706,0.463536,0.511092,0.474706,2.5246,673.373
40,0.9967,0.986562,0.501765,0.502409,0.507116,0.501765,2.5158,675.735
50,0.9373,0.949216,0.541765,0.54009,0.548061,0.541765,2.509,677.565
60,0.9627,0.909383,0.554118,0.551958,0.553031,0.554118,2.51,677.302
70,0.9928,0.885858,0.562353,0.549311,0.562537,0.562353,2.5102,677.238
80,0.8605,0.857858,0.580588,0.57758,0.581693,0.580588,2.529,672.209
90,0.8132,0.842583,0.598824,0.595259,0.59423,0.598824,2.5279,672.49
100,0.9451,0.832873,0.604706,0.601682,0.601114,0.604706,2.5309,671.71








TrainOutput(global_step=600, training_loss=0.6564594264576833, metrics={'train_runtime': 867.6076, 'train_samples_per_second': 0.692, 'total_flos': 3165964645599000, 'epoch': 5.0})

In [31]:
# Evaluate the trainer's current model on the test split; the result is a dict of
# metrics whose keys carry an 'eval_' prefix (loss plus the compute_metrics values).
trainer.evaluate(test_dataset)



{'eval_loss': 0.6919655203819275,
 'eval_accuracy': 0.6953333333333334,
 'eval_f1': 0.6963905876614596,
 'eval_precision': 0.6999825608500596,
 'eval_recall': 0.6953333333333334,
 'eval_runtime': 5.0704,
 'eval_samples_per_second': 591.667}

In [9]:
# Persist the trainer's current model under the 'best-checkpoint_f1' directory.
# NOTE(review): this is an absolute, machine-specific path, while the model-loading cell
# reads the relative path 'results_senti_mix/best-checkpoint_f1' - confirm both resolve
# to the same directory.
trainer.save_model('/media/usama/48E0E582E0E5769A/Users/AIM-LAB-SERVER/Desktop/results_senti_mix/best-checkpoint_f1')

In [10]:
# Load the TensorBoard notebook extension and open it on the `logs` directory
# (training_args sets logging_dir='./logs').
%load_ext tensorboard
%tensorboard --logdir logs


Reusing TensorBoard on port 6006 (pid 17868), started 1 day, 17:47:06 ago. (Use '!kill 17868' to kill it.)