In [1]:
# BERT_BASE_DIR='bert_roman_urdu'

# !transformers-cli convert --model_type bert \
#   --tf_checkpoint $BERT_BASE_DIR/model.ckpt-100000 \
#   --config $BERT_BASE_DIR/config.json \
#   --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin

In [1]:
import torch
import pandas as pd
from datasets import load_metric
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification,BertForQuestionAnswering, BertTokenizerFast, Trainer, TrainingArguments
from transformers.trainer_utils import EvaluationStrategy
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [2]:
# --- Run configuration ---------------------------------------------------
DATA_ROOT = '../glue-urdu/'
TASK = 'QuAD'
assert TASK in ['NER','NLI','POS','QuAD','SentiMix']

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_path = 'bert-base-uncased'

# QuAD needs a span-extraction head; every other task gets a 3-label
# sequence-classification head.
model = (
    BertForQuestionAnswering.from_pretrained(model_path)
    if TASK == 'QuAD'
    else BertForSequenceClassification.from_pretrained(model_path, num_labels=3)
)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

file bert-base-uncased/config.json not found


OSError: Can't load config for 'bert-base-uncased'. Make sure that:

- 'bert-base-uncased' is a correct model identifier listed on 'https://huggingface.co/models'

- or 'bert-base-uncased' is the correct path to a directory containing a config.json file



In [29]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings with optional labels.

    Args:
        encodings: Mapping from feature name to a per-example sequence. Must
            contain an ``'input_ids'`` entry. Both a tokenizer output object
            and a plain ``dict`` are accepted.
        labels: Optional per-example labels. When provided, every item gets an
            additional ``'labels'`` entry.
    """

    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return a tensor copy of ``value``."""
        # torch.tensor() on an existing tensor emits a copy-construct warning;
        # clone().detach() is the equivalent, warning-free copy.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        # Explicit None check: truthiness is ambiguous for arrays/tensors.
        if self.labels is not None:
            item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        # Key lookup (not attribute access) so plain dicts work as well.
        return len(self.encodings['input_ids'])

def answer_to_idx(answers, contexts):
    """Turn (start offset, answer text) pairs into [start, end) character spans.

    The annotated start offset is tried first. If the context does not contain
    the answer text there, offsets shifted left by one and by two characters
    are tried. If nothing matches, the annotated span is kept unchanged.

    Args:
        answers: Sequence of pairs ``(start_offset, answer_text)``.
        contexts: Sequence of context strings, aligned with ``answers``.

    Returns:
        A list of ``[start_idx, end_idx]`` lists, one per answer; ``end_idx``
        is exclusive.
    """
    result = []
    for answer, context in zip(answers, contexts):
        answer_text = answer[1]
        start_idx = int(answer[0])
        end_idx = start_idx + len(answer_text)

        # Shift 0 goes first so an already-correct offset is never moved onto
        # an earlier, overlapping occurrence of the same text.
        for shift in range(0, 3):
            shifted_start = start_idx - shift
            if shifted_start < 0:
                # A negative start would slice from the end of the string.
                break
            if context[shifted_start:end_idx - shift] == answer_text:
                start_idx, end_idx = shifted_start, end_idx - shift
                break
        result.append([start_idx, end_idx])

    return result

def char_to_token_position(encodings, answers, missing_index=128):
    """Attach token-level answer positions to a batch of encodings.

    Args:
        encodings: Mapping of tokenizer outputs that also provides
            ``char_to_token(batch_index, char_index)``.
        answers: Sequence of ``[start_char, end_char]`` spans (end exclusive),
            one per example in ``encodings``.
        missing_index: Position stored when a character has no token (for
            example because it was truncated away). The default of 128 equals
            the ``max_length`` used by the tokenizer calls in this file.
            NOTE(review): presumably chosen so the QA loss ignores these
            positions; confirm against the model implementation.

    Returns:
        A plain ``dict`` with every entry of ``encodings`` plus
        ``'start_positions'`` and ``'end_positions'`` lists.
    """
    def to_token(batch_idx, char_idx):
        token_idx = encodings.char_to_token(batch_idx, char_idx)
        # Compare against None explicitly so a valid token index 0 is kept.
        return missing_index if token_idx is None else token_idx

    start_positions = []
    end_positions = []
    for i, (start_char, end_char) in enumerate(answers):
        start_positions.append(to_token(i, start_char))
        # end_char is exclusive, so the last answer character is end_char - 1.
        end_positions.append(to_token(i, end_char - 1))

    return dict(encodings, start_positions=start_positions, end_positions=end_positions)


def getSentiMix(path):
    """Load the Roman Urdu SentiMix CSVs and build train/validation/test datasets.

    Args:
        path: Data root; the CSVs are read from ``path + 'SentiMix/Roman Urdu/'``.

    Returns:
        A dict with ``'train'``, ``'validation'`` and ``'test'``
        ``CustomDataset`` objects and ``'classes'``, the list of sentiment
        class names derived from the training file. Label ids index into
        ``'classes'``.
    """
    senti_mix_train = pd.read_csv(path+'SentiMix/Roman Urdu/SentiMix.train.ru.csv')
    senti_mix_test = pd.read_csv(path+'SentiMix/Roman Urdu/SentiMix.test.ru.csv')
    sentiment_categorical = senti_mix_train['sentiment'].astype('category').cat
    class_names = list(sentiment_categorical.categories)

    sentences_train = list(senti_mix_train.sentence)
    labels_train = list(sentiment_categorical.codes)

    X_test = list(senti_mix_test.sentence)
    # Encode test labels with the TRAINING categories so ids agree across
    # splits even when the test file lacks a class. Labels unseen in training
    # become -1.
    y_test = list(pd.Categorical(senti_mix_test['sentiment'], categories=class_names).codes)

    # Seeded like the QuAD loader so the validation split is reproducible.
    X_train, X_val, y_train, y_val = train_test_split(
        sentences_train, labels_train, test_size=0.1, random_state=1)

    def encode(texts):
        # Pad/truncate every sentence to 128 tokens.
        return tokenizer(texts, padding='max_length', truncation=True, add_special_tokens=True,
                         return_attention_mask=True, return_tensors="pt", max_length=128)

    return {'train': CustomDataset(encode(X_train), y_train),
            'validation': CustomDataset(encode(X_val), y_val),
            'test': CustomDataset(encode(X_test), y_test),
            'classes': class_names}

def getNLI(path):
    """Load the Roman Urdu NLI TSVs and build one dataset per split.

    Args:
        path: Data root; files are read from
            ``path + 'NLI/Roman Urdu/NLI.ru.<split>.tsv'``.

    Returns:
        A dict with ``'train'``, ``'dev'`` and ``'test'`` ``CustomDataset``
        objects and ``'classes'``, the label names derived from the training
        split. Label ids in every split index into ``'classes'``.
    """
    data_dict = {}
    class_names = None
    for split in ['train','dev','test']:
        dataframe = pd.read_csv(path+'NLI/Roman Urdu/NLI.ru.{}.tsv'.format(split),sep='\t')
        if class_names is None:
            # 'train' is processed first, so it defines the label vocabulary.
            class_names = list(dataframe['Label'].astype('category').cat.categories)
        # Code every split against the training categories so ids are
        # consistent. Labels unseen in training become -1.
        labels = list(pd.Categorical(dataframe['Label'], categories=class_names).codes)

        sentences = dataframe[['premise','hypo']].to_numpy()
        premises = list(map(str.strip, sentences[:, 0]))
        hypotheses = list(map(str.strip, sentences[:, 1]))
        encodings = tokenizer(premises, hypotheses, padding='max_length', truncation=True,
                              add_special_tokens=True, return_attention_mask=True,
                              return_tensors="pt", max_length=128)
        data_dict[split] = CustomDataset(encodings, labels)
        if split == 'train':
            data_dict['classes'] = class_names

    return data_dict

def getNER(path):
    """Placeholder loader for the NER task: not implemented, always yields ``None``."""
    return None

def getPOS(path):
    """Placeholder loader for the POS task: not implemented, always yields ``None``."""
    return None

def getQuAD(path):
    """Load the Roman Urdu QuAD file and build train/validation/test datasets.

    The file is parsed with `` | `` as the column separator. Answer character
    spans are computed with ``answer_to_idx`` and converted to token positions
    with ``char_to_token_position``.

    Args:
        path: Data root; the file is read from
            ``path + 'QuAD/Roman Urdu/QuAD.ru.csv'``.

    Returns:
        A dict with ``'train'``, ``'validation'`` and ``'test'``
        ``CustomDataset`` objects.
    """
    frame = pd.read_csv(path + 'QuAD/Roman Urdu/QuAD.ru.csv', sep=r"\s\|\s", engine='python')
    pairs = frame[["paragraph", "question"]].to_numpy()
    raw_answers = frame[["answer starting idx", "answer"]].to_numpy()

    spans = answer_to_idx(raw_answers, pairs[:, 0])

    # 20% held out for test, then 0.125 of the remaining 80% (10% overall)
    # for validation.
    X_fit, X_test, y_fit, y_test = train_test_split(pairs, spans, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_fit, y_fit, test_size=0.125, random_state=1)

    def build(split_pairs, split_spans):
        # Paragraph is the first sequence and the only one that may be truncated.
        encoded = tokenizer(
            list(split_pairs[:, 0]),
            list(split_pairs[:, 1]),
            padding='max_length',
            truncation='only_first',
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors="pt",
            max_length=128,
        )
        return CustomDataset(char_to_token_position(encoded, split_spans))

    return {
        'train': build(X_train, y_train),
        'validation': build(X_val, y_val),
        'test': build(X_test, y_test),
    }


In [30]:
# Look up the loader named 'get<TASK>' (e.g. getQuAD) in the notebook
# namespace and run it on DATA_ROOT.
fine_tune_dataset = locals()['get'+TASK](DATA_ROOT)
# SQuAD metric from `datasets`.
# NOTE(review): not referenced in the visible cells; confirm it is still needed.
squad_metric = load_metric("squad")

def compute_metrics(pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Handles both single-output models (one logits array) and multi-output
    models whose predictions and labels arrive as tuples. In the tuple case
    each output is arg-maxed separately and all outputs are pooled, so the
    scores are per-position rather than per-example.
    NOTE(review): assumes the QA model's predictions/labels arrive as
    (start, end) tuples; verify against the installed transformers version.

    Args:
        pred: Object exposing ``predictions`` and ``label_ids``.

    Returns:
        Dict with ``'accuracy'``, ``'f1'``, ``'precision'`` and ``'recall'``.
    """
    import numpy as np  # local import: numpy is not imported at notebook level

    labels = pred.label_ids
    predictions = pred.predictions
    if isinstance(predictions, (tuple, list)):
        # A tuple has no .argmax; reduce each head on its own, then pool.
        preds = np.concatenate([np.asarray(p).argmax(-1).ravel() for p in predictions])
        labels = np.concatenate([np.asarray(l).ravel() for l in labels])
    else:
        preds = predictions.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

training_args = TrainingArguments(
    # Where checkpoints and logs go.
    output_dir='fine_tune_results/{}'.format(TASK),
    overwrite_output_dir=True,
    logging_dir='./logs',
    logging_steps=1,
    # Optimisation schedule.
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=60,
    weight_decay=0.01,
    # Evaluate every 10 steps and restore the best-F1 model at the end.
    evaluation_strategy=EvaluationStrategy.STEPS,
    eval_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1',
)

trainer = Trainer(
    model=model,
    args=training_args,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    train_dataset=fine_tune_dataset['train'],
    eval_dataset=fine_tune_dataset['validation'],
)


In [7]:
# Fine-tune the model; per training_args, evaluation runs every `eval_steps` steps.
trainer.train()



Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
10,1.1377,1.10379,0.352941,0.318199,0.364762,0.352941,2.4932,681.844
20,1.0736,1.070576,0.414706,0.384199,0.424678,0.414706,2.5201,674.584
30,1.0453,1.031316,0.474706,0.463536,0.511092,0.474706,2.5246,673.373
40,0.9967,0.986562,0.501765,0.502409,0.507116,0.501765,2.5158,675.735
50,0.9373,0.949216,0.541765,0.54009,0.548061,0.541765,2.509,677.565
60,0.9627,0.909383,0.554118,0.551958,0.553031,0.554118,2.51,677.302
70,0.9928,0.885858,0.562353,0.549311,0.562537,0.562353,2.5102,677.238
80,0.8605,0.857858,0.580588,0.57758,0.581693,0.580588,2.529,672.209
90,0.8132,0.842583,0.598824,0.595259,0.59423,0.598824,2.5279,672.49
100,0.9451,0.832873,0.604706,0.601682,0.601114,0.604706,2.5309,671.71








TrainOutput(global_step=600, training_loss=0.6564594264576833, metrics={'train_runtime': 867.6076, 'train_samples_per_second': 0.692, 'total_flos': 3165964645599000, 'epoch': 5.0})

In [31]:
# Score the 'test' split with the same compute_metrics the trainer was built with.
trainer.evaluate(fine_tune_dataset['test'])



{'eval_loss': 0.6919655203819275,
 'eval_accuracy': 0.6953333333333334,
 'eval_f1': 0.6963905876614596,
 'eval_precision': 0.6999825608500596,
 'eval_recall': 0.6953333333333334,
 'eval_runtime': 5.0704,
 'eval_samples_per_second': 591.667}

In [9]:
# Save the trainer's current model under the task's results directory.
# NOTE(review): with load_best_model_at_end=True this is presumably the
# best-eval_f1 checkpoint; confirm.
trainer.save_model('fine_tune_results/{}/best-checkpoint_f1'.format(TASK))

In [10]:
# Load the TensorBoard notebook extension and open it on `logs`, the
# directory configured as logging_dir in training_args.
%load_ext tensorboard
%tensorboard --logdir logs


Reusing TensorBoard on port 6006 (pid 17868), started 1 day, 17:47:06 ago. (Use '!kill 17868' to kill it.)