In [1]:
# BERT_BASE_DIR='bert_roman_urdu'

# !transformers-cli convert --model_type bert \
#   --tf_checkpoint $BERT_BASE_DIR/model.ckpt-100000 \
#   --config $BERT_BASE_DIR/config.json \
#   --pytorch_dump_output $BERT_BASE_DIR/pytorch_model.bin

In [2]:
import torch
import pandas as pd
import numpy as np
from datasets import load_metric
from sklearn.model_selection import train_test_split
from transformers import BertForSequenceClassification,BertForQuestionAnswering, BertTokenizerFast, Trainer, TrainingArguments
from transformers.trainer_utils import IntervalStrategy
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
DATA_ROOT = '../glue-urdu/'
TASK = 'QuAD'
task_params = {
    'NER':[],
    'NLI':[],
    'POS':[],
    'QuAD':{'batch_size':32, 'epochs':3},
    'SentiMix':{'batch_size':64, 'epochs':3}
}
assert TASK in task_params.keys()
hyperparams = task_params[TASK]
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_path = 'multilingual_vocab_augmentation_400000_steps'
if TASK == 'QuAD':
    model = BertForQuestionAnswering.from_pretrained(model_path)
else:
    model = BertForSequenceClassification.from_pretrained(model_path, num_labels=3)
tokenizer = BertTokenizerFast.from_pretrained(model_path)

Some weights of the model checkpoint at multilingual_vocab_augmentation_400000_steps were not used when initializing BertForQuestionAnswering: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForQuestionAnswering were not i

In [4]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item['labels'] = torch.tensor(self.labels[idx])
        else: item['labels'] = self
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

def answer_to_idx(answers, contexts):
    result = []
    for answer, context in zip(answers, contexts):
        start_idx = int(answer[0])
        end_idx = start_idx + len(answer[1])

        for i in range(1, 3):
            if context[start_idx - i:end_idx - i] == answer[1]:
                start_idx -= i
                end_idx -= i
                break
        result.append([start_idx, end_idx])

    return result

def char_to_token_position(encodings, answers):
    start_positions = []
    end_positions = []
    for i in range(len(answers)):
        start_positions.append(encodings.char_to_token(i, answers[i][0]) or 128)
        end_positions.append(encodings.char_to_token(i, answers[i][1] - 1) or 128)

    return dict(encodings, **{'start_positions': start_positions, 'end_positions': end_positions})


def getSentiMix(path):
    senti_mix_train = pd.read_csv(path+'SentiMix/Roman Urdu/SentiMix.train.ru.csv')
    senti_mix_test = pd.read_csv(path+'SentiMix/Roman Urdu/SentiMix.test.ru.csv')
    sentiment_categorical = senti_mix_train['sentiment'].astype('category').cat
    class_names = list(sentiment_categorical.categories)

    sentences_train = list(senti_mix_train.sentence)
    labels_train = list(sentiment_categorical.codes)

    X_test = list(senti_mix_test.sentence)
    y_test = list(senti_mix_test['sentiment'].astype('category').cat.codes)

    X_train, X_val, y_train, y_val = train_test_split(sentences_train, labels_train, test_size=0.1)
    encodings = []
    for data in [X_train,X_val,X_test]:
        encodings.append(tokenizer(data,padding='max_length', truncation=True, add_special_tokens = True, return_attention_mask = True, return_tensors = "pt", max_length=128))

    return {'train': CustomDataset(encodings[0],y_train),
            'validation': CustomDataset(encodings[1],y_val),
            'test': CustomDataset(encodings[2],y_test),
            'classes':class_names}

def getNLI(path):
    data_dict = {}
    for i in ['train','dev','test']:
        dataframe = pd.read_csv(path+'NLI/Roman Urdu/NLI.ru.{}.tsv'.format(i),sep='\t')
        sentences = dataframe[['premise','hypo']].to_numpy()
        categorical = dataframe['Label'].astype('category').cat
        labels = list(categorical.codes)
        data = list(map(str.strip,sentences[:,0])),list(map(str.strip,sentences[:,1]))
        encodings = tokenizer(*data,padding='max_length', truncation=True, add_special_tokens = True, return_attention_mask = True, return_tensors = "pt", max_length=128)
        data_dict[i] = CustomDataset(encodings,labels)
        if i == 'train':
            data_dict['classes'] = list(categorical.categories)

    return data_dict

def getNER(path):
    return

def getPOS(path):
    return

def getQuAD(path):
    dataframe = pd.read_csv(path + 'QuAD/Roman Urdu/QuAD.ru.csv', sep=r"\s\|\s", engine='python')
    sentences = dataframe[["paragraph", "question"]].to_numpy()
    answers = dataframe[["answer starting idx", "answer"]].to_numpy()

    y = answer_to_idx(answers, sentences[:, 0])

    X_train, X_test, y_train, y_test = train_test_split(sentences, y, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125,random_state=1)  # 0.125 * 0.8 = 0.1

    encodings = []
    for data in [X_train, X_val, X_test]:
        encodings.append(
            tokenizer(list(data[:, 0]), list(data[:, 1]), padding='max_length', truncation='only_first', add_special_tokens=True,
                      return_attention_mask=True,
                      return_tensors="pt", max_length=128))

    return {'train': CustomDataset(char_to_token_position(encodings[0], y_train)),
            'validation': CustomDataset(char_to_token_position(encodings[1], y_val)),
            'test': CustomDataset(char_to_token_position(encodings[2], y_test))}


In [5]:
fine_tune_dataset = locals()['get'+TASK](DATA_ROOT)
squad_metric = load_metric("squad")

def compute_metrics(pred):
    if TASK == 'QuAD':
        answer_start_ids = np.argmax(pred.predictions[0],axis=-1)
        answer_end_ids = np.argmax(pred.predictions[1],axis=-1)
        return squad_metric.compute(predictions=list(zip(answer_start_ids,answer_end_ids))
                                    ,references=list(zip(*pred.label_ids)))
    else:
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

training_args = TrainingArguments(
    output_dir='fine_tune_results/{}'.format(TASK),  # output directory
    overwrite_output_dir=True,
    num_train_epochs=hyperparams['epochs'],  # total number of training epochs
    per_device_train_batch_size=hyperparams['batch_size'],  # batch size per device during training
    per_device_eval_batch_size=hyperparams['batch_size'],  # batch size for evaluation
    warmup_steps=60,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    logging_steps=1,
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps = 10,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1'
)

trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=fine_tune_dataset['train'],  # training dataset
    eval_dataset=fine_tune_dataset['validation'],  # evaluation dataset
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)




In [6]:
trainer.train()

  import sys


Step,Training Loss,Validation Loss


KeyError: 23

In [None]:
trainer.evaluate(fine_tune_dataset['test'])

In [None]:
trainer.save_model('fine_tune_results/{}/best-checkpoint_f1'.format(TASK))

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs
