In [1]:
MODEL_PATH = 'bilingual_vocab_original'

# !transformers-cli convert --model_type bert \
#   --tf_checkpoint $MODEL_PATH/model.ckpt-100000 \
#   --config $MODEL_PATH/config.json \
#   --pytorch_dump_output $MODEL_PATH/pytorch_model.bin

In [2]:
import torch
import pandas as pd
import numpy as np
from datasets import load_metric
from operator import itemgetter
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments,DataCollatorWithPadding,DataCollatorForTokenClassification,BertForQuestionAnswering,BertForSequenceClassification,BertForTokenClassification,BertTokenizerFast
from transformers.trainer_utils import IntervalStrategy
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
DATA_ROOT = '../glue-urdu/'
TASK = 'POS'
task_params = {
    'NER':{'best_model_metric':'eval_f1','entity_metrics':True,'batch_size':32, 'epochs':3},
    'NLI':{'batch_size':64, 'epochs':3},
    'POS':{'best_model_metric':'eval_f1','entity_metrics':True,'batch_size':16, 'epochs':3},
    'QuAD':{'best_model_metric':'eval_f1','batch_size':32, 'epochs':10},
    'SentiMix':{'best_model_metric':'eval_f1','batch_size':64, 'epochs':3}
}
type_datasets=['train','validation','test']
assert TASK in task_params.keys()
hyperparams = task_params[TASK]
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)

In [4]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if TASK == 'POS':
            item['labels'] = self.labels[idx]
        elif TASK != 'QuAD':
            item['labels'] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

def answer_to_idx(answers, contexts):
    result = []
    for answer, context in zip(answers, contexts):
        start_idx = int(answer[0])
        end_idx = start_idx + len(answer[1])

        for i in range(1, 3):
            if context[start_idx - i:end_idx - i+1] == answer[1]:
                start_idx -= i
                end_idx -= i
                break
        result.append([start_idx, end_idx, (start_idx,answer[1])])

    return result

def char_to_token_position(encodings, answers,dataset_type):
    start_positions = []
    end_positions = []
    answer_texts = []
    for i,answer in enumerate(answers):
        start_positions.append(encodings.char_to_token(i, answers[i][0]) or 128)
        end_positions.append(encodings.char_to_token(i, answers[i][1] - 1) or 128)
        answer_texts.append((answer[2],encodings['input_ids'][i]))

    return dict(encodings, **{'start_positions': start_positions, 'end_positions': end_positions}),answer_texts

def encode_tags(tags, encodings, unique_tags):
    labels = [[unique_tags.index(tag) for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        if sum(arr_offset[-2]) != 0:
            sub_len = sum((arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0))
        else:
            sub_len = len(doc_labels)
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels[:sub_len]
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

def common_NER_POS(token_tags,num_tags):
    unique_tags = list(set(tag for doc in token_tags[0][1] + token_tags[1][1] for tag in doc))
    assert len(unique_tags) == num_tags
    X_train, X_val, y_train, y_val = train_test_split(*token_tags[0], test_size=0.1)
    X_test, y_test = token_tags[1]

    encodings = []
    labels = []
    for data, label in [(X_train, y_train), (X_val, y_val), (X_test, y_test)]:
        encodings.append(
            tokenizer(list(data), is_split_into_words=True, return_offsets_mapping=True, padding='max_length',
                      truncation=True, add_special_tokens=True, return_attention_mask=True,
                      return_tensors="pt", max_length=128))
        labels.append(encode_tags(label, encodings[-1], unique_tags))
        encodings[-1].pop("offset_mapping")

    CustomDataset.label_list = unique_tags
    return {'train': CustomDataset(encodings[0], labels[0]),
            'validation': CustomDataset(encodings[1], labels[1]),
            'test': CustomDataset(encodings[2], labels[2])}

def getSentiMix(path):
    senti_mix_train = pd.read_csv(path+'SentiMix/Roman Urdu/SentiMix.train.ru.csv')
    senti_mix_test = pd.read_csv(path+'SentiMix/Roman Urdu/SentiMix.test.ru.csv')
    sentiment_categorical = senti_mix_train['sentiment'].astype('category').cat
    class_names = list(sentiment_categorical.categories)

    sentences_train = list(senti_mix_train.sentence)
    labels_train = list(sentiment_categorical.codes)

    X_test = list(senti_mix_test.sentence)
    y_test = list(senti_mix_test['sentiment'].astype('category').cat.codes)

    X_train, X_val, y_train, y_val = train_test_split(sentences_train, labels_train, test_size=0.1)
    encodings = []
    for data in [X_train,X_val,X_test]:
        encodings.append(tokenizer(data,padding='max_length', truncation=True, add_special_tokens = True, return_attention_mask = True, return_tensors = "pt", max_length=128))

    return {'train': CustomDataset(encodings[0],y_train),
            'validation': CustomDataset(encodings[1],y_val),
            'test': CustomDataset(encodings[2],y_test),
            'classes':class_names}

def getNLI(path):
    data_dict = {}
    for i in ['train','dev','test']:
        dataframe = pd.read_csv(path+'NLI/Roman Urdu/NLI.ru.{}.tsv'.format(i),sep='\t')
        sentences = dataframe[['premise','hypo']].to_numpy()
        categorical = dataframe['Label'].astype('category').cat
        labels = list(categorical.codes)
        data = list(map(str.strip,sentences[:,0])),list(map(str.strip,sentences[:,1]))
        encodings = tokenizer(*data,padding='max_length', truncation=True, add_special_tokens = True, return_attention_mask = True, return_tensors = "pt", max_length=128)
        data_dict[i] = CustomDataset(encodings,labels)
        if i == 'train':
            data_dict['classes'] = list(categorical.categories)

    return data_dict

def getNER(path):
    token_tags = []
    for split_type in ['train', 'test']:
        raw_docs = open(path + 'NER/Roman Urdu/NER.ru.{}'.format(split_type)).read().strip().split('\n\n')
        processed_docs = [list(zip(*[token_tag.split('\t') for token_tag in doc.split('\n')])) for doc in raw_docs]
        token_tags.append(list(zip(*processed_docs)))

    return common_NER_POS(token_tags,7)

def getPOS(path):
    token_tags = []
    for split_type in ['train','dev' ,'test']:
        raw_docs = open(path + 'POS/Roman Urdu/pos.ru.{}.conllu'.format(split_type)).read().strip().split('\n\n')
        processed_docs = [list(zip(*[itemgetter(1,3)(token_tag.split('\t')) for token_tag in doc.split('\n')[2:]])) for doc in raw_docs]
        token_tags.append(list(zip(*processed_docs)))

    return common_NER_POS(token_tags,17)

def getQuAD(path):
    dataframe = pd.read_csv(path + 'QuAD/Roman Urdu/QuAD.ru.csv', sep=r"\s\|\s", engine='python')
    sentences = dataframe[["paragraph", "question"]].to_numpy()
    answers = dataframe[["answer starting idx", "answer"]].to_numpy()

    y = answer_to_idx(answers, sentences[:, 0])

    X_train, X_test, y_train, y_test = train_test_split(sentences, y, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125,random_state=1)  # 0.125 * 0.8 = 0.1

    encodings = []
    for data in [X_train, X_val, X_test]:
        encodings.append(
            tokenizer(list(data[:, 0]), list(data[:, 1]), padding='max_length', truncation='only_first', add_special_tokens=True,
                      return_attention_mask=True,
                      return_tensors="pt", max_length=128))

    return {'train': CustomDataset(*char_to_token_position(encodings[0], y_train,0)),
            'validation': CustomDataset(*char_to_token_position(encodings[1], y_val,1)),
            'test': CustomDataset(*char_to_token_position(encodings[2], y_test,2))}


In [5]:
fine_tune_dataset = locals()['get'+TASK](DATA_ROOT)
if TASK == 'QuAD':
    metric = load_metric("squad")
elif TASK == 'NER' or TASK == 'POS':
    metric = load_metric("seqeval")

if TASK == 'QuAD':
    model = BertForQuestionAnswering.from_pretrained(MODEL_PATH)
elif TASK == 'NER' or TASK == 'POS':
    model = BertForTokenClassification.from_pretrained(MODEL_PATH,num_labels=len(CustomDataset.label_list))
else:
    model = BertForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=3)

def compute_metrics(pred):
    if TASK == 'QuAD':
        answer_start_ids = np.argmax(pred.predictions[0],axis=-1)
        answer_end_ids = np.argmax(pred.predictions[1],axis=-1)
        labels = fine_tune_dataset['validation'].labels
        predictions=[]
        references=[]
        for i,_id in enumerate(range(len(labels))):
            true_answer,encodings = labels[_id]
            predictions.append({'id':str(_id),'prediction_text':tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encodings[answer_start_ids[i]:answer_end_ids[i]]))})
            references.append({'id':str(_id),'answers':{'answer_start':[true_answer[0]],'text':[true_answer[1]]}})

        return metric.compute(predictions=predictions, references=references)
    elif TASK == 'NER' or TASK == 'POS':
        predictions, labels = pred
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions,true_labels = list(zip(*[zip(*map(lambda p_l:itemgetter(*p_l)(CustomDataset.label_list),
                                                           filter(lambda p_l: p_l[1] != -100, zip(prediction,label))))
                                                  for prediction, label in zip(predictions,labels)]))

        results = metric.compute(predictions=true_predictions, references=true_labels)
        if hyperparams['entity_metrics']:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }
    else:
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

training_args = TrainingArguments(
    output_dir='fine_tune_results/{}'.format(TASK),  # output directory
    overwrite_output_dir=True,
    num_train_epochs=hyperparams['epochs'],  # total number of training epochs
    per_device_train_batch_size=hyperparams['batch_size'],  # batch size per device during training
    per_device_eval_batch_size=hyperparams['batch_size'],  # batch size for evaluation
    warmup_steps=60,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    logging_steps=1,
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps = 10,
    # load_best_model_at_end=True,
    # metric_for_best_model=hyperparams['best_model_metric']
)

trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=fine_tune_dataset['train'],  # training dataset
    eval_dataset=fine_tune_dataset['validation'],  # evaluation dataset
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    data_collator=DataCollatorForTokenClassification(tokenizer) if TASK == 'NER' or TASK == 'POS' else DataCollatorWithPadding(tokenizer)
)


Some weights of the model checkpoint at bilingual_vocab_original were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized f

In [6]:
trainer.train()

  import sys
  sequence_length = torch.tensor(batch["input_ids"]).shape[1]
  batch = {k: torch.tensor(v, dtype=torch.int64) for k, v in batch.items()}


Step,Training Loss,Validation Loss,Art Precision,Art Recall,Art F1,Art Number,Conj Precision,Conj Recall,Conj F1,Conj Number,Dj Precision,Dj Recall,Dj F1,Dj Number,Dp Precision,Dp Recall,Dp F1,Dp Number,Dv Precision,Dv Recall,Dv F1,Dv Number,Erb Precision,Erb Recall,Erb F1,Erb Number,Et Precision,Et Recall,Et F1,Et Number,Ntj Precision,Ntj Recall,Ntj F1,Ntj Number,Oun Precision,Oun Recall,Oun F1,Oun Number,Ron Precision,Ron Recall,Ron F1,Ron Number,Ropn Precision,Ropn Recall,Ropn F1,Ropn Number,Um Precision,Um Recall,Um F1,Um Number,Unct Precision,Unct Recall,Unct F1,Unct Number,Ux Precision,Ux Recall,Ux F1,Ux Number,Ym Precision,Ym Recall,Ym F1,Ym Number,Precision,Recall,F1,Number,Overall Precision,Overall Recall,Overall F1,Overall Accuracy,Runtime,Samples Per Second
10,2.6952,2.631105,0.012195,0.022624,0.015848,221,0.042056,0.019737,0.026866,456,0.20438,0.037037,0.06271,756,0.197195,0.253044,0.221655,1889,0.045455,0.010417,0.016949,96,0.157143,0.129828,0.142186,932,0.02027,0.064171,0.030809,187,0.0,0.0,0.0,1,0.238733,0.184557,0.208178,2124,0.01626,0.004914,0.007547,407,0.019841,0.050432,0.028478,694,0.0,0.0,0.0,158,0.0,0.0,0.0,517,0.0,0.0,0.0,558,0.0,0.0,0.0,18,0.0,0.0,0.0,2,0.130845,0.12012,0.125253,0.235272,2.3124,175.141
20,2.1429,2.043103,0.0,0.0,0.0,221,0.0,0.0,0.0,456,0.0,0.0,0.0,756,0.37258,0.621493,0.465873,1889,0.0,0.0,0.0,96,0.453846,0.063305,0.111111,932,0.0,0.0,0.0,187,0.0,0.0,0.0,1,0.247326,0.359228,0.292955,2124,0.0,0.0,0.0,407,0.020833,0.001441,0.002695,694,0.0,0.0,0.0,158,0.0,0.0,0.0,517,0.0,0.0,0.0,558,0.0,0.0,0.0,18,0.0,0.0,0.0,2,0.311302,0.221495,0.25883,0.403761,2.2984,176.207
30,1.4516,1.303161,0.0,0.0,0.0,221,1.0,0.019737,0.03871,456,0.0,0.0,0.0,756,0.588283,0.776072,0.669254,1889,0.0,0.0,0.0,96,0.533898,0.540773,0.537313,932,0.0,0.0,0.0,187,0.0,0.0,0.0,1,0.404339,0.48258,0.440009,2124,0.964912,0.135135,0.237069,407,0.212121,0.302594,0.249406,694,0.0,0.0,0.0,158,0.995074,0.781431,0.875406,517,0.611632,0.584229,0.597617,558,0.0,0.0,0.0,18,0.0,0.0,0.0,2,0.501316,0.443545,0.470664,0.62588,2.3966,168.992
40,0.8904,0.798071,0.977273,0.38914,0.556634,221,0.97043,0.791667,0.871981,456,0.437788,0.125661,0.195272,756,0.84063,0.904711,0.871494,1889,0.0,0.0,0.0,96,0.832753,0.769313,0.799777,932,0.0,0.0,0.0,187,0.0,0.0,0.0,1,0.528685,0.56403,0.545786,2124,0.577731,0.675676,0.622877,407,0.314911,0.435159,0.365396,694,0.0,0.0,0.0,158,0.946154,0.951644,0.948891,517,0.799672,0.87276,0.834619,558,0.0,0.0,0.0,18,0.0,0.0,0.0,2,0.681109,0.63465,0.657059,0.750093,2.3104,175.291
50,0.6446,0.580439,0.964029,0.606335,0.744444,221,0.953162,0.892544,0.921857,456,0.5168,0.427249,0.467777,756,0.931759,0.939651,0.935688,1889,0.0,0.0,0.0,96,0.902838,0.887339,0.895022,932,0.732824,0.513369,0.603774,187,0.0,0.0,0.0,1,0.60767,0.678908,0.641316,2124,0.636197,0.855037,0.72956,407,0.475269,0.318444,0.381363,694,0.859649,0.310127,0.455814,158,0.984645,0.992263,0.988439,517,0.887348,0.917563,0.902203,558,0.0,0.0,0.0,18,0.0,0.0,0.0,2,0.765519,0.737245,0.751116,0.813264,2.3243,174.248
60,0.5423,0.457513,0.954248,0.660633,0.780749,221,0.949045,0.980263,0.964401,456,0.544226,0.585979,0.564331,756,0.953183,0.959238,0.956201,1889,0.0,0.0,0.0,96,0.940331,0.91309,0.926511,932,0.848276,0.657754,0.740964,187,0.0,0.0,0.0,1,0.744544,0.706685,0.725121,2124,0.818182,0.884521,0.850059,407,0.48467,0.592219,0.533074,694,0.80916,0.670886,0.733564,158,0.967864,0.990329,0.978967,517,0.905498,0.944444,0.924561,558,0.0,0.0,0.0,18,0.0,0.0,0.0,2,0.810185,0.802906,0.806529,0.855409,2.3119,175.18


  _warn_prf(average, modifier, msg_start, len(result))


KeyboardInterrupt: 

In [None]:
trainer.evaluate(fine_tune_dataset['test'])

In [None]:
trainer.save_model('fine_tune_results/{}/best-checkpoint_f1'.format(TASK))

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs
