In [1]:
# Directory of the pretrained BERT checkpoint; the tokenizer and the task model
# are both loaded from this path further down.
MODEL_PATH = 'multilingual_vocab_extension'

# Converts the TensorFlow checkpoint in MODEL_PATH into a PyTorch weight file.
# Left commented out — presumably a one-off step that has already been run
# (TODO confirm); uncomment to regenerate pytorch_model.bin.
# !transformers-cli convert --model_type bert \
#   --tf_checkpoint $MODEL_PATH/model.ckpt-100000 \
#   --config $MODEL_PATH/config.json \
#   --pytorch_dump_output $MODEL_PATH/pytorch_model.bin

In [2]:
import math
import torch
import pprint
import random
import collections
import numpy as np
import pandas as pd
from operator import itemgetter
from datasets import load_metric
from sklearn.model_selection import train_test_split
from transformers.trainer_utils import IntervalStrategy
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import Trainer, TrainingArguments,DataCollatorWithPadding,DataCollatorForTokenClassification,BertForQuestionAnswering,BertForSequenceClassification,BertForTokenClassification,BertTokenizerFast

In [3]:
# --- Task selection and output location ---
DATA_ROOT = '../glue-urdu/'
TASK = 'SentiMix'
SAVE_PATH = 'fine_tune_results/{}/{}'.format(TASK, MODEL_PATH)

# --- Per-task training settings, keyed by task name ---
# 'entity_metrics' is only defined for the token-classification tasks (NER, POS).
task_params = {
    'NER': {
        'best_model_metric': 'eval_f1',
        'entity_metrics': False,
        'batch_size': 30,
        'epochs': 6,
        'eval_steps': 50,
    },
    'NLI': {
        'best_model_metric': 'eval_f1',
        'batch_size': 30,
        'epochs': 5,
        'eval_steps': 100,
    },
    'POS': {
        'best_model_metric': 'eval_f1',
        'entity_metrics': False,
        'batch_size': 30,
        'epochs': 30,
        'eval_steps': 10,
    },
    'QuAD': {
        'best_model_metric': 'eval_f1',
        'batch_size': 30,
        'epochs': 10,
        'eval_steps': 10,
    },
    'SentiMix': {
        'best_model_metric': 'eval_f1',
        'batch_size': 30,
        'epochs': 5,
        'eval_steps': 50,
    },
}

# --- Tokenization limits ---
max_length = 128  # Maximum number of tokens in one encoded feature (question and context for QuAD).
doc_stride = 32  # Overlap between two consecutive windows when a long context has to be split.

# Fail early on an unknown task name, then pick that task's settings.
assert TASK in task_params
hyperparams = task_params[TASK]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
print("Training on: ",device)

Training on:  cuda


In [4]:
class CustomDataset(torch.utils.data.Dataset):
    """Map-style dataset over tokenizer encodings and labels for the task selected by ``TASK``.

    Behaviour depends on the module-level ``TASK`` constant:

    * ``'NLI'``: ``encodings`` is a sequence of raw examples that are tokenized
      lazily with ``tokenizer_fn`` on every access.
    * any other task: ``encodings`` is a mapping of column name to a per-example
      indexable (tensor or list), tokenized up front.

    Args:
        encodings: raw examples (NLI) or a mapping of pre-computed encodings.
        labels: per-example labels. For ``'QuAD'`` they are not attached to the
            items (the start/end positions already live in ``encodings``).
        tokenizer_fn: callable turning one raw example into a mapping of
            tensors; only used when ``TASK == 'NLI'``.
    """

    def __init__(self, encodings, labels,tokenizer_fn=None):
        self.encodings = encodings
        self.labels = labels
        self.tokenizer_fn = tokenizer_fn

    @staticmethod
    def _to_tensor(value):
        """Return an independent tensor copy of ``value``.

        ``torch.tensor(existing_tensor)`` emits a UserWarning on every call;
        ``clone().detach()`` is the recommended equivalent, so it is used for
        values that are already tensors. Anything else is converted as before.
        """
        if torch.is_tensor(value):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        if TASK == 'NLI':
            # Tokenize on the fly and drop the leading batch dimension of size 1.
            item = {key: torch.squeeze(val) for key, val in self.tokenizer_fn(self.encodings[idx]).items()}
        else:
            item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}

        if TASK == 'POS' or TASK == 'NER':
            # Per-token label lists are handed over untouched.
            item['labels'] = self.labels[idx]
        elif TASK != 'QuAD':
            item['labels'] = torch.tensor(self.labels[idx])

        return item

    def __len__(self):
        if TASK == 'NLI':
            # Encodings are produced lazily for NLI, so count the labels instead.
            return len(self.labels)
        else:
            return len(self.encodings['input_ids'])

def random_bias(from_num,to_num,high=True):
    """Draw a random integer in ``[from_num, to_num]`` that is skewed toward one end.

    The absolute difference of two uniform draws is concentrated near 0, so the
    result leans toward ``to_num`` when ``high`` is true and toward ``from_num``
    otherwise.

    Args:
        from_num: inclusive lower bound.
        to_num: inclusive upper bound.
        high: bias toward ``to_num`` (True) or ``from_num`` (False).

    Returns:
        An int between ``from_num`` and ``to_num``, both inclusive.
    """
    spread = abs(random.random() - random.random())
    fraction = abs(int(high) - spread)
    value = math.floor(fraction * (1 + to_num - from_num) + from_num)
    # fraction can reach exactly 1.0 (identical draws, or float rounding), which
    # would otherwise yield to_num + 1; clamp to keep the bound inclusive.
    return min(value, to_num)

def preprocess_data(sentences, answers, max_untokenized_len = 100):
    """Build (question, context) inputs and answer records, trimming over-long contexts.

    The answer start is always recomputed as the first occurrence of the answer
    text in the context; the start index supplied in ``answers`` is ignored.

    Examples whose question plus answer already reach ``max_untokenized_len``
    whitespace-separated words are dropped. If question plus context is too
    long, a random window of context words around the answer is kept instead
    (the window is re-joined with single spaces). Note that the window size is
    checked against ``max_untokenized_len`` without counting the question.

    Args:
        sentences: iterable of ``(question, context)`` pairs.
        answers: iterable of ``(start_index, answer_text)`` pairs, aligned with
            ``sentences``.
        max_untokenized_len: word budget described above.

    Returns:
        ``(X, y)`` where ``X`` is an array of ``[question, context]`` rows and
        ``y`` is an object array of shape ``(n, 2)`` whose rows are
        ``[(answer_start_char, answer_text), context]``.

    Raises:
        ValueError: if the answer text does not occur in the context (or in the
            re-joined window, e.g. when the answer starts mid-word or spans
            irregular whitespace).
    """
    X=[]
    y=[]
    for sentence,answer in zip(sentences,answers):
        question, context = sentence[0], sentence[1]
        answer_text = answer[1]
        start_idx = context.index(answer_text)
        end_idx = start_idx + len(answer_text)
        answer_words = len(answer_text.split())
        question_words = len(question.split())

        if answer_words + question_words >= max_untokenized_len:
            continue

        context_words = context.split()
        if question_words + len(context_words) < max_untokenized_len:
            X.append((question, context))
            y.append(((start_idx, answer_text), context))
            continue

        # Context too long: keep the answer plus a random number of words on each side.
        words_before = len(context[:start_idx].split())
        words_after = len(context[end_idx:].split())
        words_through_answer = len(context[:end_idx].split())
        while True:
            keep_before = random_bias(0, words_before)
            keep_after = random_bias(0, words_after)
            if keep_before + keep_after + answer_words < max_untokenized_len:
                window = ' '.join(context_words[words_before - keep_before:words_through_answer + keep_after])
                X.append([question, window])
                y.append([(window.index(answer_text), answer_text), window])
                break

    # Each y record is ((int, str), str). Letting np.array() infer a shape from
    # that ragged nesting raises ValueError on NumPy >= 1.24, so fill an object
    # array cell by cell to get the same (n, 2) layout on every version.
    y_array = np.empty((len(y), 2), dtype=object)
    for row, (span, context) in enumerate(y):
        y_array[row, 0] = span
        y_array[row, 1] = context

    return np.array(X), y_array

def prepare_features(data, answers):
    """Tokenize question/context pairs and label every feature with answer token positions.

    Uses the module-level ``tokenizer``, ``max_length`` and ``doc_stride``.

    Args:
        data: 2-D indexable where column 0 holds questions and column 1 holds
            contexts (accessed as ``data[:, 0]`` / ``data[:, 1]``).
        answers: indexable by example index; each entry is read as
            ``answer[0][0]`` (answer start character in the context) and
            ``answer[0][1]`` (answer text).

    Returns:
        ``(tokenized_examples, (answers, features_dict))``. ``tokenized_examples``
        is the tokenizer output with ``start_positions`` / ``end_positions``
        lists added and the overflow/offset entries removed. ``features_dict``
        carries ``offset_mapping``, ``example_id`` and ``input_ids`` for use at
        evaluation time.

    Side effects:
        For features whose example has a non-empty answer, the offsets of every
        token before the first context token are overwritten with -1 in
        ``offset_mapping``. Features of examples with an empty answer text keep
        their original offsets.
    """
    # Tokenize our examples with truncation and padding, but keep the overflows using a stride. This results
    # in one example possibly giving several features when a context is long, each of those features having a
    # context that overlaps a bit the context of the previous feature.
    tokenized_examples = tokenizer(
        list(data[:, 0]),
        list(data[:, 1]),
        padding='max_length',
        truncation='only_second',
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors="pt",
        max_length=max_length,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        stride=doc_stride
    )

    # Since one example might give us several features if it has a long context, we need a map from a feature to
    # its corresponding example. This key gives us just that.
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    # The offset mappings will give us a map from token to character position in the original context. This will
    # help us compute the start_positions and end_positions.
    offset_mapping = tokenized_examples.pop("offset_mapping")

    # Let's label those examples!
    tokenized_examples["start_positions"] = []
    tokenized_examples["end_positions"] = []

    for i, offsets in enumerate(offset_mapping):
        # We will label impossible answers with the index of the CLS token.
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = list(input_ids).index(tokenizer.cls_token_id)

        # Grab the sequence corresponding to that example (to know what is the context and what is the question).
        sequence_ids = tokenized_examples.sequence_ids(i)

        # One example can give several spans, this is the index of the example containing this span of text.
        sample_index = sample_mapping[i]
        answer = answers[sample_index]
        # If no answers are given, set the cls_index as answer.
        if len(answer[0][1]) == 0:
            tokenized_examples["start_positions"].append(cls_index)
            tokenized_examples["end_positions"].append(cls_index)
        else:
            # Start/end character index of the answer in the text.
            start_char = answer[0][0]
            end_char = start_char + len(answer[0][1])

            # Start token index of the current span in the text (first token of the second sequence).
            token_start_index = sequence_ids.index(1)

            # Mark every token before the context with offset (-1, -1) so that evaluation can
            # recognise them as non-context tokens.
            # NOTE(review): `offsets` presumably is a view of offset_mapping[i] (tensor output), so this
            # write is visible through `offsets` below as well — confirm.
            offset_mapping[i][:token_start_index] = torch.tensor([[-1] * 2] * token_start_index)

            # End token index of the current span in the text (last token of the second sequence).
            token_end_index = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

            # Detect if the answer is out of the span (in which case this feature is labeled with the CLS index).
            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                tokenized_examples["start_positions"].append(cls_index)
                tokenized_examples["end_positions"].append(cls_index)
            else:
                # Otherwise move the token_start_index and token_end_index to the two ends of the answer.
                # Note: we could go after the last offset if the answer is the last word (edge case).
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                # The loop overshoots by one token, hence the -1.
                tokenized_examples["start_positions"].append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                # Same overshoot in the other direction, hence the +1.
                tokenized_examples["end_positions"].append(token_end_index + 1)

    # Everything compute-time post-processing needs to map token predictions back to text.
    features_dict = {
        'offset_mapping':offset_mapping,
        'example_id': sample_mapping.numpy(),
        'input_ids':tokenized_examples['input_ids']
    }
    return tokenized_examples, (answers, features_dict )

def encode_tags(tags, encodings, unique_tags):
    """Spread word-level tags over sub-word tokens, using -100 for tokens without a label.

    Args:
        tags: one sequence of tag strings per document.
        encodings: object exposing ``offset_mapping`` with one sequence of
            ``(start, end)`` offsets per document.
        unique_tags: list of tag strings; a tag's position in it is its id.

    Returns:
        One list of ints per document, as long as that document's offsets. Tokens
        whose offset starts at 0 and has a non-zero end receive the next tag id;
        every other position holds -100.
    """
    tag_ids_per_doc = [[unique_tags.index(tag) for tag in doc] for doc in tags]

    encoded_labels = []
    for tag_ids, doc_offset in zip(tag_ids_per_doc, encodings.offset_mapping):
        offsets = np.array(doc_offset)
        # Tokens whose offset is (0, n) with n != 0 are the ones that get a label.
        labelled_positions = (offsets[:, 0] == 0) & (offsets[:, 1] != 0)

        # A non-zero offset on the second-to-last token presumably means the
        # sequence was cut at max length; then only as many tags as there are
        # labelled tokens can be used.
        if sum(offsets[-2]) != 0:
            usable = sum(labelled_positions)
        else:
            usable = len(tag_ids)

        token_labels = np.full(len(doc_offset), -100, dtype=int)
        token_labels[labelled_positions] = tag_ids[:usable]
        encoded_labels.append(token_labels.tolist())

    return encoded_labels

def common_NER_POS(token_tags,num_tags):
    """Build train/dev/test token-classification datasets from raw token/tag splits.

    Args:
        token_tags: list of splits, each a ``(documents_tokens, documents_tags)``
            pair. With two splits they are taken as (train, test) and 10% of
            train is held out as dev. With three splits they are taken as
            (train, dev, test) and used as given.
        num_tags: number of distinct tags expected across all splits.

    Returns:
        Dict with ``'train'``, ``'dev'`` and ``'test'`` ``CustomDataset`` objects.

    Raises:
        AssertionError: if the number of distinct tags differs from ``num_tags``.

    Side effects:
        Stores the tag list on ``CustomDataset.label_list``; a tag's index in
        that list is its label id.
    """
    # Sorted so that the tag -> id mapping is identical across runs; iterating a
    # set of strings is not stable between interpreter sessions.
    unique_tags = sorted({tag for split in token_tags for doc in split[1] for tag in doc})
    assert len(unique_tags) == num_tags

    if len(token_tags) >= 3:
        # An explicit dev split was supplied: use it instead of carving one out of train.
        X_train, y_train = token_tags[0]
        X_val, y_val = token_tags[1]
    else:
        X_train, X_val, y_train, y_val = train_test_split(*token_tags[0], test_size=0.1)
    # The test split is always the last one (index 1 only when two splits are given).
    X_test, y_test = token_tags[-1]

    encodings = []
    labels = []
    for data, label in [(X_train, y_train), (X_val, y_val), (X_test, y_test)]:
        encodings.append(
            tokenizer(list(data), is_split_into_words=True, return_offsets_mapping=True, padding='max_length',
                      truncation=True, add_special_tokens=True, return_attention_mask=True,
                      return_tensors="pt", max_length=max_length))
        labels.append(encode_tags(label, encodings[-1], unique_tags))
        # Offsets are only needed to align the labels; drop them from the model inputs.
        encodings[-1].pop("offset_mapping")

    CustomDataset.label_list = unique_tags
    return {'train': CustomDataset(encodings[0], labels[0]),
            'dev': CustomDataset(encodings[1], labels[1]),
            'test': CustomDataset(encodings[2], labels[2])}

def getSentiMix(path):
    """Load the Roman Urdu SentiMix CSVs and return tokenized train/dev/test datasets.

    Label ids are derived from the categories found in the training file and
    the same mapping is applied to the test file, so an id always denotes the
    same sentiment in every split. 10% of the training data is held out as dev.

    Args:
        path: dataset root; the CSV locations are appended to it.

    Returns:
        Dict with ``'train'``, ``'dev'``, ``'test'`` ``CustomDataset`` objects
        and ``'classes'``, the list of sentiment names indexed by label id.

    Raises:
        ValueError: if the test file contains a sentiment value (or a missing
            value) that does not occur in the training file.
    """
    senti_mix_train = pd.read_csv(path+'SentiMix/Roman Urdu/SentiMix.train.ru.csv')
    senti_mix_test = pd.read_csv(path+'SentiMix/Roman Urdu/SentiMix.test.ru.csv')
    sentiment_categorical = senti_mix_train['sentiment'].astype('category').cat
    class_names = list(sentiment_categorical.categories)

    sentences_train = list(senti_mix_train.sentence)
    labels_train = list(sentiment_categorical.codes)

    X_test = list(senti_mix_test.sentence)
    # Encode with the *training* categories; encoding the test column on its own
    # would renumber the classes whenever its set of labels differs from train.
    test_codes = pd.Categorical(senti_mix_test['sentiment'], categories=class_names).codes
    if (test_codes < 0).any():
        raise ValueError('SentiMix test split contains sentiment labels not present in the training split')
    y_test = list(test_codes)

    X_train, X_val, y_train, y_val = train_test_split(sentences_train, labels_train, test_size=0.1)
    encodings = []
    for data in [X_train,X_val,X_test]:
        encodings.append(tokenizer(data,padding='max_length', truncation=True, add_special_tokens = True, return_attention_mask = True, return_tensors = "pt", max_length=max_length))

    return {'train': CustomDataset(encodings[0],y_train),
            'dev': CustomDataset(encodings[1],y_val),
            'test': CustomDataset(encodings[2],y_test),
            'classes':class_names}

def getNLI(path):
    """Load the Roman Urdu NLI TSVs and return lazily-tokenized train/dev/test datasets.

    Label ids are derived from the categories found in the training file and
    reused for dev and test, so an id denotes the same class in every split.
    Premise/hypothesis pairs are stripped of surrounding whitespace and
    tokenized on access by ``CustomDataset``.

    Args:
        path: dataset root; the TSV locations are appended to it.

    Returns:
        Dict with ``'train'``, ``'dev'``, ``'test'`` ``CustomDataset`` objects
        and ``'classes'``, the list of label names indexed by label id.

    Raises:
        ValueError: if a split contains a label (or a missing value) that does
            not occur in the training file.
    """
    def tokenizer_fn(pair):
        # Encodes one (premise, hypothesis) pair; shared by all splits.
        return tokenizer(*pair,padding='max_length', truncation=True, add_special_tokens = True, return_attention_mask = True,return_tensors = "pt", max_length=max_length)

    data_dict = {}
    class_names = None
    for i in ['train','dev','test']:
        dataframe = pd.read_csv(path+'NLI/Roman Urdu/NLI.ru.{}.tsv'.format(i),sep='\t')
        sentences = dataframe[['premise','hypo']].to_numpy()
        if class_names is None:
            # 'train' is processed first, so its categories define the label ids.
            class_names = list(dataframe['Label'].astype('category').cat.categories)
        codes = pd.Categorical(dataframe['Label'], categories=class_names).codes
        if (codes < 0).any():
            raise ValueError('NLI {} split contains labels not present in the training split'.format(i))
        labels = list(codes)
        data = [tuple(map(str.strip,sentence)) for sentence in sentences]
        data_dict[i] = CustomDataset(data,labels,tokenizer_fn)
        if i == 'train':
            data_dict['classes'] = list(class_names)

    return data_dict

def getNER(path):
    """Read the Roman Urdu NER train/test files and build the token-classification datasets.

    Each file is split into documents on blank lines; every document line is
    split on a tab into its columns (presumably token and tag — confirm against
    the data files).

    Args:
        path: dataset root; the file locations are appended to it.

    Returns:
        Whatever ``common_NER_POS`` returns for the two splits with 7 expected tags.
    """
    token_tags = []
    for split_type in ['train', 'test']:
        # Context manager so the file handle is closed as soon as it has been read.
        with open(path + 'NER/Roman Urdu/NER.ru.{}'.format(split_type)) as split_file:
            raw_docs = split_file.read().strip().split('\n\n')
        processed_docs = [list(zip(*[token_tag.split('\t') for token_tag in doc.split('\n')])) for doc in raw_docs]
        # Transpose [(tokens, tags), ...] into [all_tokens, all_tags].
        token_tags.append(list(zip(*processed_docs)))

    return common_NER_POS(token_tags,7)

def getPOS(path):
    """Read the Roman Urdu POS .conllu files and build the token-classification datasets.

    Each file is split into documents on blank lines. The first two lines of
    every document are skipped (presumably CoNLL-U comment lines — confirm), and
    tab-separated columns 1 and 3 of the remaining lines are kept (presumably
    the word form and its POS tag — confirm against the data files).

    Args:
        path: dataset root; the file locations are appended to it.

    Returns:
        Whatever ``common_NER_POS`` returns for the train/dev/test splits with
        17 expected tags.
    """
    token_tags = []
    for split_type in ['train','dev' ,'test']:
        # Context manager so the file handle is closed as soon as it has been read.
        with open(path + 'POS/Roman Urdu/pos.ru.{}.conllu'.format(split_type)) as split_file:
            raw_docs = split_file.read().strip().split('\n\n')
        processed_docs = [list(zip(*[itemgetter(1,3)(token_tag.split('\t')) for token_tag in doc.split('\n')[2:]])) for doc in raw_docs]
        # Transpose [(tokens, tags), ...] into [all_tokens, all_tags].
        token_tags.append(list(zip(*processed_docs)))

    return common_NER_POS(token_tags,17)

def getQuAD(path):
    """Load the Roman Urdu QuAD file and return train/dev/test question-answering datasets.

    The file is read with `` | `` (whitespace, pipe, whitespace) as separator,
    cleaned by ``preprocess_data`` and split 70/10/20 into train/dev/test with
    fixed random states. Each split is turned into features by
    ``prepare_features``.

    Args:
        path: dataset root; the file location is appended to it.

    Returns:
        Dict with ``'train'``, ``'dev'`` and ``'test'`` ``CustomDataset`` objects.
    """
    # The answer-position labelling relies on padding being added after the context.
    assert tokenizer.padding_side == "right"

    frame = pd.read_csv(path + 'QuAD/Roman Urdu/QuAD.ru.csv', sep=r"\s\|\s", engine='python')
    question_context = frame[["question", "paragraph"]].to_numpy()
    answer_spans = frame[["answer starting idx", "answer"]].to_numpy()
    X, y = preprocess_data(question_context, answer_spans)

    # First hold out 20% for test, then 12.5% of the remainder for dev (0.125 * 0.8 = 0.1).
    X_fit, X_test, y_fit, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_fit, y_fit, test_size=0.125, random_state=1)

    split_inputs = {
        'train': (X_train, y_train),
        'dev': (X_val, y_val),
        'test': (X_test, y_test),
    }
    return {
        split: CustomDataset(*prepare_features(data, answer))
        for split, (data, answer) in split_inputs.items()
    }


In [5]:
# Dispatch to the loader named after the task (getNER, getNLI, getPOS, getQuAD, getSentiMix).
fine_tune_dataset = globals()['get'+TASK](DATA_ROOT)
# compute_squad_metrics reads this module-level name to find the split being evaluated.
eval_dataset = fine_tune_dataset['dev']

# Pick the evaluation metric (where one is loaded) and the model head for the task.
if TASK == 'QuAD':
    metric = load_metric("squad")
    model = BertForQuestionAnswering.from_pretrained(MODEL_PATH)
elif TASK == 'NER' or TASK == 'POS':
    metric = load_metric("seqeval")
    model = BertForTokenClassification.from_pretrained(MODEL_PATH,num_labels=len(CustomDataset.label_list))
else:
    # Sequence classification (NLI, SentiMix): size the head from the classes the
    # loader actually found instead of assuming three.
    model = BertForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=len(fine_tune_dataset['classes']))

def compute_squad_metrics(pred):
    """Turn start/end logits into answer strings and score them with the loaded ``metric``.

    Reads the module-level ``eval_dataset``: its ``labels`` attribute is unpacked
    into ``(examples, features)``, where ``features`` provides ``example_id``,
    ``offset_mapping`` and ``input_ids`` per feature and each example is read as
    ``example[0]`` = ``(answer_start, answer_text)`` and ``example[1]`` = context.
    ``eval_dataset`` must therefore refer to the same split that produced ``pred``.

    Args:
        pred: object whose ``predictions`` attribute unpacks into
            ``(all_start_logits, all_end_logits)``, indexed by feature.

    Returns:
        The result of ``metric.compute`` on one prediction and one reference per example.
    """
    n_best_size = 20
    all_start_logits, all_end_logits = pred.predictions
    examples, features = eval_dataset.labels
    # Build a map example to its corresponding features.
    features_per_example = collections.defaultdict(list)
    for i, example_id in enumerate(features["example_id"]):
        features_per_example[example_id].append(i)

    # The lists we have to fill.
    predictions = []
    references = []

    # Let's loop over all the examples!
    for example_index, example in enumerate(examples):
        # Those are the indices of the features associated to the current example.
        feature_indices = features_per_example[example_index]

        # NOTE(review): tracked below but never read afterwards in this function; despite the
        # name, the update keeps the largest null score seen.
        min_null_score = None  # Only used if squad_v2 is True.
        valid_answers = []

        context = example[1]
        # Looping through all the features associated to the current example.
        for feature_index in feature_indices:
            # We grab the predictions of the model for this feature.
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            # This is what will allow us to map the positions in our logits to spans of text in the original
            # context.
            offset_mapping = features["offset_mapping"][feature_index]

            # Update the null (CLS, CLS) prediction score.
            cls_index = list(features["input_ids"][feature_index]).index(tokenizer.cls_token_id)
            feature_null_score = start_logits[cls_index] + end_logits[cls_index]
            if min_null_score is None or min_null_score < feature_null_score:
                min_null_score = feature_null_score

            # Go through all possibilities for the `n_best_size` greater start and end logits.
            start_indexes = np.argsort(start_logits)[-1: -n_best_size - 1: -1].tolist()
            end_indexes = np.argsort(end_logits)[-1: -n_best_size - 1: -1].tolist()
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Don't consider answers that start on a token marked with offset -1 (tokens that
                    # are not part of the context), nor answers with a negative length.
                    # NOTE(review): prepare_features only writes the -1 markers for examples with a
                    # non-empty answer, and end_index is not checked against the context — confirm
                    # this is sufficient.
                    if offset_mapping[start_index][0] == -1 or end_index < start_index:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    valid_answers.append(
                        {
                            "score": start_logits[start_index] + end_logits[end_index],
                            "text": context[start_char: end_char]
                        }
                    )

        if len(valid_answers) > 0:
            # Highest combined start+end score wins.
            best_answer = sorted(valid_answers, key=lambda x: x["score"], reverse=True)[0]
        else:
            # In the very rare edge case we have not a single non-null prediction, we create a fake prediction to avoid
            # failure.
            best_answer = {"text": "", "score": 0.0}

        predictions.append({'id':str(example_index),'prediction_text': best_answer["text"]})
        references.append({'id':str(example_index),'answers':{'answer_start':[example[0][0]],'text':[example[0][1]]}})

    return metric.compute(predictions=predictions, references=references)

def compute_metrics(pred):
    """Compute evaluation metrics for the task selected by the module-level ``TASK``.

    * ``'QuAD'``: delegates to ``compute_squad_metrics``.
    * ``'NER'`` / ``'POS'``: takes the argmax over the last axis of the logits,
      drops every position whose label is -100, maps ids to tag names via
      ``CustomDataset.label_list`` and scores with the loaded ``metric``. With
      ``hyperparams['entity_metrics']`` set, nested per-entity results are
      flattened to ``"<entity>_<measure>"`` keys; otherwise only the overall
      precision/recall/f1/accuracy are returned.
    * anything else (sequence classification): weighted precision/recall/f1 and
      accuracy over the argmax predictions.

    Args:
        pred: evaluation output; unpacked as ``(predictions, labels)`` for
            NER/POS and read through ``predictions`` / ``label_ids`` otherwise.

    Returns:
        Dict mapping metric name to value.
    """
    if TASK == 'QuAD':
        return compute_squad_metrics(pred)
    elif TASK == 'NER' or TASK == 'POS':
        predictions, labels = pred
        predictions = np.argmax(predictions, axis=2)
        label_list = CustomDataset.label_list

        # Remove ignored index (special tokens) and convert ids to tag names.
        # Built sequence by sequence so that a sequence without any labelled
        # token yields empty lists instead of breaking the whole evaluation.
        true_predictions = []
        true_labels = []
        for prediction, label in zip(predictions, labels):
            kept_pairs = [(p, l) for p, l in zip(prediction, label) if l != -100]
            true_predictions.append([label_list[p] for p, _ in kept_pairs])
            true_labels.append([label_list[l] for _, l in kept_pairs])

        results = metric.compute(predictions=true_predictions, references=true_labels)
        if hyperparams['entity_metrics']:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }
    else:
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

# Training configuration; epochs, batch size, evaluation interval and the model-selection
# metric come from the per-task `hyperparams`.
training_args = TrainingArguments(
    # Where checkpoints and logs go.
    output_dir=SAVE_PATH,
    overwrite_output_dir=True,
    logging_dir='./logs',
    logging_steps=1,
    save_total_limit = 10,
    # Optimisation schedule.
    num_train_epochs=hyperparams['epochs'],
    per_device_train_batch_size=hyperparams['batch_size'],
    per_device_eval_batch_size=hyperparams['batch_size'],
    warmup_steps=60,
    weight_decay=0.01,
    # Evaluate every `eval_steps` steps and reload the best checkpoint when training ends.
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps = hyperparams['eval_steps'],
    load_best_model_at_end=True,
    metric_for_best_model=hyperparams['best_model_metric'],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=fine_tune_dataset['train'],
    eval_dataset=fine_tune_dataset['dev'],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    # Token-level tasks use the token-classification collator; everything else uses plain padding.
    data_collator=(DataCollatorForTokenClassification if TASK in ('NER', 'POS') else DataCollatorWithPadding)(tokenizer),
)


Some weights of the model checkpoint at multilingual_vocab_extension were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were n

In [None]:
# Run fine-tuning with the arguments configured above; metrics are computed on the dev split
# every `eval_steps` steps.
trainer.train()

  if sys.path[0] == '':


Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall,Runtime,Samples Per Second
50,0.8624,0.873601,0.588235,0.590867,0.5997,0.588235,7.7595,219.086
100,0.8514,0.840098,0.612353,0.600394,0.608825,0.612353,7.7471,219.438
150,0.9151,0.814445,0.632353,0.626287,0.634701,0.632353,7.7522,219.292
200,0.643,0.813421,0.627647,0.625098,0.629326,0.627647,7.8418,216.786
250,0.8528,0.800295,0.628824,0.629303,0.663803,0.628824,7.8562,216.39
300,0.7658,0.809333,0.636471,0.632213,0.634134,0.636471,7.8524,216.495
350,0.7691,0.782437,0.640588,0.641391,0.643757,0.640588,7.8075,217.741
400,0.624,0.806381,0.625294,0.613414,0.622922,0.625294,7.7219,220.154
450,0.7134,0.794681,0.64,0.632249,0.63395,0.64,7.7433,219.546
500,0.781,0.794565,0.645882,0.641005,0.642133,0.645882,7.7466,219.451


  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':


  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':


In [None]:
# compute_squad_metrics reads the module-level `eval_dataset`, so point it at the split
# that is about to be evaluated.
eval_dataset = fine_tune_dataset['test']
# Ask for flattened per-entity metrics in the report. Only the NER/POS branch of
# compute_metrics reads this key. The change is made in place and stays in effect for
# later cells.
hyperparams['entity_metrics'] = True
test_results=trainer.evaluate(fine_tune_dataset['test'])
test_results

In [None]:
# Point the module-level `eval_dataset` (read by compute_squad_metrics) back at the dev
# split before evaluating on it.
eval_dataset = fine_tune_dataset['dev']
validation_results = trainer.evaluate(fine_tune_dataset['dev'])
validation_results

In [None]:
# Save the model held by the trainer next to the two evaluation reports.
FINAL='/best-checkpoint'
trainer.save_model(SAVE_PATH+FINAL)
for report_file, report in (('/test_results.txt', test_results),
                            ('/validation_results.txt', validation_results)):
    with open(SAVE_PATH+FINAL+report_file,'w') as f:
        f.write(pprint.pformat(report))

In [None]:
# Delete the intermediate checkpoint-* directories under SAVE_PATH. The best-checkpoint
# directory written by the save cell does not match this pattern and is kept.
!rm -r $SAVE_PATH/checkpoint-*

In [None]:
# Open TensorBoard on the `logs` directory, which is where TrainingArguments(logging_dir='./logs')
# writes the training logs.
%load_ext tensorboard
%tensorboard --logdir logs
