In [1]:
MODEL_PATH = 'distilbert-base-uncased'

# !transformers-cli convert --model_type bert \
#   --tf_checkpoint $MODEL_PATH/model.ckpt-100000 \
#   --config $MODEL_PATH/config.json \
#   --pytorch_dump_output $MODEL_PATH/pytorch_model.bin

In [2]:
import torch
import pandas as pd
import numpy as np
from datasets import load_metric
from sklearn.model_selection import train_test_split
from transformers import Trainer, TrainingArguments,AutoModelForQuestionAnswering,AutoModelForSequenceClassification,AutoModelForTokenClassification,AutoTokenizer
from transformers.trainer_utils import IntervalStrategy
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
DATA_ROOT = '../glue-urdu/'
TASK = 'NER'
task_params = {
    'NER':{'entity_metrics':True,'batch_size':64, 'epochs':3},
    'NLI':[],
    'POS':[],
    'QuAD':{'batch_size':32, 'epochs':3},
    'SentiMix':{'batch_size':64, 'epochs':3}
}
type_datasets=['train','validation','test']
assert TASK in task_params.keys()
hyperparams = task_params[TASK]
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
if TASK == 'QuAD':
    model = AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH)
elif TASK == 'NER':
    model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH,num_labels=7)
else:
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN t

In [4]:
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if TASK != 'QuAD':
            item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

def answer_to_idx(answers, contexts):
    result = []
    for answer, context in zip(answers, contexts):
        start_idx = int(answer[0])
        end_idx = start_idx + len(answer[1])

        for i in range(1, 3):
            if context[start_idx - i:end_idx - i+1] == answer[1]:
                start_idx -= i
                end_idx -= i
                break
        result.append([start_idx, end_idx, (start_idx,context[start_idx:end_idx+1])])

    return result

def char_to_token_position(encodings, answers,dataset_type):
    start_positions = []
    end_positions = []
    answer_texts = []
    for i,answer in enumerate(answers):
        start_positions.append(dataset_type)
        end_positions.append(i)
        answer_texts.append((answer[2],encodings['input_ids'][i]))

    return dict(encodings, **{'start_positions': start_positions, 'end_positions': end_positions}),answer_texts

def encode_tags(tags, encodings, unique_tags):
    labels = [[unique_tags.index(tag) for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        # create an empty array of -100
        doc_enc_labels = np.ones(len(doc_offset), dtype=int) * -100
        arr_offset = np.array(doc_offset)

        # set labels whose first offset position is 0 and the second is not 0
        if sum(arr_offset[-2]) != 0:
            sub_len = sum((arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0))
        else:
            sub_len = len(doc_labels)
        doc_enc_labels[(arr_offset[:, 0] == 0) & (arr_offset[:, 1] != 0)] = doc_labels[:sub_len]
        encoded_labels.append(doc_enc_labels.tolist())

    return encoded_labels

def getSentiMix(path):
    senti_mix_train = pd.read_csv(path+'SentiMix/Roman Urdu/SentiMix.train.ru.csv')
    senti_mix_test = pd.read_csv(path+'SentiMix/Roman Urdu/SentiMix.test.ru.csv')
    sentiment_categorical = senti_mix_train['sentiment'].astype('category').cat
    class_names = list(sentiment_categorical.categories)

    sentences_train = list(senti_mix_train.sentence)
    labels_train = list(sentiment_categorical.codes)

    X_test = list(senti_mix_test.sentence)
    y_test = list(senti_mix_test['sentiment'].astype('category').cat.codes)

    X_train, X_val, y_train, y_val = train_test_split(sentences_train, labels_train, test_size=0.1)
    encodings = []
    for data in [X_train,X_val,X_test]:
        encodings.append(tokenizer(data,padding='max_length', truncation=True, add_special_tokens = True, return_attention_mask = True, return_tensors = "pt", max_length=128))

    return {'train': CustomDataset(encodings[0],y_train),
            'validation': CustomDataset(encodings[1],y_val),
            'test': CustomDataset(encodings[2],y_test),
            'classes':class_names}

def getNLI(path):
    data_dict = {}
    for i in ['train','dev','test']:
        dataframe = pd.read_csv(path+'NLI/Roman Urdu/NLI.ru.{}.tsv'.format(i),sep='\t')
        sentences = dataframe[['premise','hypo']].to_numpy()
        categorical = dataframe['Label'].astype('category').cat
        labels = list(categorical.codes)
        data = list(map(str.strip,sentences[:,0])),list(map(str.strip,sentences[:,1]))
        encodings = tokenizer(*data,padding='max_length', truncation=True, add_special_tokens = True, return_attention_mask = True, return_tensors = "pt", max_length=128)
        data_dict[i] = CustomDataset(encodings,labels)
        if i == 'train':
            data_dict['classes'] = list(categorical.categories)

    return data_dict

def getNER(path):
    token_tags = []
    for split_type in ['train', 'test']:
        raw_docs = open(path + 'NER/Roman Urdu/NER.ru.{}'.format(split_type)).read().strip().split('\n\n')
        processed_docs = [list(zip(*[token_tag.split('\t') for token_tag in doc.split('\n')])) for doc in raw_docs]
        token_tags.append(list(zip(*processed_docs)))

    unique_tags = list(set(tag for doc in token_tags[0][1] + token_tags[1][1] for tag in doc))
    assert len(unique_tags) == 7
    X_train, X_val, y_train, y_val = train_test_split(*token_tags[0], test_size=0.1)
    X_test, y_test = token_tags[1]

    encodings = []
    labels = []
    for data, label in [(X_train, y_train), (X_val, y_val), (X_test, y_test)]:
        encodings.append(
            tokenizer(list(data), is_split_into_words=True, return_offsets_mapping=True, padding='max_length',
                      truncation=True, add_special_tokens=True, return_attention_mask=True,
                      return_tensors="pt", max_length=128))
        labels.append(encode_tags(label, encodings[-1], unique_tags))
        encodings[-1].pop("offset_mapping")

    CustomDataset.label_list = unique_tags
    return {'train': CustomDataset(encodings[0], labels[0]),
            'validation': CustomDataset(encodings[1], labels[1]),
            'test': CustomDataset(encodings[2], labels[2])}

def getPOS(path):
    return

def getQuAD(path):
    dataframe = pd.read_csv(path + 'QuAD/Roman Urdu/QuAD.ru.csv', sep=r"\s\|\s", engine='python')
    sentences = dataframe[["paragraph", "question"]].to_numpy()
    answers = dataframe[["answer starting idx", "answer"]].to_numpy()

    y = answer_to_idx(answers, sentences[:, 0])

    X_train, X_test, y_train, y_test = train_test_split(sentences, y, test_size=0.2, random_state=1)
    X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.125,random_state=1)  # 0.125 * 0.8 = 0.1

    encodings = []
    for data in [X_train, X_val, X_test]:
        encodings.append(
            tokenizer(list(data[:, 0]), list(data[:, 1]), padding='max_length', truncation='only_first', add_special_tokens=True,
                      return_attention_mask=True,
                      return_tensors="pt", max_length=128))

    return {'train': CustomDataset(*char_to_token_position(encodings[0], y_train,0)),
            'validation': CustomDataset(*char_to_token_position(encodings[1], y_val,1)),
            'test': CustomDataset(*char_to_token_position(encodings[2], y_test,2))}


In [5]:
fine_tune_dataset = locals()['get'+TASK](DATA_ROOT)
if TASK == 'QuAD':
    metric = load_metric("squad")
elif TASK == 'NER':
    metric = load_metric("seqeval")

def compute_metrics(pred):
    if TASK == 'QuAD':
        answer_start_ids = np.argmax(pred.predictions[0],axis=-1)
        answer_end_ids = np.argmax(pred.predictions[1],axis=-1)
        labels = fine_tune_dataset[type_datasets[pred.label_ids[0][0]]].labels
        predictions=[]
        references=[]
        ids = pred.label_ids[1]
        for i,_id in enumerate(ids):
            true_answer,encodings = labels[_id]
            predictions.append({'id':str(_id),'prediction_text':tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(encodings[answer_start_ids[i]:answer_end_ids[i]]))})
            references.append({'id':str(_id),'answers':{'answer_start':[true_answer[0]],'text':[true_answer[1]]}})

        return metric.compute(predictions=predictions, references=references)
    elif TASK == 'NER':
        predictions, labels = pred
        predictions = np.argmax(predictions, axis=2)

        # Remove ignored index (special tokens)
        true_predictions,true_labels = [
            list(zip(*[(CustomDataset.label_list[p],CustomDataset.label_list[l]) for (p, l) in zip(prediction, label) if l != -100]))
            for prediction, label in zip(predictions, labels)
        ]

        results = metric.compute(predictions=true_predictions, references=true_labels)
        if hyperparams['entity_metrics']:
            # Unpack nested dictionaries
            final_results = {}
            for key, value in results.items():
                if isinstance(value, dict):
                    for n, v in value.items():
                        final_results[f"{key}_{n}"] = v
                else:
                    final_results[key] = value
            return final_results
        else:
            return {
                "precision": results["overall_precision"],
                "recall": results["overall_recall"],
                "f1": results["overall_f1"],
                "accuracy": results["overall_accuracy"],
            }
    else:
        labels = pred.label_ids
        preds = pred.predictions.argmax(-1)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
        acc = accuracy_score(labels, preds)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }

training_args = TrainingArguments(
    output_dir='fine_tune_results/{}'.format(TASK),  # output directory
    overwrite_output_dir=True,
    num_train_epochs=hyperparams['epochs'],  # total number of training epochs
    per_device_train_batch_size=hyperparams['batch_size'],  # batch size per device during training
    per_device_eval_batch_size=hyperparams['batch_size'],  # batch size for evaluation
    warmup_steps=60,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir='./logs',  # directory for storing logs
    logging_steps=1,
    evaluation_strategy=IntervalStrategy.STEPS,
    eval_steps = 1,
    load_best_model_at_end=True,
    metric_for_best_model='eval_f1'
)

trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=fine_tune_dataset['train'],  # training dataset
    eval_dataset=fine_tune_dataset['validation'],  # evaluation dataset
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)


In [6]:
trainer.train()

  import sys


Step,Training Loss,Validation Loss


AttributeError: 'NoneType' object has no attribute 'f_code'

In [None]:
trainer.evaluate(fine_tune_dataset['test'])

In [None]:
trainer.save_model('fine_tune_results/{}/best-checkpoint_f1'.format(TASK))

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs
