In [37]:
from datasets import DatasetDict, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, \
    DataCollatorWithPadding, EarlyStoppingCallback, set_seed
from pathlib import Path
import numpy as np
import evaluate
import torch
from pynvml import *
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import classification_report
import random
import argparse
import sys
import shutil
import pandas as pd

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()


def tokenize_function1(examples):
    return tokenizer(examples['premise'], truncation=True, max_length=128)

def tokenize_function2(examples):
    return tokenizer(examples['hypothesis'], truncation=True, max_length=128)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    print(logits)
    predictions = np.argmax(logits, axis=-1)
    scmetrics.add_batch(predictions=predictions, references=labels)
    return scmetrics.compute()


def create_labels(str_label):
    labels = []
    for s in str_label:
        if s == 'entailment':
            labels += [0]
        else:
            labels += [1]
    return labels

In [38]:
# parser = argparse.ArgumentParser(description='Sentence classification task')
# parser.add_argument('--model', help='Path to pt model and tokenizer')
# config = parser.parse_args(sys.argv[1:])
# task = 'sentiment'
# MODEL = f"cardiffnlp/twitter-roberta-base-{task}" #minerva: download model from hugging face and put in folder, update to path
MODEL = 'facebook/bart-large-mnli'

seed = 42
# set seed

# Create task Dataset from annotated samples
sentences = pd.read_csv('../data/MD-NLI.csv', header=0)

dataset = Dataset.from_pandas(sentences).rename_columns({'language': 'premise', "MD_label": 'sentiment', "label": 'str_label'})
dataset = dataset.add_column('label', create_labels(dataset['str_label']))
train_test = dataset.train_test_split(0.5, seed = seed)
dev_test = train_test['test'].train_test_split(0.5, seed = seed)
label_dt = DatasetDict({
    'train': train_test['train'],
    'dev': dev_test['train'],
    'test': dev_test['test']})

# print(label_dt)

tokenizer = AutoTokenizer.from_pretrained(MODEL)

tkn_dt = label_dt.map(tokenize_function1, batched=True, num_proc=4) # batched tokenizing activated
tkn_dt = label_dt.map(tokenize_function2, batched=True, num_proc=4)
# tkn_dt = tkn_dt.remove_columns(['']) # at some point we might need to delete sentiment column or else get an error

# data loader = allows us to use a chunk of the data at a time while training (or else computer crashes)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # read the sentences and pad them to create equal length vectors

Map (num_proc=4):   0%|          | 0/23 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/12 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/12 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/23 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/12 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/12 [00:00<?, ? examples/s]

In [39]:
set_seed(seed)
model = AutoModelForSequenceClassification.from_pretrained(MODEL,
                                                           num_labels=3,
                                                           from_tf=False)
if torch.cuda.is_available():
    model.to('cuda') # put the model on the gpu once, and then add a batch everytime when doing a training or evaluation loop
    print_gpu_utilization()

In [40]:
# Hyperparameters (for best configuration selection)
# 1st evaluate the hyper parameters once at a time, and select each hyper parameter that gives you the best score on results aka F1 vs recall
params = {
    'batch_size': [4],
    'epochs': [1], # 2, 5],
    'learning_rate': [5e-6], # 1e-5, 2e-5, 5e-5, 1e-4],
    'weight_decay': [0], # 0.01, 0.1], # how much the weight change is shrinking
    'warmup_ratio': [0], # 0.01, 0.1] # ratio of examples it takes to get ready for the learning rate
} 

metrics_file = f'classification_metrics_MD.csv'
if os.path.isfile(metrics_file):
    f = open(metrics_file, 'a')
else:
    f = open(metrics_file, 'w')
    f.write('batch_size,epochs,learning_rate,weight_decay,warmup_ratio,loss,f1,precision,recall\n')

best_model = []
best_f1 = 0.0
tmp_trainer, tmp_comb = None, None
for comb in list(ParameterGrid(params)):
    print(f"Parameters: {comb}")
    training_args = TrainingArguments(
        output_dir=f'runs/MD',
        evaluation_strategy='epoch', # every epoch the model is evaluated and checkpoint is made saving the weights
        eval_steps=1, # check, probably each step is by epoch
        logging_strategy='epoch',
        weight_decay=comb['weight_decay'],
        warmup_ratio=comb['warmup_ratio'],
        num_train_epochs=comb['epochs'],
        learning_rate=comb['learning_rate'],
        per_device_train_batch_size=comb['batch_size'],
        per_device_eval_batch_size=comb['batch_size'],
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='eval_f1',
        seed=seed,
        data_seed=seed)
    scmetrics = evaluate.load("../scmetrics")

    trainer = Trainer(model=model,
                      args=training_args,
                      callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], # if loss isnt decreasing for 2 epochs then it stops training
                      train_dataset=tkn_dt['train'],
                      eval_dataset=tkn_dt['dev'],
                      compute_metrics=compute_metrics,
                      data_collator=data_collator)
    results = trainer.train()
    results_eval = trainer.evaluate()

    v = [comb['batch_size'], comb['epochs'], comb['learning_rate'], comb['weight_decay'], comb['warmup_ratio'],
    results.metrics['train_loss'], results_eval['eval_f1'], results_eval['eval_precision'], results_eval['eval_recall']]
    f.write(','.join([str(el) for el in v]) + '\n')

    if results_eval['eval_f1'] > best_f1:
        best_f1 = results_eval['eval_f1']
        tmp_trainer = trainer
        tmp_comb = comb
    print('-' * 100)
    print('\n\n')

# Error analysis step
labels_to_sen = {0: 'neutral', 1: 'negative', 2: 'positive'}
if tmp_trainer is not None:
    best_trainer = tmp_trainer
    best_comb = tmp_comb
    print(f'Best parameters configuration: {best_comb}')
    dev_pred = best_trainer.predict(tkn_dt['test'])
    pred = np.argmax(dev_pred.predictions, axis=-1)
    pred_score = np.max(torch.nn.functional.softmax(torch.tensor(dev_pred.predictions), dim=-1).numpy(), axis=-1)
    i = 0
    errors = {'FP': [], 'FN': []}
    for pred_lab, true_lab in zip(pred, dev_pred.label_ids):
        if pred_lab != true_lab:
            if pred_lab > 1:
                errors['FP'].append((
                    tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(tkn_dt['test']['input_ids'][i])),
                    pred_score[i], labels_to_sen[pred_lab], labels_to_sen[true_lab]))
            else:
                errors['FN'].append((tokenizer.convert_tokens_to_string(
                    tokenizer.convert_ids_to_tokens(tkn_dt['test']['input_ids'][i])), pred_score[i],
                                     labels_to_sen[pred_lab], labels_to_sen[true_lab]))
        i += 1
    with open(f'error_analysis_MD.tsv',
              'w') as f:
        f.write('sentence\tpredicted_label\ttrue_label\tprobability\n')
        for k, vect in errors.items():
            if k == 'FP':
                for sen in vect:
                    f.write(sen[0] + '\t' + f'PRED_{sen[2].upper()}' + '\t' + f'TRUE_{sen[3].upper()}' + '\t' + str(
                        sen[1]) + '\n')
                f.write('\n')
            else:
                for sen in vect:
                    f.write(sen[0] + '\t' + f'PRED_{sen[2].upper()}' + '\t' + f'TRUE_{sen[3].upper()}' + '\t' + str(
                        sen[1]) + '\n')
    test_pred = best_trainer.predict(tkn_dt['test'])
    print(test_pred.metrics)

    model_dir = f'runs/MD'
    for d in os.listdir(model_dir):
        # This removes the checkpoints (comment it if you want to keep them)
        if 'checkpoint' in d:
            shutil.rmtree(os.path.join(model_dir, d))
    best_trainer.save_model(
        output_dir=f'best_model/MD')
else:
    print("Precision is 0.0 change something in your model's configuration and retry.")
f.close()

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Parameters: {'batch_size': 4, 'epochs': 1, 'learning_rate': 5e-06, 'warmup_ratio': 0, 'weight_decay': 0}


Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



Epoch,Training Loss,Validation Loss


(array([[ 0.91953677,  0.8460272 , -2.0337613 ],
       [ 0.9632044 ,  0.5826386 , -1.932817  ],
       [ 0.9632041 ,  0.5826388 , -1.9328166 ],
       [ 0.9632044 ,  0.5826386 , -1.932817  ],
       [ 0.91953677,  0.8460272 , -2.0337613 ],
       [ 0.80543125,  0.90705234, -1.8053453 ],
       [ 0.80543137,  0.90705246, -1.8053449 ],
       [ 0.9632044 ,  0.5826386 , -1.932817  ],
       [ 0.80543137,  0.90705246, -1.8053449 ],
       [ 0.9195366 ,  0.8460272 , -2.0337617 ],
       [ 0.80543137,  0.90705246, -1.8053449 ],
       [ 0.9632044 ,  0.5826386 , -1.932817  ]], dtype=float32), array([[[ 7.38023128e-03,  3.47783789e-02,  2.32912097e-02, ...,
          6.59265649e-03,  1.00501385e-02,  6.89580571e-04],
        [-1.18490063e-01, -2.23131254e-01, -2.65620619e-01, ...,
         -1.38534203e-01,  1.13098696e-01, -1.86566785e-01],
        [ 3.92207317e-02, -1.47638544e-01, -8.02371576e-02, ...,
          2.10778806e-02,  8.07959437e-02, -6.27166405e-02],
        ...,
        [ 7.621

  result = getattr(asarray(obj), method)(*args, **kwds)


ValueError: could not broadcast input array from shape (12,3) into shape (12,)