In [6]:
import pandas as pd
import numpy as np
import torch
from datasets import DatasetDict, Dataset, load_metric
import random
from transformers import BartTokenizerFast, BartForSequenceClassification, Trainer, TrainingArguments, \
EvalPrediction, pipeline, set_seed, DataCollatorWithPadding, EarlyStoppingCallback
import os
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import classification_report
import argparse
import sys
import shutil
import evaluate

seed = 42
set_seed(seed)

MODEL = 'facebook/bart-large-mnli'
tokenizer = BartTokenizerFast.from_pretrained(MODEL)

def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used // 1024 ** 2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

    
def create_input_sequence(sample):
    text = sample["text"]
    label = sample["class"][0]
    contradiction_label = random.choice([x for x in label_to_int if x != label])
    encoded_sequence = tokenizer(text * 2, [template.format(label), template.format(contradiction_label)], truncation = True, padding = 'max_length')
    encoded_sequence["labels"] = [1, 0]
    encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
    return encoded_sequence


def compute_metrics(p: EvalPrediction):
    metric_acc = load_metric("accuracy")
    metric_f1 = load_metric("f1")
    metric_precision = load_metric("precision")
    metric_recall = load_metric("recall")
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis = 1)
    result = {}
    result["accuracy"] = metric_acc.compute(predictions = preds, references = p.label_ids)["accuracy"]
    result["f1"] = metric_f1.compute(predictions = preds, references = p.label_ids, pos_label=1, average = "weighted")["f1"] #play with "weighted" "micro" "macro"
    result["precision"] = metric_precision.compute(predictions = preds, references = p.label_ids, pos_label=1, average="weighted", sample_weight=None, zero_division='warn')["precision"]
    result["recall"] = metric_recall.compute(predictions = preds, references = p.label_ids, pos_label=1, average="weighted", sample_weight=None, zero_division='warn')["recall"]
    c = classification_report(predictions = preds, references = p.label_ids, labels=None)
    print(c)
    return result

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     print(logits)
#     predictions = np.argmax(logits, axis=-1)
#     scmetrics.add_batch(predictions=predictions, references=labels)
#     return scmetrics.compute()

In [2]:
# parser = argparse.ArgumentParser(description='Sentence classification task')
# parser.add_argument('--model', help='Path to pt model and tokenizer')
# config = parser.parse_args(sys.argv[1:])
# task = 'sentiment'
# MODEL = f"cardiffnlp/twitter-roberta-base-{task}" #minerva: download model from hugging face and put in folder, update to path

df = pd.read_csv('../data/MD-NLI.csv', header=0)
df = df[["language", "MD_label"]]
dataset = Dataset.from_pandas(df).rename_columns({'language': 'text', "MD_label": 'class'})
# label_dt = dataset.train_test_split(0.5, seed = seed)
train_test = dataset.train_test_split(0.35, seed = seed)
dev_test = train_test['test'].train_test_split(0.5, seed = seed)
label_dt = DatasetDict({
    'train': train_test['train'],
    'dev': dev_test['train'],
    'test': dev_test['test']})
# print(label_dt)

label_to_int = ["neutral", "negative", "positive"]
template = "The sentiment of this sentence is {}"
label_dt = label_dt.map(create_input_sequence, batched = True, batch_size = 1, remove_columns = ["class", "text"])

# data loader = allows us to use a chunk of the data at a time while training (or else computer crashes)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer) # read the sentences and pad them to create equal length vectors

Map:   0%|          | 0/30 [00:00<?, ? examples/s]

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/9 [00:00<?, ? examples/s]

In [3]:
model = BartForSequenceClassification.from_pretrained(MODEL, num_labels = len(label_to_int), from_tf=False)
if torch.cuda.is_available():
    model.to('cuda') # put the model on the gpu once, and then add a batch everytime when doing a training or evaluation loop
    print_gpu_utilization()

In [4]:
# Hyperparameters (for best configuration selection)
# 1st evaluate the hyper parameters once at a time, and select each hyper parameter that gives you the best score on results aka F1 vs recall
params = {
    'batch_size': [4], #[4, 8],
    'epochs': [1, 3, 5], # [2, 3, 4, 5],
    'learning_rate': [1e-5], # [5e-6, 1e-5, 2e-5, 3e-5],
    'weight_decay': [0.01], # [0, 0.01, 0.1], # how much the weight change is shrinking
    'warmup_ratio': [0.01], #[0, 0.01, 0.1] # ratio of examples it takes to get ready for the learning rate
} 

metrics_file = f'classification_metrics_MD.csv'
if os.path.isfile(metrics_file):
    f = open(metrics_file, 'a')
else:
    f = open(metrics_file, 'w')
    f.write('batch_size,epochs,learning_rate,weight_decay,warmup_ratio,loss,accuracy,f1,precision,recall\n')

best_model = []
best_f1 = 0.0
tmp_trainer, tmp_comb = None, None
for comb in list(ParameterGrid(params)):
    print(f"Parameters: {comb}")
    training_args = TrainingArguments(
        output_dir=f'runs/MD',
        evaluation_strategy='epoch', # every epoch the model is evaluated and checkpoint is made saving the weights
        eval_steps=1, # check, probably each step is by epoch
        logging_strategy='epoch',
        weight_decay=comb['weight_decay'],
        warmup_ratio=comb['warmup_ratio'],
        num_train_epochs=comb['epochs'],
        learning_rate=comb['learning_rate'],
        per_device_train_batch_size=comb['batch_size'],
        per_device_eval_batch_size=comb['batch_size'],
        save_strategy='epoch',
        load_best_model_at_end=True,
        metric_for_best_model='eval_f1',
        adam_epsilon=1e-6,
        seed=seed,
        data_seed=seed)
    # scmetrics = evaluate.load("../scmetrics")

    trainer = Trainer(model=model,
                      args=training_args,
                      callbacks=[EarlyStoppingCallback(early_stopping_patience=3)], # if loss isnt decreasing for 2 epochs then it stops training
                      train_dataset=label_dt['train'],
                      eval_dataset=label_dt['dev'],
                      compute_metrics=compute_metrics,
                      data_collator=data_collator)
    results = trainer.train()
    results_eval = trainer.evaluate()

    v = [comb['batch_size'], comb['epochs'], comb['learning_rate'], comb['weight_decay'], comb['warmup_ratio'],
    results_eval['eval_loss'], results_eval['eval_accuracy'], results_eval['eval_f1'], results_eval['eval_precision'], results_eval['eval_recall']]
    f.write(','.join([str(el) for el in v]) + '\n')

    if results_eval['eval_f1'] > best_f1:
        best_f1 = results_eval['eval_f1']
        tmp_trainer = trainer
        tmp_comb = comb
    print('-' * 100)
    print('\n\n')

# Error analysis step
labels_to_sen = {0: 'neutral', 1: 'negative', 2: 'positive'}
if tmp_trainer is not None:
    best_trainer = tmp_trainer
    best_comb = tmp_comb
    print(f'Best parameters configuration: {best_comb}')
    # dev_pred = best_trainer.predict(tkn_dt['test'])
    # pred = np.argmax(dev_pred.predictions, axis=-1)
    # pred_score = np.max(torch.nn.functional.softmax(torch.tensor(dev_pred.predictions), dim=-1).numpy(), axis=-1)
    # i = 0
    # errors = {'FP': [], 'FN': []}
    # for pred_lab, true_lab in zip(pred, dev_pred.label_ids):
    #     if pred_lab != true_lab:
    #         if pred_lab > 1:
    #             errors['FP'].append((
    #                 tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(tkn_dt['test']['input_ids'][i])),
    #                 pred_score[i], labels_to_sen[pred_lab], labels_to_sen[true_lab]))
    #         else:
    #             errors['FN'].append((tokenizer.convert_tokens_to_string(
    #                 tokenizer.convert_ids_to_tokens(tkn_dt['test']['input_ids'][i])), pred_score[i],
    #                                  labels_to_sen[pred_lab], labels_to_sen[true_lab]))
    #     i += 1
    # with open(f'error_analysis_MD.tsv',
    #           'w') as f:
    #     f.write('sentence\tpredicted_label\ttrue_label\tprobability\n')
    #     for k, vect in errors.items():
    #         if k == 'FP':
    #             for sen in vect:
    #                 f.write(sen[0] + '\t' + f'PRED_{sen[2].upper()}' + '\t' + f'TRUE_{sen[3].upper()}' + '\t' + str(
    #                     sen[1]) + '\n')
    #             f.write('\n')
    #         else:
    #             for sen in vect:
    #                 f.write(sen[0] + '\t' + f'PRED_{sen[2].upper()}' + '\t' + f'TRUE_{sen[3].upper()}' + '\t' + str(
    #                     sen[1]) + '\n')
    test_pred = best_trainer.predict(label_dt['test'])
    print(test_pred.metrics)

    model_dir = f'runs/MD'
    # for d in os.listdir(model_dir):
        # This removes the checkpoints (comment it if you want to keep them)
        # if 'checkpoint' in d:
        #     shutil.rmtree(os.path.join(model_dir, d))
    best_trainer.save_model(
        output_dir=f'best_model/MD')
else:
    print("Precision is 0.0 change something in your model's configuration and retry.")
f.close()

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Parameters: {'batch_size': 4, 'epochs': 1, 'learning_rate': 1e-05, 'warmup_ratio': 0.01, 'weight_decay': 0.01}
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0536,0.634657,0.625,0.563636,0.785714,0.625


  metric_acc = load_metric("accuracy")
Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
----------------------------------------------------------------------------------------------------



Parameters: {'batch_size': 4, 'epochs': 3, 'learning_rate': 1e-05, 'warmup_ratio': 0.01, 'weight_decay': 0.01}
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.1468,0.903124,0.6875,0.676113,0.718182,0.6875
2,0.4514,0.732622,0.6875,0.686275,0.690476,0.6875
3,0.3054,0.690403,0.6875,0.686275,0.690476,0.6875


Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
----------------------------------------------------------------------------------------------------



Parameters: {'batch_size': 4, 'epochs': 5, 'learning_rate': 1e-05, 'warmup_ratio': 0.01, 'weight_decay': 0.01}


Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2109,1.556046,0.75,0.75,0.75,0.75
2,0.2065,2.045304,0.6875,0.686275,0.690476,0.6875
3,0.2193,2.062227,0.6875,0.686275,0.690476,0.6875
4,0.1697,1.564705,0.8125,0.811765,0.81746,0.8125
5,0.0005,1.555955,0.8125,0.811765,0.81746,0.8125


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
----------------------------------------------------------------------------------------------------



Best parameters configuration: {'batch_size': 4, 'epochs': 5, 'learning_rate': 1e-05, 'warmup_ratio': 0.01, 'weight_decay': 0.01}
{'test_loss': 2.0217905044555664, 'test_accuracy': 0.6666666666666666, 'test_f1': 0.6666666666666666, 'test_precision': 0.6666666666666666, 'test_recall': 0.6666666666666666, 'test_runtime': 69.6513, 'test_samples_per_second': 0.258, 'test_steps_per_second': 0.072}


In [None]:
Best parameters configuration 7/31/23: {'batch_size': 4, 'epochs': 5, 'learning_rate': 5e-06, 'warmup_ratio': 0, 'weight_decay': 0}
{'test_loss': 2.9686105251312256, 'test_accuracy': 0.7291666666666666, 'test_f1': 0.7261956998683634, 'test_precision': 0.7395644283121596, 
 'test_recall': 0.7291666666666666, 'test_runtime': 178.2779, 'test_samples_per_second': 0.269, 'test_steps_per_second': 0.067}

Best parameters configuration 8/2/23: {'batch_size': 4, 'epochs': 2, 'learning_rate': 5e-06, 'warmup_ratio': 0, 'weight_decay': 0.1}
{'test_loss': 2.897099256515503, 'test_accuracy': 0.7291666666666666, 'test_f1': 0.7290490664350848, 'test_precision': 0.7295652173913044, 
 'test_recall': 0.7291666666666666, 'test_runtime': 176.8441, 'test_samples_per_second': 0.271, 'test_steps_per_second': 0.068}

Best parameters configuration 8/7/23: {'batch_size': 4, 'epochs': 5, 'learning_rate': 1e-05, 'warmup_ratio': 0.01, 'weight_decay': 0.01}
{'test_loss': 2.0217905044555664, 'test_accuracy': 0.6666666666666666, 'test_f1': 0.6666666666666666, 'test_precision': 0.6666666666666666, 
 'test_recall': 0.6666666666666666, 'test_runtime': 69.6513, 'test_samples_per_second': 0.258, 'test_steps_per_second': 0.072}

In [5]:
# labels_to_sen = {0: 'neutral', 1: 'negative', 2: 'positive'}
if tmp_trainer is not None:
    best_trainer = tmp_trainer
    best_comb = tmp_comb
    print(f'Best parameters configuration: {best_comb}')
    # dev_pred = best_trainer.predict(tkn_dt['test'])
    # pred = np.argmax(dev_pred.predictions, axis=-1)
    # pred_score = np.max(torch.nn.functional.softmax(torch.tensor(dev_pred.predictions), dim=-1).numpy(), axis=-1)
    # i = 0
    # errors = {'FP': [], 'FN': []}
    # for pred_lab, true_lab in zip(pred, dev_pred.label_ids):
    #     if pred_lab != true_lab:
    #         if pred_lab > 1:
    #             errors['FP'].append((
    #                 tokenizer.convert_tokens_to_string(tokenizer.convert_ids_to_tokens(tkn_dt['test']['input_ids'][i])),
    #                 pred_score[i], labels_to_sen[pred_lab], labels_to_sen[true_lab]))
    #         else:
    #             errors['FN'].append((tokenizer.convert_tokens_to_string(
    #                 tokenizer.convert_ids_to_tokens(tkn_dt['test']['input_ids'][i])), pred_score[i],
    #                                  labels_to_sen[pred_lab], labels_to_sen[true_lab]))
    #     i += 1
    # with open(f'error_analysis_MD.tsv',
    #           'w') as f:
    #     f.write('sentence\tpredicted_label\ttrue_label\tprobability\n')
    #     for k, vect in errors.items():
    #         if k == 'FP':
    #             for sen in vect:
    #                 f.write(sen[0] + '\t' + f'PRED_{sen[2].upper()}' + '\t' + f'TRUE_{sen[3].upper()}' + '\t' + str(
    #                     sen[1]) + '\n')
    #             f.write('\n')
    #         else:
    #             for sen in vect:
    #                 f.write(sen[0] + '\t' + f'PRED_{sen[2].upper()}' + '\t' + f'TRUE_{sen[3].upper()}' + '\t' + str(
    #                     sen[1]) + '\n')
    test_pred = best_trainer.predict(label_dt['test'])
    print(test_pred.metrics)

    model_dir = f'runs/MD'
    for d in os.listdir(model_dir):
        # This removes the checkpoints (comment it if you want to keep them)
        if 'checkpoint' in d:
            shutil.rmtree(os.path.join(model_dir, d))
    best_trainer.save_model(
        output_dir=f'best_model/MD')
else:
    print("Precision is 0.0 change something in your model's configuration and retry.")
f.close()

Best parameters configuration: {'batch_size': 4, 'epochs': 2, 'learning_rate': 5e-06, 'warmup_ratio': 0, 'weight_decay': 0.1}
{'test_loss': 2.897099256515503, 'test_accuracy': 0.7291666666666666, 'test_f1': 0.7290490664350848, 'test_precision': 0.7295652173913044, 'test_recall': 0.7291666666666666, 'test_runtime': 176.8441, 'test_samples_per_second': 0.271, 'test_steps_per_second': 0.068}
