In [8]:
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, load_metric
import random
from transformers import BartTokenizerFast, BartForSequenceClassification, Trainer, TrainingArguments, EvalPrediction, pipeline, set_seed, DataCollatorWithPadding

seed = 42

set_seed(seed)
tokenizer = BartTokenizerFast.from_pretrained('facebook/bart-large-mnli')
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

df = pd.read_csv('../data/MD-NLI.csv', header=0)
df = df[["language", "MD_label"]]
dataset = Dataset.from_pandas(df).rename_columns({'language': 'text', "MD_label": 'class'})
label_dt = dataset.train_test_split(0.5, seed = seed)
# label_dt = DatasetDict({
#     'train': train_test['train'],
#     'dev': dev_test['train'],
#     'test': dev_test['test']})

print(label_dt)

label_to_int = ["neutral", "negative", "positive"]
# label_to_int = [0, 1, 2]
template = "The sentiment of this sentence is {}"

def create_input_sequence(sample):
    text = sample["text"]
    label = sample["class"][0]
    contradiction_label = random.choice([x for x in label_to_int if x != label])
    encoded_sequence = tokenizer(text * 2, [template.format(label), template.format(contradiction_label)], truncation = True, padding = 'max_length')
    encoded_sequence["labels"] = [2, 0]
    encoded_sequence["input_sentence"] = tokenizer.batch_decode(encoded_sequence.input_ids)
    return encoded_sequence


# # Split to train and test portions
# df_train = df.head(train_portion)
# df_test = df.tail(test_portion)
# # Convert to Dataset objects
# train_ds = Dataset.from_pandas(df_train, split="train")
# test_ds = Dataset.from_pandas(df_test, split="test")
# # create mappings
label_dt = label_dt.map(create_input_sequence, batched = True, batch_size = 1, remove_columns = ["class", "text"])
# test_dataset = test_ds.map(create_input_sequence, batched = True, batch_size = 1, remove_columns = ["class", "text"])
print(label_dt)
model = BartForSequenceClassification.from_pretrained("facebook/bart-large-mnli", num_labels = len(label_to_int))

DatasetDict({
    train: Dataset({
        features: ['text', 'class'],
        num_rows: 23
    })
    test: Dataset({
        features: ['text', 'class'],
        num_rows: 24
    })
})


Map:   0%|          | 0/23 [00:00<?, ? examples/s]

Map:   0%|          | 0/24 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 46
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels', 'input_sentence'],
        num_rows: 48
    })
})


In [10]:
from transformers import BartForSequenceClassification, Trainer, TrainingArguments, EvalPrediction
import numpy as np

training_args = TrainingArguments(
    output_dir = 'runs/MD',      # Output directory
    num_train_epochs = 1,             # Total number of training epochs
    # per_device_train_batch_size = 16,  # Batch size per device during training
    # per_device_eval_batch_size = 64,   # Batch size for evaluation
    warmup_ratio = 0.01,                # Warmup ratio for learning rate scheduler
    weight_decay = 0.01,               # Strength of weight decay
)

def compute_metrics(p: EvalPrediction):
    metric_acc = load_metric("accuracy")
    metric_f1 = load_metric("f1")
    metric_precision = load_metric("precision")
    metric_recall = load_metric("recall")
    preds = p.predictions[0] if isinstance(p.predictions, tuple) else p.predictions
    preds = np.argmax(preds, axis = 1)
    result = {}
    result["accuracy"] = metric_acc.compute(predictions = preds, references = p.label_ids)["accuracy"]
    result["f1"] = metric_f1.compute(predictions = preds, references = p.label_ids, pos_label=1, average = "weighted")["f1"] #play with "weighted" "micro" "macro"
    result["precision"] = metric_precision.compute(predictions = preds, references = p.label_ids, pos_label=1, average="weighted", sample_weight=None, zero_division='warn')["precision"]
    result["recall"] = metric_recall.compute(predictions = preds, references = p.label_ids, pos_label=1, average="weighted", sample_weight=None, zero_division='warn')["recall"]
    return result

trainer = Trainer(
    model = model,                     # The instantiated model to be trained
    args = training_args,              # Training arguments, defined above
    compute_metrics = compute_metrics, # A function to compute the metrics
    train_dataset = label_dt['train'],     # Training dataset
    eval_dataset = label_dt['test'],       # Evaluation dataset
    tokenizer = tokenizer,              # The tokenizer that was used
    data_collator=data_collator
)

trainer.train()
trainer.evaluate()
# model.save_pretrained(model_path)

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Step,Training Loss


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



  metric_acc = load_metric("accuracy")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




{'eval_loss': 0.6754849553108215,
 'eval_accuracy': 0.5208333333333334,
 'eval_f1': 0.48290398126463696,
 'eval_precision': 0.5294840294840294,
 'eval_recall': 0.5208333333333334,
 'eval_runtime': 173.1239,
 'eval_samples_per_second': 0.277,
 'eval_steps_per_second': 0.035,
 'epoch': 1.0}

Fatal Python error: config_get_locale_encoding: failed to get the locale encoding: nl_langinfo(CODESET) failed
Python runtime state: preinitialized



In [None]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification", model = model, tokenizer = tokenizer, device = 0)
classifier(sequences, label_to_int, multi_label=False)