#### 1) Locate and download the Stanford NLI (SNLI) dataset from the Hugging Face hub.

In [None]:
!pip install datasets

In [1]:
from datasets import load_dataset

SNLI_dataset = load_dataset('snli')

Found cached dataset snli (C:/Users/Zoe/.cache/huggingface/datasets/snli/plain_text/1.0.0/1f60b67533b65ae0275561ff7828aad5ee4282d0e6f844fd148d05d3c6ea251b)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
print(SNLI_dataset)

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 550152
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 10000
    })
})


#### 2) Use the t5-small model to translate the text to French, and build a new dataset of French texts.

In [5]:
!pip install datsets transformers[sentencepiece]
!pip install sentencepiece
!pip install transformers

Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.2 transformers-4.26.1


In [2]:
import torch
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained( "t5-small")

In [25]:
# Define function to translate a text to French
def translate_text(text):
    input_ids = tokenizer.encode(text, return_tensors='pt')
    output_ids = model.generate(input_ids, max_length=50, num_beams=4, early_stopping=True)
    translation = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return translation

In [26]:
# Define function to translate premise and hypothesis in a SNLI example to French
def translate_example(example):
    premise_fr = translate_text(example['premise'])
    hypothesis_fr = translate_text(example['hypothesis'])
    label = example['label']
    return {'premise': premise_fr, 'hypothesis': hypothesis_fr, 'label': label}

In [None]:
!pip install pandarallel

In [2]:
import pandas as pd
from pandarallel import pandarallel

In [28]:
train_dataset = pd.DataFrame(SNLI_dataset['train'][0:1000])
test_dataset = pd.DataFrame(SNLI_dataset['test'][0:1000])
dev_dataset = pd.DataFrame(SNLI_dataset['validation'][0:1000])

In [29]:
unique_premise = pd.DataFrame(pd.concat([train_dataset['premise'], test_dataset['premise'], dev_dataset['premise']]).unique(), columns=['premise'])
unique_hypothesis = pd.DataFrame(pd.concat([train_dataset['hypothesis'], test_dataset['hypothesis'], dev_dataset['hypothesis']]).unique(), columns=['hypothesis'])

In [30]:
unique_premise.size

930

In [31]:
unique_hypothesis.size

2980

In [32]:
# Translate SNLI dataset to French

pandarallel.initialize(nb_workers=10, progress_bar=True)
translate_prompt = 'Translate from English to French: '

unique_premise['premise_fr'] = translate_prompt + unique_premise['premise'].astype(str)
unique_hypothesis['hypothesis_fr'] = translate_prompt + unique_hypothesis['hypothesis'].astype(str)

unique_premise['premise_fr'] = unique_premise['premise_fr'].parallel_apply(translate_text)
unique_hypothesis['hypothesis_fr'] = unique_hypothesis['hypothesis_fr'].parallel_apply(translate_text)

INFO: Pandarallel will run on 10 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=93), Label(value='0 / 93'))), HBox…

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=298), Label(value='0 / 298'))), HB…

In [38]:
premise_dict = unique_premise.set_index('premise')['premise_fr'].to_dict()
hypo_dict = unique_hypothesis.set_index('hypothesis')['hypothesis_fr'].to_dict()

In [40]:
def add_translations_to_df(df):
  df['premise_fr'] = df['premise'].map(premise_dict)
  df['hypothesis_fr'] = df['hypothesis'].map(hypo_dict)

add_translations_to_df(train_dataset)
add_translations_to_df(test_dataset)
add_translations_to_df(dev_dataset)

In [41]:
print(train_dataset.head())

                                             premise  \
0  A person on a horse jumps over a broken down a...   
1  A person on a horse jumps over a broken down a...   
2  A person on a horse jumps over a broken down a...   
3              Children smiling and waving at camera   
4              Children smiling and waving at camera   

                                          hypothesis  label  \
0  A person is training his horse for a competition.      1   
1      A person is at a diner, ordering an omelette.      2   
2                  A person is outdoors, on a horse.      0   
3                  They are smiling at their parents      1   
4                         There are children present      0   

                                          premise_fr  \
0  Une personne sur un cheval saute au-dessus d'u...   
1  Une personne sur un cheval saute au-dessus d'u...   
2  Une personne sur un cheval saute au-dessus d'u...   
3            Enfants souriants et waving à huis clos   
4   

In [42]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
path = '/content/drive/My Drive/NLP244/'

In [None]:
# uncomment to save translations

# train_dataset.to_csv(path + 'train_dataset.csv')
# test_dataset.to_csv(path + 'test_dataset.csv')
# dev_dataset.to_csv(path + 'dev_dataset.csv')

In [259]:
# uncomment to load translations

# train_dataset = pd.read_csv(path + 'train_dataset.csv')
# test_dataset = pd.read_csv(path + 'test_dataset.csv')
# dev_dataset = pd.read_csv(path + 'dev_dataset.csv')

In [260]:
from sklearn.model_selection import train_test_split

combined_df = pd.concat([train_dataset, dev_dataset], ignore_index=True)
train_dataset, dev_dataset = train_test_split(combined_df, test_size=0.2, random_state=42)


#### 3) Now you have a working dataset for French Natural Language Inference (FNLI). Build a HuggingFace Dataset class out of it.

In [261]:
import datasets
from datasets import Dataset

# Using my personal truncated datset due to time constraints

train_dataset = Dataset.from_dict({'premise': train_dataset['premise_fr'].astype(str), 'hypothesis': train_dataset['hypothesis_fr'].astype(str), 'label': train_dataset['label']})
test_dataset = Dataset.from_dict({'premise': test_dataset['premise_fr'].astype(str), 'hypothesis': test_dataset['hypothesis_fr'].astype(str), 'label': test_dataset['label']})
eval_dataset = Dataset.from_dict({'premise': dev_dataset['premise_fr'].astype(str), 'hypothesis': dev_dataset['hypothesis_fr'].astype(str), 'label': dev_dataset['label']})

# create a HuggingFace Dataset from the dictionary
fnli_dataset = datasets.DatasetDict({'train': train_dataset, 'test': test_dataset, 'validation': eval_dataset})

In [None]:
# # using Brandon's pre-translated dataset for fine-tuning # did not use because of time constraints
# fnli_dataset = load_dataset("Brendan/nlp244_french_snli")

In [262]:
print(fnli_dataset)

DatasetDict({
    train: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1600
    })
    test: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 1000
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label'],
        num_rows: 400
    })
})


#### 4) Finetune a French language model DistilCamemBERT with the dataset you created in Step 3. Modify your use of the Trainer class to additionally evaluate F1 (Hint: look at the compute_metrics flag).

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("cmarkea/distilcamembert-base-nli")
model = AutoModelForSequenceClassification.from_pretrained("cmarkea/distilcamembert-base-nli")

In [264]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

In [None]:
def preprocess_function(examples):
    return tokenizer(examples['premise'], examples['hypothesis'], truncation=True, padding=True, return_token_type_ids=True)

tokenized_fnli = fnli_dataset.map(preprocess_function, batched=True)

In [None]:
train_dataset = tokenized_fnli['train'].filter(lambda example: example['label'] != -1)
val_dataset = tokenized_fnli['validation'].filter(lambda example: example['label'] != -1)
test_dataset = tokenized_fnli['test'].filter(lambda example: example['label'] != -1)

train_dataset.set_format(type="pt", columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])
test_dataset.set_format(type="pt", columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])
val_dataset.set_format(type="pt", columns=['input_ids', 'attention_mask', 'token_type_ids', 'label'])


In [269]:
from transformers import TrainingArguments, Trainer, EvalPrediction
from sklearn.metrics import f1_score
import numpy as np


# Define the metric and evaluation modules
def my_compute_metrics(eval_pred: EvalPrediction) -> dict:
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    predictions = np.argmax(logits, axis=1)
    accuracy = np.mean(predictions == labels)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": accuracy, "f1_score": f1}

In [None]:
!pip install wandb

In [272]:
# Define the training arguments
training_args = TrainingArguments(
    output_dir = './results',
    do_train=True,
    do_eval=True,
    evaluation_strategy='epoch',
    learning_rate=1e-3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    num_train_epochs=5,
    weight_decay=0.01,
    save_strategy='epoch',
    metric_for_best_model='f1_score',
    logging_dir='./logs',
    logging_steps=10,
    report_to='wandb',
    run_name='distilcamembert-finetuning-fnli',
    load_best_model_at_end=True,
    dataloader_num_workers=1  # set to 0 when debugging and >1 when running!
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=my_compute_metrics
)

# Train the model
trainer.train()

PyTorch: setting up devices
The following columns in the training set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1583
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 250
  Number of trainable parameters = 68097027
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


  0%|          | 0/250 [00:00<?, ?it/s]

{'loss': 1.358, 'learning_rate': 1.9200000000000003e-05, 'epoch': 0.2}
{'loss': 1.023, 'learning_rate': 1.8400000000000003e-05, 'epoch': 0.4}
{'loss': 0.9001, 'learning_rate': 1.76e-05, 'epoch': 0.6}
{'loss': 0.8951, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.8}
{'loss': 0.9512, 'learning_rate': 1.6000000000000003e-05, 'epoch': 1.0}


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 395
  Batch size = 64


  0%|          | 0/7 [00:00<?, ?it/s]

Saving model checkpoint to ./results\checkpoint-50
Configuration saved in ./results\checkpoint-50\config.json


{'eval_loss': 0.7855406999588013, 'eval_accuracy': 0.6455696202531646, 'eval_f1_score': 0.6422018329342085, 'eval_runtime': 24.9991, 'eval_samples_per_second': 15.801, 'eval_steps_per_second': 0.28, 'epoch': 1.0}


Model weights saved in ./results\checkpoint-50\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-50\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-50\special_tokens_map.json


{'loss': 0.7514, 'learning_rate': 1.5200000000000002e-05, 'epoch': 1.2}
{'loss': 0.832, 'learning_rate': 1.4400000000000001e-05, 'epoch': 1.4}
{'loss': 0.7207, 'learning_rate': 1.3600000000000002e-05, 'epoch': 1.6}
{'loss': 0.7601, 'learning_rate': 1.2800000000000001e-05, 'epoch': 1.8}
{'loss': 0.7963, 'learning_rate': 1.2e-05, 'epoch': 2.0}


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 395
  Batch size = 64


  0%|          | 0/7 [00:00<?, ?it/s]

Saving model checkpoint to ./results\checkpoint-100
Configuration saved in ./results\checkpoint-100\config.json


{'eval_loss': 0.7417420148849487, 'eval_accuracy': 0.6911392405063291, 'eval_f1_score': 0.6890609249119273, 'eval_runtime': 24.6467, 'eval_samples_per_second': 16.027, 'eval_steps_per_second': 0.284, 'epoch': 2.0}


Model weights saved in ./results\checkpoint-100\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-100\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-100\special_tokens_map.json


{'loss': 0.6759, 'learning_rate': 1.1200000000000001e-05, 'epoch': 2.2}
{'loss': 0.695, 'learning_rate': 1.04e-05, 'epoch': 2.4}
{'loss': 0.6776, 'learning_rate': 9.600000000000001e-06, 'epoch': 2.6}
{'loss': 0.6639, 'learning_rate': 8.8e-06, 'epoch': 2.8}
{'loss': 0.6633, 'learning_rate': 8.000000000000001e-06, 'epoch': 3.0}


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 395
  Batch size = 64


  0%|          | 0/7 [00:00<?, ?it/s]

Saving model checkpoint to ./results\checkpoint-150
Configuration saved in ./results\checkpoint-150\config.json


{'eval_loss': 0.7754604816436768, 'eval_accuracy': 0.7189873417721518, 'eval_f1_score': 0.7189957752827394, 'eval_runtime': 26.7742, 'eval_samples_per_second': 14.753, 'eval_steps_per_second': 0.261, 'epoch': 3.0}


Model weights saved in ./results\checkpoint-150\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-150\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-150\special_tokens_map.json


{'loss': 0.6231, 'learning_rate': 7.2000000000000005e-06, 'epoch': 3.2}
{'loss': 0.5532, 'learning_rate': 6.4000000000000006e-06, 'epoch': 3.4}
{'loss': 0.5699, 'learning_rate': 5.600000000000001e-06, 'epoch': 3.6}
{'loss': 0.5733, 'learning_rate': 4.800000000000001e-06, 'epoch': 3.8}
{'loss': 0.6458, 'learning_rate': 4.000000000000001e-06, 'epoch': 4.0}


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 395
  Batch size = 64


  0%|          | 0/7 [00:00<?, ?it/s]

Saving model checkpoint to ./results\checkpoint-200
Configuration saved in ./results\checkpoint-200\config.json


{'eval_loss': 0.7543293237686157, 'eval_accuracy': 0.7189873417721518, 'eval_f1_score': 0.7232986422859841, 'eval_runtime': 24.9156, 'eval_samples_per_second': 15.854, 'eval_steps_per_second': 0.281, 'epoch': 4.0}


Model weights saved in ./results\checkpoint-200\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-200\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-200\special_tokens_map.json


{'loss': 0.5382, 'learning_rate': 3.2000000000000003e-06, 'epoch': 4.2}
{'loss': 0.5855, 'learning_rate': 2.4000000000000003e-06, 'epoch': 4.4}
{'loss': 0.51, 'learning_rate': 1.6000000000000001e-06, 'epoch': 4.6}
{'loss': 0.5541, 'learning_rate': 8.000000000000001e-07, 'epoch': 4.8}
{'loss': 0.5367, 'learning_rate': 0.0, 'epoch': 5.0}


The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 395
  Batch size = 64


  0%|          | 0/7 [00:00<?, ?it/s]

Saving model checkpoint to ./results\checkpoint-250
Configuration saved in ./results\checkpoint-250\config.json


{'eval_loss': 0.7396425604820251, 'eval_accuracy': 0.7164556962025317, 'eval_f1_score': 0.7156849969709582, 'eval_runtime': 24.3651, 'eval_samples_per_second': 16.212, 'eval_steps_per_second': 0.287, 'epoch': 5.0}


Model weights saved in ./results\checkpoint-250\pytorch_model.bin
tokenizer config file saved in ./results\checkpoint-250\tokenizer_config.json
Special tokens file saved in ./results\checkpoint-250\special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from ./results\checkpoint-200 (score: 0.7232986422859841).


{'train_runtime': 1486.3179, 'train_samples_per_second': 5.325, 'train_steps_per_second': 0.168, 'train_loss': 0.7221389923095703, 'epoch': 5.0}


TrainOutput(global_step=250, training_loss=0.7221389923095703, metrics={'train_runtime': 1486.3179, 'train_samples_per_second': 5.325, 'train_steps_per_second': 0.168, 'train_loss': 0.7221389923095703, 'epoch': 5.0})

#### 5) Run your best model in inference mode, and calculate the test F1 score for your FNLI dataset/model.

In [274]:
# Evaluate the model on the test set
model.eval()
eval_results = trainer.evaluate(test_dataset)

The following columns in the evaluation set don't have a corresponding argument in `CamembertForSequenceClassification.forward` and have been ignored: premise, hypothesis. If premise, hypothesis are not expected by `CamembertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 988
  Batch size = 64


  0%|          | 0/16 [00:00<?, ?it/s]

In [275]:
print(eval_results)

{'eval_loss': 0.7963597774505615, 'eval_accuracy': 0.6821862348178138, 'eval_f1_score': 0.6850939350882376, 'eval_runtime': 56.7471, 'eval_samples_per_second': 17.411, 'eval_steps_per_second': 0.282, 'epoch': 5.0}
