# Installations and Imports


In [None]:
# Mount Google Drive at /content/drive; the training output_dir and the saved
# model later in this notebook are written under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%capture
# Install the libraries imported by the following cells; %%capture silences pip's output.
# NOTE(review): no versions are pinned, so a fresh runtime may pull releases whose
# APIs differ from the ones this notebook was recorded with — consider pinning.
!pip install transformers
!pip install accelerate
!pip install sentencepiece
!pip install datasets

In [None]:
from transformers import AutoModel
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
import gc
import sentencepiece
from torch.utils.data import Dataset, DataLoader
import re
from tqdm import tqdm
from accelerate import Accelerator
import pickle

import os

In [None]:
SEED = 1111

# Seed every random number generator the notebook can reach, not only torch's,
# so that any use of Python's `random` or NumPy is repeatable as well.
import random

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Ask cuDNN for deterministic kernels and disable its autotuner, which may
# otherwise pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# Dataset preparation

In [None]:
!wget https://nlp.stanford.edu/projects/snli/snli_1.0.zip

--2023-11-28 16:32:35--  https://nlp.stanford.edu/projects/snli/snli_1.0.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 94550081 (90M) [application/zip]
Saving to: ‘snli_1.0.zip’


2023-11-28 16:32:45 (10.6 MB/s) - ‘snli_1.0.zip’ saved [94550081/94550081]



In [None]:
from zipfile import ZipFile

# specifying the zip file name
file_name = "snli_1.0.zip"

# The archive handle is bound to `archive` rather than `zip`: a with-target
# stays in the notebook's global namespace after the block ends, so calling it
# `zip` would replace the builtin zip() for every later cell.
with ZipFile(file_name, 'r') as archive:
    # printing all the contents of the zip file
    archive.printdir()

    # extracting all the files into the current working directory
    print('Extracting all the files now...')
    archive.extractall()
    print('Done!')

File Name                                             Modified             Size
snli_1.0/                                      2015-08-29 08:57:10            0
snli_1.0/.DS_Store                             2015-08-29 08:57:16         6148
__MACOSX/                                      2015-08-29 09:00:04            0
__MACOSX/snli_1.0/                             2015-08-29 09:00:04            0
__MACOSX/snli_1.0/._.DS_Store                  2015-08-29 08:57:16          120
snli_1.0/Icon                                 2015-05-21 16:21:08            0
__MACOSX/snli_1.0/._Icon                      2015-05-21 16:21:08       340709
snli_1.0/README.txt                            2015-08-29 08:59:48         5828
__MACOSX/snli_1.0/._README.txt                 2015-08-29 08:59:48          171
snli_1.0/snli_1.0_dev.jsonl                    2015-08-17 10:34:22      9745714
snli_1.0/snli_1.0_dev.txt                      2015-08-17 10:34:24      7565773
snli_1.0/snli_1.0_test.jsonl            

In [None]:
# Tab-separated SNLI split files, in train / dev / test order.
_snli_split_paths = (
    'snli_1.0/snli_1.0_train.txt',
    'snli_1.0/snli_1.0_dev.txt',
    'snli_1.0/snli_1.0_test.txt',
)

# Only the label and the two raw sentences are kept from each file.
_snli_columns = ['gold_label','sentence1','sentence2']

df_train, df_dev, df_test = (
    pd.read_csv(split_path, sep='\t')[_snli_columns]
    for split_path in _snli_split_paths
)

print(len(df_train), len(df_dev), len(df_test))

550152 10000 10000


In [None]:
max_sentence_len=100
def trim_sentence(sent, max_len=None):
    """Keep at most ``max_len`` whitespace-separated words of ``sent``.

    Args:
        sent: The sentence to trim. Values that are not ``str`` (for example
            the float NaN pandas uses for a missing cell) are returned
            unchanged.
        max_len: Maximum number of words to keep. Defaults to the module-level
            ``max_sentence_len``.

    Returns:
        The kept words re-joined with single spaces (so runs of whitespace are
        collapsed), or ``sent`` itself when it is not a string.
    """
    # Explicit type guard instead of a bare `except:` — non-string input is
    # passed through untouched and real errors are no longer swallowed.
    if not isinstance(sent, str):
        return sent
    if max_len is None:
        max_len = max_sentence_len
    return " ".join(sent.split()[:max_len])

In [None]:
def _trim_then_filter(frame):
    """Trim both sentence columns of ``frame`` in place, then return only the
    rows labelled 'entailment' or 'contradiction'."""
    for column in ('sentence1', 'sentence2'):
        frame[column] = frame[column].apply(trim_sentence)
    keep_mask = frame['gold_label'].isin(['entailment','contradiction'])
    return frame.loc[keep_mask]

df_train = _trim_then_filter(df_train)
df_dev = _trim_then_filter(df_dev)
df_test = _trim_then_filter(df_test)

print(len(df_train), len(df_dev), len(df_test))

366603 6607 6605


In [None]:
# Show the first two rows of the filtered training frame as a quick sanity check.
df_train.head(2)

Unnamed: 0,gold_label,sentence1,sentence2
1,contradiction,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."
2,entailment,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse."


In [None]:
# Integer class id for each kept SNLI label: the position in this list is the id.
labels_dict = {
    label_name: class_id
    for class_id, label_name in enumerate(["entailment", 'contradiction'])
}

In [None]:
def prepare_df(input_df, label_map=None):
  """Build a clean frame of integer labels and string sentence pairs.

  Args:
    input_df: DataFrame with 'gold_label', 'sentence1' and 'sentence2' columns.
    label_map: Mapping from label string to integer id. Defaults to the
      module-level ``labels_dict``.

  Returns:
    A new DataFrame with a fresh 0..n-1 index and the columns 'gold_label'
    (mapped ids), 'sentence1' and 'sentence2' (every value a ``str``).

  Raises:
    KeyError: If a 'gold_label' value is missing from ``label_map``.
  """
  if label_map is None:
    label_map = labels_dict

  def _as_text(values):
    # Non-string cells (e.g. NaN from an empty field) are stringified so that
    # downstream consumers always receive text.
    return [value if type(value) == str else str(value) for value in values]

  # Iterate whole columns instead of calling .iloc[i] per row: same result,
  # without constructing a Series object for every single row.
  temp_dict={
    'gold_label': [label_map[label] for label in input_df['gold_label']],
    'sentence1': _as_text(input_df['sentence1']),
    'sentence2': _as_text(input_df['sentence2'])
  }
  return pd.DataFrame(data=temp_dict)

In [None]:
# Convert the three filtered splits, in train / dev / test order.
train_df, dev_df, test_df = map(prepare_df, (df_train, df_dev, df_test))

100%|██████████| 366603/366603 [00:29<00:00, 12295.05it/s]
100%|██████████| 6607/6607 [00:00<00:00, 15389.76it/s]
100%|██████████| 6605/6605 [00:00<00:00, 16476.56it/s]


In [None]:
# Display the prepared training frame (integer labels, re-numbered index).
train_df

Unnamed: 0,gold_label,sentence1,sentence2
0,1,A person on a horse jumps over a broken down a...,"A person is at a diner, ordering an omelette."
1,0,A person on a horse jumps over a broken down a...,"A person is outdoors, on a horse."
2,0,Children smiling and waving at camera,There are children present
3,1,Children smiling and waving at camera,The kids are frowning
4,1,A boy is jumping on skateboard in the middle o...,The boy skates down the sidewalk.
...,...,...,...
366598,0,A dog with a blue collar plays ball outside.,a dog is outside
366599,0,Four dirty and barefooted children.,four children have dirty feet.
366600,1,Four dirty and barefooted children.,four kids won awards for 'cleanest feet'
366601,1,A man is surfing in a bodysuit in beautiful bl...,A man in a business suit is heading to a board...


In [None]:
# NOTE(review): DebertaForSequenceClassification is imported here but not used
# in any cell visible in this notebook.
from transformers import AutoTokenizer, DebertaForSequenceClassification

# Checkpoint name shared by the tokenizer here and the model loaded later.
model_checkpoint = "microsoft/deberta-v3-large"

# use_fast=True requests the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

In [None]:
# NOTE: this import rebinds `Dataset`, which the imports cell took from
# torch.utils.data, to the Hugging Face `datasets` class.
from datasets import DatasetDict, Dataset

# Position of the first train_df row used for fine-tuning; earlier rows are
# left out. The name was never assigned anywhere in the notebook, so this cell
# failed on a fresh kernel. The recorded run kept 206603 of the 366603
# training rows, which corresponds to a start position of 160000.
upper_index = 160000

dataset = DatasetDict({'train': Dataset.from_pandas(train_df.iloc[upper_index:]), 'dev':Dataset.from_pandas(dev_df), 'test':Dataset.from_pandas(test_df)})
dataset

DatasetDict({
    train: Dataset({
        features: ['gold_label', 'sentence1', 'sentence2'],
        num_rows: 206603
    })
    dev: Dataset({
        features: ['gold_label', 'sentence1', 'sentence2'],
        num_rows: 6607
    })
    test: Dataset({
        features: ['gold_label', 'sentence1', 'sentence2'],
        num_rows: 6605
    })
})

In [None]:
max_length = 256

def tokenize_preprocess_function(examples):
    """Tokenize a batch of sentence pairs for sequence classification.

    Args:
        examples: Batch mapping with 'sentence1', 'sentence2' and 'gold_label'
            lists, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        A dict with 'input_ids', 'attention_mask' and 'label', one entry per
        example in the batch.
    """
    # No padding here: sequences keep their own length (truncated to
    # max_length) and are padded per training batch by the Trainer's collator.
    # Padding inside map() while discarding the attention mask would leave pad
    # tokens in input_ids that the model then treats as real input.
    encoded = tokenizer(
        examples['sentence1'],
        examples['sentence2'],
        max_length=max_length,
        truncation=True,
    )

    data_dict={
        'input_ids': encoded['input_ids'],
        'attention_mask': encoded['attention_mask'],
        'label': list(examples['gold_label']),
        }

    return data_dict

# Tokenize every split batch-wise; the new columns are added next to the existing ones.
tokenized_datasets = dataset.map(
    function=tokenize_preprocess_function,
    batched=True,
)
tokenized_datasets

Map:   0%|          | 0/206603 [00:00<?, ? examples/s]

Map:   0%|          | 0/6607 [00:00<?, ? examples/s]

Map:   0%|          | 0/6605 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['gold_label', 'sentence1', 'sentence2', 'input_ids', 'label'],
        num_rows: 206603
    })
    dev: Dataset({
        features: ['gold_label', 'sentence1', 'sentence2', 'input_ids', 'label'],
        num_rows: 6607
    })
    test: Dataset({
        features: ['gold_label', 'sentence1', 'sentence2', 'input_ids', 'label'],
        num_rows: 6605
    })
})

# Hugging Face Model and Trainer

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Two output classes, matching the two entries of labels_dict.
num_labels=2

# Load the checkpoint named by model_checkpoint with a num_labels-way classification head.
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)

In [None]:
# NOTE(review): the recorded output shows a warning raised on the load_metric
# call; it appears to be deprecated in newer `datasets` releases — confirm
# before re-running with unpinned packages.
from datasets import load_metric

# `task` selects the GLUE metric configuration and is also interpolated into
# the output paths used further down. The data in this notebook is SNLI
# restricted to two labels, so "mnli" here is only a metric/path tag.
task="mnli"
metric = load_metric('glue', task)

  metric = load_metric('glue', task)


In [None]:
batch_size=16
metric_name = "accuracy"
model_checkpoint = "microsoft/deberta-v3-large"
# Last path component of the checkpoint id, used to name the run directory.
model_name = model_checkpoint.rsplit("/", 1)[-1]

# Keyword arguments for TrainingArguments, grouped by purpose.
_training_options = dict(
    # where checkpoints go and how many are retained
    output_dir=f"/content/drive/MyDrive/shroom/{model_name}-finetuned-{task}",
    save_strategy = "steps",
    save_steps=500,
    save_total_limit=2,
    push_to_hub=False,
    # evaluation / logging cadence and best-checkpoint selection
    evaluation_strategy = "steps",
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    # optimisation settings
    seed=0,
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=1,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    fp16=True,
)

args = TrainingArguments(**_training_options)

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: A ``(predictions, labels)`` pair, where ``predictions``
            holds one row of class scores per example and ``labels`` the
            reference class ids.

    Returns:
        ``{"accuracy": value}`` with ``value`` the fraction of examples whose
        highest-scoring class equals the reference label.

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    labels = np.asarray(labels)
    if predictions.shape != labels.shape:
        raise ValueError(
            f"Mismatch between predictions {predictions.shape} and labels {labels.shape}"
        )
    # Accuracy is computed directly with NumPy, so this function does not
    # depend on a separately loaded metric object.
    return {"accuracy": float((predictions == labels).mean())}

In [None]:
# Wire the model, the training arguments, the train/dev splits, the tokenizer
# and the metric callback into a single Trainer.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    eval_dataset=tokenized_datasets['dev'],
    train_dataset=tokenized_datasets["train"],
)

In [None]:
# Start fine-tuning; the step table below reports loss and accuracy every 500 steps.
trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
500,0.1072,0.091101,0.977902
1000,0.1105,0.11009,0.972756
1500,0.1162,0.10584,0.975026
2000,0.1188,0.120354,0.970334
2500,0.1125,0.080829,0.980021
3000,0.1013,0.074819,0.979718
3500,0.1084,0.083345,0.980324
4000,0.1016,0.086572,0.981081
4500,0.0969,0.076333,0.980627
5000,0.1071,0.080635,0.980475


Could not locate the best model at /content/drive/MyDrive/shroom/deberta-v3-large-finetuned-mnli/checkpoint-11000/pytorch_model.bin, if you are running a distributed training on multiple nodes, you should activate `--save_on_each_node`.


TrainOutput(global_step=12913, training_loss=0.09185990686799927, metrics={'train_runtime': 8971.1104, 'train_samples_per_second': 23.03, 'train_steps_per_second': 1.439, 'total_flos': 3.3918724615316988e+16, 'train_loss': 0.09185990686799927, 'epoch': 1.0})

In [None]:
# Save the trainer's current model to Drive.
# NOTE(review): the training log above reports that the best checkpoint could
# not be located, so this may be the final-step model rather than the best one
# — confirm. The target name ends in `.pt` although save_model presumably
# writes a directory of files; verify the on-disk layout.
trainer.save_model(f"/content/drive/MyDrive/shroom/{model_name}-finetuned-{task}_best.pt")

In [None]:
# Evaluate with no arguments, i.e. on the eval_dataset given to the Trainer (the dev split).
trainer.evaluate()

{'eval_loss': 0.051058895885944366,
 'eval_accuracy': 0.9859240199788104,
 'eval_runtime': 46.8023,
 'eval_samples_per_second': 141.168,
 'eval_steps_per_second': 8.824,
 'epoch': 1.0}

In [None]:
# Run inference on the held-out test split; the result is scored two cells below.
predictions = trainer.predict(tokenized_datasets["test"])

In [None]:
def compute_test_metrics(predictions, labels):
    """Compute accuracy for the output of ``trainer.predict``.

    Args:
        predictions: Object whose ``.predictions`` attribute holds one row of
            class scores per example.
        labels: Reference class ids (any sequence or array), one per example.

    Returns:
        ``{"accuracy": value}`` with ``value`` the fraction of examples whose
        highest-scoring class equals the reference label.

    Raises:
        ValueError: If the number of predictions and labels differ.
    """
    predicted = np.argmax(predictions.predictions, axis=1)
    # Labels may arrive as a plain Python list, so normalise before comparing.
    references = np.asarray(labels)
    if predicted.shape != references.shape:
        raise ValueError(
            f"Mismatch between predictions {predicted.shape} and labels {references.shape}"
        )
    # Accuracy is computed directly with NumPy, so this function does not
    # depend on a separately loaded metric object.
    return {"accuracy": float((predicted == references).mean())}

In [None]:
# Score the test predictions against the test split's 'label' column.
compute_test_metrics(predictions, tokenized_datasets["test"]['label'])

{'accuracy': 0.9851627554882665}

In [None]:
# trainer.push_to_hub()