<a href="https://colab.research.google.com/github/vempaliakhil96/kaggle-entailment-competition/blob/main/03-exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
# Install the notebook's dependencies; versions are unpinned, so results may drift between runs.
# NOTE(review): consider `%pip` with pinned versions so the install targets the kernel's environment.
! pip install fastkaggle fastai pandas fastcore tqdm datasets transformers[torch] accelerate evaluate --quiet

In [None]:
from fastkaggle import *
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from fastai.text.all import *
from tqdm import tqdm
import pandas as pd
import numpy as np
import evaluate
import datasets
from transformers import TrainingArguments,Trainer
import torch
from torch.utils.data import DataLoader
import os
try:
    from google.colab import userdata
except ImportError:
    # Not running on Colab (e.g. a Kaggle kernel or a local machine): rely on
    # whatever Kaggle credentials are already configured in the environment.
    userdata = None

if userdata is not None:
    # Copy the Kaggle credentials out of Colab's secret store into the
    # environment variables of this process.
    for _secret in ("KAGGLE_KEY", "KAGGLE_USERNAME"):
        os.environ[_secret] = userdata.get(_secret)

# Register tqdm's progress_apply on pandas objects.
tqdm.pandas()


In [None]:
# Only request a Kaggle API handle when not already running inside a Kaggle kernel.
if not iskaggle:
    api = import_kaggle()

In [None]:
# Competition identifier handed to setup_comp in the next cell.
comp_name = "contradictory-my-dear-watson"

In [None]:
# Path-like handle to the competition files (train.csv / test.csv are read from it later).
dpath = setup_comp(comp_name)

In [None]:
# Show which competition files are available under dpath.
dpath.ls()

(#3) [Path('contradictory-my-dear-watson/test.csv'),Path('contradictory-my-dear-watson/sample_submission.csv'),Path('contradictory-my-dear-watson/train.csv')]

In [None]:
# Pick the accelerator: CUDA takes priority, then Apple MPS, otherwise CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Alternative checkpoint that was previously assigned here:
# 'cross-encoder/nli-distilroberta-base'
mname = 'microsoft/deberta-v3-small'

# Three output classes, one per NLI label.
tokenizer = AutoTokenizer.from_pretrained(mname)
model = AutoModelForSequenceClassification.from_pretrained(mname, num_labels=3).to(device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
def baseline_prediction(premise, hypothesis):
    """Predict the NLI label for a single premise/hypothesis pair.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        premise: premise text.
        hypothesis: hypothesis text.

    Returns:
        One of 'contradiction', 'entailment' or 'neutral'.
    """
    # NOTE(review): this index -> label order differs from `comp_id2label`,
    # which is assigned to model.config elsewhere in the notebook — confirm it
    # matches the checkpoint being evaluated.
    label_mapping = ['contradiction', 'entailment', 'neutral']
    toks = tokenizer([premise], [hypothesis], return_tensors='pt', padding=True, truncation=True).to(device)
    # Inference only: disable autograd so no computation graph is built or kept alive.
    with torch.no_grad():
        scores = model(**toks).logits
    return label_mapping[scores.argmax(dim=1)[0].item()]

def bulk_baseline_predict(premises, hypotheses, bs=8):
    """Predict NLI labels for many premise/hypothesis pairs in mini-batches.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        premises: sequence of premise texts (list, pandas Series, ...).
        hypotheses: sequence of hypothesis texts, same length as `premises`.
        bs: number of pairs per forward pass (default 8).

    Returns:
        A list with one label per pair, each one of 'contradiction',
        'entailment' or 'neutral'. Empty input yields an empty list.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    # NOTE(review): this index -> label order differs from `comp_id2label`,
    # which is assigned to model.config elsewhere in the notebook — confirm it
    # matches the checkpoint being evaluated.
    label_mapping = ['contradiction', 'entailment', 'neutral']

    # Plain lists slice positionally and are accepted by the tokenizer,
    # whatever sequence type the caller passed in.
    premises, hypotheses = list(premises), list(hypotheses)
    if len(premises) != len(hypotheses):
        raise ValueError(
            f"premises and hypotheses must have the same length, "
            f"got {len(premises)} and {len(hypotheses)}"
        )

    labels = []
    for i in tqdm(range(0, len(premises), bs)):
        toks = tokenizer(premises[i:i+bs], hypotheses[i:i+bs], return_tensors='pt', padding=True, truncation=True).to(device)
        # Inference only: disable autograd so no computation graph is built or kept alive.
        with torch.no_grad():
            scores = model(**toks).logits
        labels.extend(label_mapping[idx] for idx in scores.argmax(dim=1).tolist())
    return labels

In [None]:
# Label names in id order: 0 -> entailment, 1 -> neutral, 2 -> contradiction.
comp_id2label = dict(enumerate(("entailment", "neutral", "contradiction")))
# Inverse lookup: label name -> id.
label2comp_id = {name: idx for idx, name in comp_id2label.items()}

# Store both directions on the model config.
model.config.label2id = label2comp_id
model.config.id2label = comp_id2label

In [None]:
# Upper bound on sequence length; tokfn tokenizes with truncation=True.
tokenizer.model_max_length = 512

## Data Processing

In [None]:
# Load the raw competition splits from the files listed under dpath.
train_df = pd.read_csv(dpath/"train.csv")
test_df = pd.read_csv(dpath/"test.csv")

def _process_text(text):
    """Clean one text value.

    Applies `spec_add_spaces`, then `rm_useless_spaces`, then `fix_html`,
    and returns the result.
    """
    spaced = spec_add_spaces(text)
    squeezed = rm_useless_spaces(spaced)
    return fix_html(squeezed)

def tokfn(x):
    """Tokenize the "input" field of `x` with truncation enabled."""
    texts = x["input"]
    return tokenizer(texts, truncation=True)

# Clean both text columns of both frames with the same rule chain, then pack
# each pair into the single "input" text field that tokfn tokenizes.
for _df in (train_df, test_df):
    for _col in ("premise", "hypothesis"):
        _df[_col] = _df[_col].apply(_process_text)
    _df["input"] = "premise: " + _df.premise + "\nhypothesis: " + _df.hypothesis

# Round-trip the labels through their names so the stored ids follow
# model.config.label2id.
train_df.label = train_df.label.map(comp_id2label).map(model.config.label2id)

train_ds = datasets.Dataset.from_pandas(train_df[["input", "label"]])
test_ds = datasets.Dataset.from_pandas(test_df[["id", "input"]])
# Fixed seed: without it the 10% validation split changes on every run, so
# validation accuracy is not comparable across runs.
train_ds = train_ds.train_test_split(test_size=0.1, seed=42)

In [None]:
# Tokenize both splits produced by train_test_split; batched=True hands tokfn batches of rows.
train_ds = train_ds.map(tokfn, batched=True)

Map:   0%|          | 0/10908 [00:00<?, ? examples/s]

Map:   0%|          | 0/1212 [00:00<?, ? examples/s]

## Training

In [None]:
# Training hyper-parameters: per-device batch size, number of epochs, peak learning rate.
bs, epochs, lr = 16, 4, 8e-5

In [None]:
# Accuracy metric used by compute_metrics.
accuracy = evaluate.load("accuracy")

args = TrainingArguments(
    output_dir='my-dear-watson-nli-model',
    # Optimisation schedule: cosine decay with warmup over the first 10% of steps.
    learning_rate=lr,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    weight_decay=0.01,
    num_train_epochs=epochs,
    # Mixed precision only when a CUDA device is present.
    fp16=torch.cuda.is_available(),
    # Evaluate once per epoch, with an eval batch twice the training batch.
    evaluation_strategy="epoch",
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs * 2,
    report_to='none',
)

In [None]:
def compute_metrics(eval_pred):
    """Score an evaluation pass with the notebook-level `accuracy` metric.

    Args:
        eval_pred: a (predictions, labels) pair; the predicted class is the
            argmax of `predictions` along axis 1.

    Returns:
        Whatever `accuracy.compute` returns for the predicted classes.
    """
    logits, references = eval_pred
    predicted_ids = np.argmax(logits, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Wire the model, training arguments, tokenized splits and metric together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds['train'],
    eval_dataset=train_ds['test'],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training; the trailing semicolon suppresses the returned value in the cell output.
trainer.train();

Epoch,Training Loss,Validation Loss,Accuracy
1,0.9568,0.777878,0.655116
2,0.719,0.737985,0.688119
3,0.4004,0.916771,0.683168
4,0.2709,1.111816,0.697195


In [None]:
# Log in to the Hugging Face Hub; the next cell pushes the trained model.
import huggingface_hub

huggingface_hub.login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
# Upload the trained model to the Hugging Face Hub (requires the login above).
trainer.push_to_hub()

CommitInfo(commit_url='https://huggingface.co/vempaliakhil/my-dear-watson-nli-model/commit/c29bc653b9901e114fe4446b5401a1be5266f210', commit_message='End of training', commit_description='', oid='c29bc653b9901e114fe4446b5401a1be5266f210', pr_url=None, pr_revision=None, pr_num=None)

## Eval

In [None]:
# Load the pushed model from the Hub as a text-classification pipeline on the selected device.
# NOTE(review): the repo id is hard-coded; it must match the repo written by trainer.push_to_hub().
p = pipeline("text-classification", "vempaliakhil/my-dear-watson-nli-model", device=device)

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/568M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.26k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

In [None]:
# Sanity check: classify one example from the held-out split.
p(train_ds["test"][1]["input"])

[{'label': 'entailment', 'score': 0.998528242111206}]

In [None]:
# Number of test rows to predict.
len(test_ds)

5195

In [None]:
# Read the text column once; indexing test_ds["input"] inside the loop
# re-reads the whole column for every batch.
test_inputs = list(test_ds["input"])

predictions = []
for i in tqdm(range(0, len(test_inputs), bs)):
    # truncation=True mirrors the truncation tokfn applied to the training data.
    predictions.extend(p(test_inputs[i:i+bs], truncation=True))

100%|██████████| 325/325 [01:13<00:00,  4.42it/s]


In [None]:
# One predicted label name per test row, in row order.
predicted_labels = [pred["label"] for pred in predictions]

# Drop a stale column first so this cell can be re-run.
if "prediction" in test_ds.features:
    test_ds = test_ds.remove_columns(["prediction"])
test_ds = test_ds.add_column("prediction", predicted_labels)

In [None]:
# Replace each predicted label name with its id from label2comp_id.
test_ds = test_ds.map(lambda row: {"prediction": label2comp_id[row["prediction"]]})

Map:   0%|          | 0/5195 [00:00<?, ? examples/s]

In [None]:
# if not iskaggle:
#     push_notebook('vempaliakhil96', '03-exp',
#                   title='03-exp',
#                   file='03-exp.ipynb',
#                   competition=comp_name,
#                   private=False,
#                   gpu=True)