<a href="https://colab.research.google.com/github/vempaliakhil96/kaggle-entailment-competition/blob/main/03-exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install fastkaggle fastai pandas fastcore tqdm datasets transformers[torch] accelerate --quiet

In [None]:
from fastkaggle import *
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from fastai.text.all import *
from tqdm import tqdm
import pandas as pd
import datasets
from transformers import TrainingArguments,Trainer
import torch
from torch.utils.data import DataLoader
import os
from google.colab import userdata

# Copy the Kaggle credentials returned by google.colab's `userdata` into the
# environment variables of the same name.
for _secret_name in ("KAGGLE_KEY", "KAGGLE_USERNAME"):
    os.environ[_secret_name] = userdata.get(_secret_name)

# Enable tqdm's pandas integration.
tqdm.pandas()


In [None]:
# When `iskaggle` is falsy, obtain the API handle via import_kaggle().
if not iskaggle:
    api = import_kaggle()

In [None]:
# Competition identifier; passed to setup_comp() and push_notebook() further down.
comp_name = "contradictory-my-dear-watson"

In [None]:
# Data location returned by setup_comp(); train.csv and test.csv are read from it later.
dpath = setup_comp(comp_name)

In [None]:
# Show what is under dpath (presumably the downloaded competition files — confirm).
dpath.ls()

In [None]:
# Choose the compute device: CUDA first, then Apple MPS, otherwise the CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Checkpoint name shared by the tokenizer and the 3-label classifier.
mname = 'cross-encoder/nli-distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(mname)
model = AutoModelForSequenceClassification.from_pretrained(mname, num_labels=3)
model = model.to(device)


In [None]:
def baseline_prediction(premise, hypothesis):
    """Classify a single premise/hypothesis pair with the module-level model.

    Args:
        premise: Premise text.
        hypothesis: Hypothesis text.

    Returns:
        One of 'contradiction', 'entailment' or 'neutral', chosen by the
        arg-max over the model's logits.
    """
    # Order in which the logit positions are interpreted as label names.
    label_mapping = ['contradiction', 'entailment', 'neutral']
    toks = tokenizer([premise], [hypothesis], return_tensors='pt', padding=True, truncation=True).to(device)
    # Pure inference: disable autograd so no computation graph is built or kept.
    with torch.no_grad():
        scores = model(**toks).logits
    return label_mapping[scores.argmax(dim=1)[0].item()]

def bulk_baseline_predict(premises, hypotheses, bs=8):
    """Classify many premise/hypothesis pairs in mini-batches.

    Args:
        premises: Iterable of premise texts.
        hypotheses: Iterable of hypothesis texts, aligned with `premises`.
        bs: Number of pairs sent through the model per forward pass.

    Returns:
        A list with one label name ('contradiction', 'entailment' or
        'neutral') per input pair, in input order. Empty input gives [].

    Raises:
        ValueError: If `bs` is smaller than 1.
    """
    if bs < 1:
        raise ValueError("bs must be a positive integer")

    # Order in which the logit positions are interpreted as label names.
    label_mapping = ['contradiction', 'entailment', 'neutral']
    # Materialise as plain lists so that slices are lists of str regardless of
    # the container type that was passed in (e.g. a pandas Series).
    premises, hypotheses = list(premises), list(hypotheses)

    labels = []
    # Pure inference: disable autograd so no computation graph is built or kept.
    with torch.no_grad():
        for i in tqdm(range(0, len(premises), bs)):
            toks = tokenizer(premises[i:i+bs], hypotheses[i:i+bs], return_tensors='pt', padding=True, truncation=True).to(device)
            scores = model(**toks).logits
            labels.extend(label_mapping[idx] for idx in scores.argmax(dim=1).tolist())
    return labels

In [None]:
# Label id used in the competition data -> class name, plus the inverse lookup.
comp_id2label = dict(enumerate(("entailment", "neutral", "contradiction")))
label2comp_id = {name: idx for idx, name in comp_id2label.items()}

In [None]:
# Read train.csv and test.csv from the data directory.
train_df, test_df = (pd.read_csv(dpath/fname) for fname in ("train.csv", "test.csv"))

def _process_text(text):
    """Pass `text` through spec_add_spaces, rm_useless_spaces and fix_html, in that order."""
    spaced = spec_add_spaces(text)
    squeezed = rm_useless_spaces(spaced)
    return fix_html(squeezed)

def tokfn(x):
    """Tokenize the "input" field of `x` with the module-level tokenizer.

    Truncation is switched on, as in the baseline prediction helpers, so that
    an over-long input is cut to the tokenizer's maximum length instead of
    being passed on at full length.

    Args:
        x: Mapping with an "input" entry holding a string or a batch of strings.

    Returns:
        Whatever the tokenizer returns for that input.
    """
    return tokenizer(x["input"], truncation=True)

# Clean the text columns of both frames and build the single-string model
# input from each premise/hypothesis pair.
for _df in (train_df, test_df):
    for _col in ("premise", "hypothesis"):
        _df[_col] = _df[_col].apply(_process_text)
    _df["input"] = "premise: " + _df.premise + "\nhypothesis: " + _df.hypothesis

# Competition label id -> label name -> the model's own class index.
train_df.label = train_df.label.map(comp_id2label).map(model.config.label2id)
# Series.map yields NaN for keys it cannot find; fail here rather than later
# inside the training loop.
if train_df.label.isna().any():
    raise ValueError("some training labels could not be mapped to a model class id")

train_ds = datasets.Dataset.from_pandas(train_df[["input", "label"]])
test_ds = datasets.Dataset.from_pandas(test_df[["id", "input"]])
# Fixed seed so the 90/10 train/validation split is identical on every run.
train_ds = train_ds.train_test_split(test_size=0.1, seed=42)

In [None]:
# Apply tokfn to train_ds (which holds the 'train' and 'test' splits used by
# the Trainer below); batched=True passes batches of rows to tokfn.
train_ds = train_ds.map(tokfn, batched=True)

In [None]:
# Hyperparameters consumed by the TrainingArguments cell.
bs = 8  # per-device training batch size (evaluation uses bs*2)
epochs = 4  # number of training epochs
lr = 8e-5  # learning rate

In [None]:
# Training configuration: cosine schedule with 10% warm-up, evaluation once per
# epoch, no experiment-tracker reporting.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir='outputs',
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    per_device_eval_batch_size=bs*2,
    learning_rate=lr,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
    fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
    evaluation_strategy="epoch",
    report_to='none',
)


In [None]:
def _accuracy(eval_pred):
    """Compute classification accuracy for one evaluation pass.

    Args:
        eval_pred: Object exposing `predictions` (logits, or a tuple whose
            first element is the logits) and `label_ids`.

    Returns:
        A dict {"accuracy": float} with the fraction of arg-max predictions
        that equal the reference labels.
    """
    logits, labels = eval_pred.predictions, eval_pred.label_ids
    if isinstance(logits, tuple):
        logits = logits[0]
    return {"accuracy": float((logits.argmax(axis=-1) == labels).mean())}


# compute_metrics must be a callable that receives the evaluation predictions
# and returns a dict of metrics; a bare string such as "accuracy" is not
# callable and fails on the first evaluation pass.
trainer = Trainer(model, args,
                  train_dataset=train_ds['train'],
                  eval_dataset=train_ds['test'],
                  tokenizer=tokenizer,
                  compute_metrics=_accuracy)

In [None]:
# Run fine-tuning; the trailing semicolon keeps the notebook from echoing the return value.
trainer.train();

In [None]:
# When `iskaggle` is falsy, push this notebook via push_notebook() as a public,
# GPU-enabled notebook attached to the competition.
if not iskaggle:
    push_notebook(
        'vempaliakhil96', '03-exp',
        file='03-exp.ipynb',
        title='03-exp',
        competition=comp_name,
        gpu=True,
        private=False,
    )