<a href="https://colab.research.google.com/github/vempaliakhil96/kaggle-entailment-competition/blob/main/03-exp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [None]:
! pip install fastkaggle fastai pandas fastcore tqdm datasets transformers[torch] accelerate evaluate --quiet

zsh:1: no matches found: transformers[torch]


In [None]:
from fastkaggle import *
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from fastai.text.all import *
from tqdm import tqdm
import pandas as pd
import numpy as np
import evaluate
import datasets
from transformers import TrainingArguments,Trainer
import torch
from torch.utils.data import DataLoader
import os
# On Colab, copy the Kaggle credentials from the Colab secret store into the
# process environment. Outside Colab `google.colab` is not importable, and
# that is the only failure that is ignored silently.
try:
    from google.colab import userdata
except ImportError:
    pass  # not running on Colab; credentials must come from elsewhere
else:
    for _secret in ("KAGGLE_KEY", "KAGGLE_USERNAME"):
        try:
            os.environ[_secret] = userdata.get(_secret)
        except Exception as err:
            # Still best-effort (a missing secret must not stop the notebook),
            # but say so instead of hiding the problem.
            print(f"Could not load Colab secret {_secret}: {err!r}")

# Enable tqdm's progress-bar integration for pandas.
tqdm.pandas()


In [None]:
# Only needed when running outside Kaggle: keep the object returned by
# import_kaggle() around as `api` (presumably the Kaggle API client; confirm).
if not iskaggle:
    api = import_kaggle()

In [None]:
# Kaggle competition identifier; passed to setup_comp() and push_notebook() below.
comp_name = "contradictory-my-dear-watson"

In [None]:
# Directory holding the competition files (train.csv, test.csv, ...).
# NOTE(review): setup_comp presumably downloads the data when it is missing; confirm.
dpath = setup_comp(comp_name)

In [None]:
# Show what is in the competition directory.
dpath.ls()

(#4) [Path('contradictory-my-dear-watson/test.csv'),Path('contradictory-my-dear-watson/models'),Path('contradictory-my-dear-watson/train.csv'),Path('contradictory-my-dear-watson/sample_submission.csv')]

In [None]:
# Pick the best available accelerator: CUDA, then Apple MPS, then CPU.
# CUDA is probed first and the MPS backend is looked up defensively, because
# `torch.backends.mps` does not exist on older PyTorch builds and an
# unconditional access would raise AttributeError there.
_mps_backend = getattr(torch.backends, "mps", None)
if torch.cuda.is_available():
    device = "cuda"
elif _mps_backend is not None and _mps_backend.is_available():
    device = "mps"
else:
    device = "cpu"
# Checkpoint to load. 'cross-encoder/nli-distilroberta-base' used to be
# assigned to `mname` first and was overwritten straight away, so only the
# value that actually takes effect is kept.
mname = 'microsoft/deberta-v3-small'
# Sequence-classification model with a 3-class head, moved to the selected device.
model = AutoModelForSequenceClassification.from_pretrained(mname, num_labels=3).to(device)
tokenizer = AutoTokenizer.from_pretrained(mname)


Downloading config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/286M [00:00<?, ?B/s]

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['pooler.dense.bias', 'classifier.bias', 'pooler.dense.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



In [None]:
def baseline_prediction(premise, hypothesis):
    """Classify one premise/hypothesis pair with the notebook-level ``model``.

    Uses the global ``tokenizer``, ``model`` and ``device``.

    Args:
        premise: Premise text.
        hypothesis: Hypothesis text.

    Returns:
        ``'contradiction'``, ``'entailment'`` or ``'neutral'``, chosen by the
        arg-max over the model's logits.
    """
    # Logit index -> label name.
    # NOTE(review): this order differs from `comp_id2label` (entailment,
    # neutral, contradiction) defined further down; confirm it matches the
    # checkpoint that is being evaluated.
    label_mapping = ['contradiction', 'entailment', 'neutral']
    toks = tokenizer([premise], [hypothesis], return_tensors='pt', padding=True, truncation=True).to(device)
    # Pure inference: do not record an autograd graph for the forward pass.
    with torch.no_grad():
        scores = model(**toks).logits
    return label_mapping[scores.argmax(dim=1)[0].item()]

def bulk_baseline_predict(premises, hypotheses, bs=8):
    """Classify many premise/hypothesis pairs in mini-batches.

    Uses the global ``tokenizer``, ``model`` and ``device`` and shows a tqdm
    progress bar over the batches.

    Args:
        premises: Sliceable sequence of premise texts.
        hypotheses: Sliceable sequence of hypothesis texts, same length as
            ``premises``.
        bs: Number of pairs per forward pass (defaults to 8).

    Returns:
        A list with one label (``'contradiction'``, ``'entailment'`` or
        ``'neutral'``) per input pair, in input order.

    Raises:
        ValueError: If the two inputs differ in length.
    """
    if len(premises) != len(hypotheses):
        raise ValueError(
            f"premises and hypotheses differ in length: {len(premises)} != {len(hypotheses)}"
        )
    # Logit index -> label name.
    # NOTE(review): this order differs from `comp_id2label` (entailment,
    # neutral, contradiction) defined further down; confirm it matches the
    # checkpoint that is being evaluated.
    label_mapping = ['contradiction', 'entailment', 'neutral']
    labels = []
    for i in tqdm(range(0, len(premises), bs)):
        # list(...) so that tuples or pandas Series slices reach the tokenizer
        # as plain lists.
        premise_batch = list(premises[i:i+bs])
        hypothesis_batch = list(hypotheses[i:i+bs])
        toks = tokenizer(premise_batch, hypothesis_batch, return_tensors='pt', padding=True, truncation=True).to(device)
        # Pure inference: do not record an autograd graph for the forward pass.
        with torch.no_grad():
            scores = model(**toks).logits
        labels.extend(label_mapping[idx] for idx in scores.argmax(dim=1).tolist())
    return labels

In [None]:
# Integer label id -> label name (ids are the list positions: 0, 1, 2),
# plus the inverse lookup. Both are also stored on the model config.
comp_id2label = dict(enumerate(["entailment", "neutral", "contradiction"]))
label2comp_id = {name: idx for idx, name in comp_id2label.items()}
model.config.label2id = label2comp_id
model.config.id2label = comp_id2label

In [None]:
# Upper bound on sequence length used by the tokenizer when truncating.
tokenizer.model_max_length = 512
# Batch size for the test-set prediction loop further down.
bs = 32

## Data Processing

In [None]:
train_df = pd.read_csv(dpath/"train.csv")
test_df = pd.read_csv(dpath/"test.csv")

def _process_text(text):
    """Clean one text: ``spec_add_spaces``, then ``rm_useless_spaces``, then ``fix_html``."""
    return fix_html(rm_useless_spaces(spec_add_spaces(text)))

def tokfn(x):
    """Tokenize the ``input`` field of a dataset example or batch, with truncation."""
    return tokenizer(x["input"], truncation=True)

def _add_model_input(df):
    """Clean ``premise``/``hypothesis`` in place and add the combined ``input`` column.

    Returns the same DataFrame so that the call can be used in an assignment.
    """
    for col in ("premise", "hypothesis"):
        df[col] = df[col].apply(_process_text)
    df["input"] = "premise: " + df.premise + "\nhypothesis: " + df.hypothesis
    return df

train_df = _add_model_input(train_df)
test_df = _add_model_input(test_df)

# Route the labels through the name mapping so that they follow whatever ids
# the model config uses.
train_df.label = train_df.label.map(comp_id2label).map(model.config.label2id)
# Series.map turns unknown values into NaN; fail loudly instead of training on them.
assert train_df.label.notna().all(), "train.csv contains labels missing from comp_id2label / model.config.label2id"

train_ds = datasets.Dataset.from_pandas(train_df[["input", "label"]])
test_ds = datasets.Dataset.from_pandas(test_df[["id", "input"]])
# Fixed seed so that the 90/10 train/validation split is identical on every run.
train_ds = train_ds.train_test_split(test_size=0.1, seed=42)

In [None]:
# Tokenize every split produced by train_test_split; batched=True passes
# tokfn a batch of examples per call instead of a single example.
train_ds = train_ds.map(tokfn, batched=True)

Map:   0%|          | 0/10908 [00:00<?, ? examples/s]

Map:   0%|          | 0/1212 [00:00<?, ? examples/s]

## Eval

In [None]:
# Text-classification pipeline around the "vempaliakhil/my-dear-watson-nli-model"
# checkpoint, reusing the tokenizer and device chosen above.
# NOTE(review): presumably this is the fine-tuned model from a previous run; confirm.
p = pipeline("text-classification", "vempaliakhil/my-dear-watson-nli-model", tokenizer=tokenizer, device=device)

In [None]:
# Smoke test: classify one example from the held-out "test" split.
p(train_ds["test"][5]["input"])

[{'label': 'contradiction', 'score': 0.990056037902832}]

In [None]:
# Look at one competition test example (id + combined input text).
test_ds[5]

{'id': 'aa2510d454',
 'input': 'premise: His family had lost a son and a daughter now.\nhypothesis: The son and daughter had lost their father.'}

In [None]:
predictions = []
# Read the column once, up front: indexing `test_ds["input"]` inside the loop
# would go back to the dataset for the whole column on every batch.
test_inputs = list(test_ds["input"])
for i in tqdm(range(0, len(test_inputs), bs)):
    predictions.extend(p(test_inputs[i:i+bs]))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 163/163 [06:35<00:00,  2.42s/it]


In [None]:
# Remove a column left over from an earlier run so the cell can be executed again.
if "prediction" in test_ds.features:
    test_ds = test_ds.remove_columns(["prediction"])

# One predicted label name per test row, in prediction order.
predicted_labels = [pred["label"] for pred in predictions]
test_ds = test_ds.add_column("prediction", predicted_labels)

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [None]:
def _to_comp_id(example):
    """Replace the predicted label name in ``example`` with its integer id.

    Values that are already ints are passed through unchanged, so running the
    cell a second time does not fail with KeyError. Unknown label names still
    raise KeyError.
    """
    pred = example["prediction"]
    if isinstance(pred, int):
        return dict(prediction=pred)
    return dict(prediction=label2comp_id[pred])

test_ds = test_ds.map(_to_comp_id)

Map:   0%|          | 0/5195 [00:00<?, ? examples/s]

In [None]:
# Write the submission file: one row per test id with its predicted label id.
test_df = test_ds.to_pandas()
test_df[["id", "prediction"]].to_csv("submission.csv", index=False)
# Preview as the last expression of the cell so that it is actually displayed.
test_df.head()

In [None]:
# When running outside Kaggle, hand the inference notebook to push_notebook()
# with the competition attached.
if not iskaggle:
    push_options = dict(
        title='03-exp-inf',
        file='03-exp-inf.ipynb',
        competition=comp_name,
        private=False,
        gpu=True,
    )
    push_notebook('vempaliakhil96', '03-exp-inf', **push_options)