In [None]:
! pip install fastkaggle fastai pandas fastcore tqdm --quiet

In [None]:
from fastkaggle import *
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from fastai.text.all import *
from tqdm import tqdm
import pandas as pd
import torch
from torch.utils.data import DataLoader

# Hook tqdm into pandas (per tqdm's docs this adds the `progress_apply` family).
# NOTE(review): no visible cell calls `progress_apply` — confirm it is still needed.
tqdm.pandas()

In [None]:
# Outside Kaggle, keep the object returned by fastkaggle's `import_kaggle()`.
if not iskaggle:
    api = import_kaggle()

In [None]:
# Competition slug; passed to `setup_comp` and `push_notebook` in later cells.
comp_name = "contradictory-my-dear-watson"

In [None]:
# Location of the competition data, as returned by fastkaggle's `setup_comp`.
dpath = setup_comp(comp_name)

In [None]:
# Show what is inside the competition folder.
dpath.ls()

(#4) [Path('contradictory-my-dear-watson/test.csv'),Path('contradictory-my-dear-watson/models'),Path('contradictory-my-dear-watson/train.csv'),Path('contradictory-my-dear-watson/sample_submission.csv')]

In [None]:
# Pick the accelerator: CUDA wins over Apple MPS, CPU is the fallback.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Pretrained checkpoint used for the baseline predictions in this notebook.
mname = 'cross-encoder/nli-distilroberta-base'
tokenizer = AutoTokenizer.from_pretrained(mname)
model = AutoModelForSequenceClassification.from_pretrained(mname).to(device)


In [None]:
def baseline_prediction(premise, hypothesis):
    """Classify one premise/hypothesis pair with the notebook's NLI model.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        premise: Premise sentence.
        hypothesis: Hypothesis sentence to judge against the premise.

    Returns:
        The label name at the argmax logit: 'contradiction', 'entailment'
        or 'neutral'.
    """
    # Logit index i is read as label_mapping[i].
    label_mapping = ['contradiction', 'entailment', 'neutral']
    toks = tokenizer([premise], [hypothesis], return_tensors='pt', padding=True, truncation=True).to(device)
    # Inference only: disable autograd so no computation graph is built or kept.
    with torch.no_grad():
        scores = model(**toks).logits
    # Single pair in, so only row 0 exists; `.item()` yields a plain int index.
    return label_mapping[scores.argmax(dim=1)[0].item()]

def bulk_baseline_predict(premises, hypotheses, bs=8):
    """Classify many premise/hypothesis pairs in mini-batches.

    Relies on the notebook-level `tokenizer`, `model` and `device`.

    Args:
        premises: List of premise sentences.
        hypotheses: List of hypothesis sentences, aligned with `premises`.
        bs: Number of pairs per forward pass (default 8).

    Returns:
        A list with one label name per pair, in input order; each is
        'contradiction', 'entailment' or 'neutral'.
    """
    # Logit index i is read as label_mapping[i]; built once, not per batch.
    label_mapping = ['contradiction', 'entailment', 'neutral']
    labels = []
    for i in tqdm(range(0, len(premises), bs)):
        toks = tokenizer(premises[i:i+bs], hypotheses[i:i+bs], return_tensors='pt', padding=True, truncation=True).to(device)
        # Inference only: without this every batch would build an autograd graph.
        with torch.no_grad():
            scores = model(**toks).logits
        # `.tolist()` moves the argmax indices to plain Python ints in one go.
        labels.extend(label_mapping[idx] for idx in scores.argmax(dim=1).tolist())
    return labels

In [None]:
# Competition label ids -> names; the position in the tuple is the id (0, 1, 2).
comp_id2label = dict(enumerate(("entailment", "neutral", "contradiction")))
# Reverse lookup: name -> competition label id.
label2comp_id = dict(zip(comp_id2label.values(), comp_id2label.keys()))

In [None]:
train_df = pd.read_csv(dpath/"train.csv")
test_df = pd.read_csv(dpath/"test.csv")

def _process_text(text):
    """Clean one sentence with fastai's text rules and return the result.

    `fix_html` must run first: it matches literal fragments such as `#39;`
    and `<br />`, which `spec_add_spaces` would break apart by padding `#`
    and `/` with spaces. `rm_useless_spaces` runs last so it also collapses
    any whitespace the earlier rules introduce.
    """
    text = fix_html(text)
    text = spec_add_spaces(text)
    text = rm_useless_spaces(text)
    return text

# Apply the same clean-up to both text columns of both frames.
for frame in (train_df, test_df):
    for col in ("premise", "hypothesis"):
        frame[col] = frame[col].apply(_process_text)

# Replace the integer competition labels with names so they can be compared with `_label`.
train_df["label"] = train_df["label"].map(comp_id2label)
# Baseline predictions for the training set (about 14 minutes in the recorded run).
train_df["_label"] = bulk_baseline_predict(list(train_df.premise), list(train_df.hypothesis))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1515/1515 [13:56<00:00,  1.81it/s]


In [None]:
# Peek at the first two training rows, including the baseline `_label` column.
train_df.head(2)

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label,_label
0,5130fd2cb5,and these comments were considered in formulating the interim rules.,The rules developed in the interim were put together with these comments in mind.,en,English,entailment,entailment
1,5b72532a0b,"These are issues that we wrestle with in practice groups of law firms, she said.",Practice groups are not permitted to work on these issues.,en,English,contradiction,contradiction


In [None]:
# Peek at the first two test rows.
test_df.head(2)

Unnamed: 0,id,premise,hypothesis,lang_abv,language
0,c6d58c3f69,بکس، کیسی، راہیل، یسعیاہ، کیلی، کیلی، اور کولمبین ہائی اسکول کے دوسرے طلبا کے نام سے بکسوں کو نشان زد کیا جائے گا جس نے اس سال پہلے اپنی زندگی کھو دی,"کیسی کے لئے کوئی یادگار نہیں ہوگا, کولمین ہائی اسکول کے طالب علموں میں سے ایک جو مر گیا.",ur,Urdu
1,cefcc82292,هذا هو ما تم نصحنا به.,عندما يتم إخبارهم بما يجب عليهم فعله ، فشلت الإدارة في السماح لنا بالدخول إلى الأسرار التجارية.,ar,Arabic


In [None]:
# Baseline predictions for the test set: one label name per row.
test_premises = test_df["premise"].tolist()
test_hypotheses = test_df["hypothesis"].tolist()
test_df["_label"] = bulk_baseline_predict(test_premises, test_hypotheses)

100%|██████████| 650/650 [00:21<00:00, 29.79it/s]


In [None]:
# Convert the predicted label names back into the competition's integer ids.
test_df["prediction"] = test_df["_label"].map(label2comp_id)

In [None]:
# Write the submission file with just the `id` and `prediction` columns (no index column).
test_df[["id", "prediction"]].to_csv("submission.csv", index=False)

In [None]:
# Outside Kaggle, push the notebook file `02-exp.ipynb` to the competition
# via fastkaggle's `push_notebook`.
if not iskaggle:
    push_notebook(
        'vempaliakhil96', 'exp-2',
        title='exp-2',
        file='02-exp.ipynb',
        competition=comp_name,
        private=False,
        gpu=True,
    )

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label,_label
0,5130fd2cb5,and these comments were considered in formulating the interim rules.,The rules developed in the interim were put together with these comments in mind.,en,English,entailment,entailment
1,5b72532a0b,"These are issues that we wrestle with in practice groups of law firms, she said.",Practice groups are not permitted to work on these issues.,en,English,contradiction,contradiction
2,3931fbe82a,Des petites choses comme celles-là font une différence énorme dans ce que j'essaye de faire.,J'essayais d'accomplir quelque chose.,fr,French,entailment,entailment
3,5622f0c60b,you know they can't really defend themselves like somebody grown uh say my age you know yeah,They can't defend themselves because of their age.,en,English,entailment,entailment
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสดงออกและได้เล่นหลายบทบาทไปพร้อมกัน ๆ อาจช่วยให้เด็กจับความคล้ายคลึงและความแตกต่างระหว่างผู้คนในด้านความปรารถนา ความเชื่อ และความรู้สึกได้,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,neutral,entailment
...,...,...,...,...,...,...,...
12115,2b78e2a914,"The results of even the most well designed epidemiological studies are characterized by this type of uncertainty, though well-designed studies typically report narrower uncertainty bounds around the best estimate than do studies of lesser quality.",All studies have the same amount of uncertainty to them.,en,English,contradiction,contradiction
12116,7e9943d152,"But there are two kinds of the pleasure of doing, and the pleasure of not doing; the pleasure of indulging, and the pleasure of abstinence.","But there are two kinds of the pleasure of doing, and the pleasure of not doing.",en,English,entailment,entailment
12117,5085923e6c,The important thing is to realize that it's way past time to move it.,"It cannot be moved, now or ever.",en,English,contradiction,contradiction
12118,fc8e2fd1fe,At the west end is a detailed model of the whole temple complex.,The model temple complex is at the east end.,en,English,contradiction,contradiction
