In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt

import transformers
from transformers import AutoModelForSequenceClassification,XLMRobertaTokenizer, get_scheduler
from datasets import load_dataset

import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# Use the first CUDA device when torch reports one is available, otherwise fall back to the CPU.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# Restrict transformers' logging to error-level messages.
transformers.logging.set_verbosity_error()

In [3]:
class CFG:
    """Hyper-parameters read by the tokenisation, data-loading and training cells."""

    # --- optimisation ---
    learning_rate, weight_decay = 1e-5, 0.01  # forwarded to AdamW
    epochs = 4                                # number of train/validate passes

    # --- data pipeline ---
    sequence_length = 128  # max_length given to the tokenizer (pad / truncate target)
    batch_size = 64        # batch size used by every DataLoader

In [4]:
# Read the competition train/test CSV files from the ../input directory.
train_data = pd.read_csv("../input/contradictory-my-dear-watson/train.csv")
test_data = pd.read_csv("../input/contradictory-my-dear-watson/test.csv")
# Display the first rows of the training frame as the cell output.
train_data.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [5]:
# Checkpoint identifier handed to from_pretrained for both the tokenizer and the model.
model_path = 'joeddav/xlm-roberta-large-xnli'
tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
# NOTE(review): this checkpoint presumably ships with an already-trained NLI head; confirm that its
# id2label order matches the 0/1/2 `label` column of train.csv before trusting un-fine-tuned output.
model = AutoModelForSequenceClassification.from_pretrained(model_path)



In [6]:
#PATH = '/kaggle/input/xlm-roberta-large-xnli-fine-tuned/pytorch/default/1/BERT_ft_epochmodel.model'
#model.load_state_dict(torch.load(PATH, weights_only=True))

In [7]:
class ContradictoryDataset(Dataset):
    """Pre-tokenised premise/hypothesis pairs, with labels when the frame provides them.

    The whole frame is tokenised once in ``__init__``; indexing afterwards only
    slices the resulting tensors.

    Args:
        df: DataFrame with ``premise`` and ``hypothesis`` columns and, optionally,
            a ``label`` column.
        tokenizer: Callable invoked as ``tokenizer(premises, hypotheses, **kwargs)``
            that returns a mapping containing ``input_ids`` and ``attention_mask``.
        max_length: Length every pair is padded/truncated to. Defaults to
            ``CFG.sequence_length`` when left as ``None``.

    Each item is ``(input_ids, attention_mask, label)`` when the frame has a
    ``label`` column, otherwise ``(input_ids, attention_mask)``.
    """

    def __init__(self, df, tokenizer, max_length=None):
        if max_length is None:
            # Resolved at call time so the notebook-wide setting stays the default.
            max_length = CFG.sequence_length

        self.df = df
        self.tokens = tokenizer(
            df['premise'].tolist(),
            df['hypothesis'].tolist(),
            max_length=max_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        # Always define the attribute so __getitem__ never has to re-inspect the frame.
        if 'label' in df.keys():
            self.labels = torch.tensor(df['label'].tolist())
        else:
            self.labels = None

    def __len__(self):
        return self.tokens["input_ids"].shape[0]

    def __getitem__(self, idx):
        item = (
            self.tokens["input_ids"][idx],
            self.tokens["attention_mask"][idx],
        )
        if self.labels is not None:
            item += (self.labels[idx],)
        return item

# https://huggingface.co/datasets/nyu-mll/multi_nli
def load_mnli_dataset():
    """Load the 'multi_nli' train split into a DataFrame.

    A row is kept only when both premise and hypothesis are truthy and the
    label is 0, 1 or 2. Every kept row gets ``lang_abv`` set to ``'en'``.

    Returns:
        DataFrame with columns ``premise``, ``hypothesis``, ``label``, ``lang_abv``.
    """
    train_split = load_dataset('multi_nli')['train']
    rows = [
        (example['premise'], example['hypothesis'], example['label'], 'en')
        for example in train_split
        if example['premise'] and example['hypothesis'] and example['label'] in {0, 1, 2}
    ]
    return pd.DataFrame(rows, columns=['premise', 'hypothesis', 'label', 'lang_abv'])

In [None]:
# loading additional multi-genre NLI dataset
mnli_dataset = load_mnli_dataset()

# train/validation split of the competition data (test_data was read from test.csv)
# NOTE: train_data is reassigned below, so re-running this cell without re-reading
# the CSV would split the already-augmented frame.
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=42)
# augment the training split with the first 100000 MNLI rows
train_data = pd.concat([train_data, mnli_dataset.iloc[:100000]], axis=0)

# creating datasets
train_dataset = ContradictoryDataset(train_data, tokenizer)
val_dataset = ContradictoryDataset(val_data, tokenizer)
test_dataset = ContradictoryDataset(test_data, tokenizer)

# creating dataloaders
# The training frame is the competition rows followed by the MNLI rows, so it has to be
# shuffled; otherwise each epoch sees the two sources as two separate, ordered runs.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=CFG.batch_size,
    shuffle=True,
    num_workers=0
)

# Evaluation loaders keep their order (test predictions are matched to test_data.id by position).
val_dataloader = DataLoader(
    val_dataset,
    batch_size=CFG.batch_size,
    num_workers=0
)

test_dataloader = DataLoader(
    test_dataset,
    batch_size=CFG.batch_size,
    num_workers=0
)

In [None]:
# NOTE(review): criterion is passed into train()/train_one_epoch()/validate(), but those functions
# take the loss from the model output (labels are supplied to the model), so it is never called.
criterion = nn.CrossEntropyLoss()
# AdamW over all model parameters, using the learning rate and weight decay from CFG.
optimizer = AdamW(model.parameters(), lr=CFG.learning_rate, weight_decay=CFG.weight_decay)

# Move the model to the selected device (also the cell's displayed output).
model.to(device)

In [None]:
def train_one_epoch(model, train_dataloader, criterion, optimizer, epoch):
    """Run one optimisation pass over ``train_dataloader``.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes the loss at index 0 and the logits at index 1.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        criterion: Accepted for interface compatibility; the loss used is the one
            returned by the model.
        optimizer: Optimizer stepping the model's parameters.
        epoch: Epoch number shown in the progress-bar description.

    Returns:
        Tuple ``(mean batch loss, mean per-batch weighted F1)`` over the epoch.
    """
    train_loss = 0.0
    train_f1 = 0.0
    train_progress_bar = tqdm(train_dataloader, desc="Epoch {:1d}".format(epoch), leave=False, disable=False)

    for i, batch in enumerate(train_progress_bar):
        optimizer.zero_grad()
        batch = tuple(b.to(device) for b in batch)

        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'labels': batch[2]
        }

        outputs = model(**inputs)
        loss = outputs[0]
        logits = outputs[1]

        # Gradients only exist after backward(); clipping has to sit between
        # backward() and step(), otherwise it acts on the freshly zeroed gradients.
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Batch-level metrics for the running progress display.
        y_pred = np.argmax(logits.detach().cpu().numpy(), axis=1)
        y_true = inputs['labels'].cpu().numpy()

        train_loss += loss.item()
        train_f1 += f1_score(y_true, y_pred, average='weighted')

        train_progress_bar.set_postfix({'train_loss': f'{(train_loss / (i + 1)):.4f}', 'train_f1': f'{(train_f1 / (i + 1)):.4f}'})

    return train_loss / len(train_dataloader), train_f1 / len(train_dataloader)


In [None]:
def validate(model, val_dataloader, criterion, epoch):
    """Evaluate ``model`` on ``val_dataloader`` without updating it.

    The model is put into eval mode for the duration of the pass and its
    previous train/eval mode is restored afterwards, so the result does not
    depend on the caller having called ``model.eval()`` first.

    Args:
        model: Model called as ``model(input_ids=..., attention_mask=..., labels=...)``
            whose output exposes the loss at index 0 and the logits at index 1.
        val_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        criterion: Accepted for interface compatibility; the loss used is the one
            returned by the model.
        epoch: Epoch number shown in the progress-bar description.

    Returns:
        Tuple ``(mean batch loss, mean per-batch weighted F1)``.
    """
    val_loss = 0.0
    val_f1 = 0.0
    val_progress_bar = tqdm(val_dataloader, desc="Epoch {:1d}".format(epoch), leave=False, disable=False)

    was_training = model.training
    model.eval()
    try:
        for i, batch in enumerate(val_progress_bar):
            batch = tuple(b.to(device) for b in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }

            with torch.no_grad():
                outputs = model(**inputs)
            loss = outputs[0]
            logits = outputs[1]

            y_pred = np.argmax(logits.detach().cpu().numpy(), axis=1)
            y_true = inputs['labels'].cpu().numpy()

            val_loss += loss.item()
            val_f1 += f1_score(y_true, y_pred, average='weighted')

            val_progress_bar.set_postfix({'val_loss': f'{(val_loss / (i + 1)):.4f}', 'val_f1': f'{(val_f1 / (i + 1)):.4f}'})
    finally:
        # Hand the model back in whatever mode the caller had it in.
        model.train(was_training)

    return val_loss / len(val_dataloader), val_f1 / len(val_dataloader)

In [None]:
def test(model, test_dataloader):
    """Predict a class index for every sample yielded by ``test_dataloader``.

    The model is switched to eval mode and run without gradient tracking. The
    first two tensors of each batch are used as ``input_ids`` and
    ``attention_mask``; index 0 of the model output is treated as the logits.

    Returns:
        List of argmax class indices, in loader order.
    """
    model.eval()
    predictions = []

    for raw_batch in tqdm(test_dataloader, desc="Test", leave=False, disable=False):
        on_device = tuple(tensor.to(device) for tensor in raw_batch)

        with torch.no_grad():
            outputs = model(input_ids=on_device[0], attention_mask=on_device[1])

        scores = outputs[0].detach().cpu().numpy()
        predictions.extend(np.argmax(scores, axis=1))

    return predictions

In [None]:
def train(model, train_dataloader, criterion, optimizer, epochs):
    """Alternate training and validation for ``epochs`` epochs.

    After every epoch the model's state dict is written to
    ``BERT_MNLI_epoch<N>.model``. Validation uses the module-level
    ``val_dataloader`` rather than a parameter.

    Returns:
        Dict with per-epoch lists under ``train_loss``, ``val_loss``,
        ``train_score`` and ``val_score``.
    """
    model.train()
    history = {
        'train_loss': [],
        'val_loss': [],
        'train_score': [],
        'val_score': [],
    }

    for epoch_no in range(1, epochs + 1):
        # optimisation pass
        model.train()
        epoch_loss, epoch_f1 = train_one_epoch(model, train_dataloader, criterion, optimizer, epoch_no)
        history['train_loss'].append(epoch_loss)
        history['train_score'].append(epoch_f1)

        # held-out evaluation
        model.eval()
        epoch_loss, epoch_f1 = validate(model, val_dataloader, criterion, epoch_no)
        history['val_loss'].append(epoch_loss)
        history['val_score'].append(epoch_f1)

        # one checkpoint per epoch
        torch.save(model.state_dict(), 'BERT_MNLI_epoch{}.model'.format(epoch_no))

    return history

In [None]:
# Fine-tune for CFG.epochs epochs; `history` holds the per-epoch loss and F1 lists
# for both the training and validation passes (a checkpoint is saved after each epoch).
history = train(model, train_dataloader, criterion, optimizer, CFG.epochs)

In [None]:
# Predict on the test set and write the id/prediction pairs to submission.csv.
y_preds = test(model, test_dataloader)
submission = test_data[['id']].assign(prediction=y_preds)
submission.to_csv("submission.csv", index=False)
submission.head()

In [None]:
# Rebuild the submission frame from the test predictions.
# The predictions are stored in `y_preds`; the name `preds` is never defined in
# this notebook and raised NameError on a fresh Restart & Run All.
submission = test_data.id.copy().to_frame()
submission['prediction'] = y_preds
submission.to_csv("submission.csv", index=False)
submission.head()