In [1]:
!pip install datasets



In [2]:
!unzip contradictory-my-dear-watson.zip -d .

Archive:  contradictory-my-dear-watson.zip
replace ./contradictory-my-dear-watson/sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: 

In [4]:
import numpy as np
import pandas as pd
import transformers
from transformers import AutoModelForSequenceClassification, AutoTokenizer, BertTokenizer, XLMRobertaTokenizer

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from tqdm import tqdm

transformers.logging.set_verbosity_error()

In [5]:
class CFG:
    batch_size = 32
    sequence_length = 128
    epochs = 10

In [6]:
train_data = pd.read_csv('contradictory-my-dear-watson/train.csv')
train_data.head()

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
2,3931fbe82a,Des petites choses comme celles-là font une di...,J'essayais d'accomplir quelque chose.,fr,French,0
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
4,86aaa48b45,ในการเล่นบทบาทสมมุติก็เช่นกัน โอกาสที่จะได้แสด...,เด็กสามารถเห็นได้ว่าชาติพันธุ์แตกต่างกันอย่างไร,th,Thai,1


In [7]:
model_path = 'joeddav/xlm-roberta-large-xnli'
tokenizer = XLMRobertaTokenizer.from_pretrained(model_path)
encoder = AutoModelForSequenceClassification.from_pretrained(model_path)

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
class SequenceDataset(Dataset):
    def __init__(self, df, tokenizer):
        self.tokens = tokenizer(
            df['premise'].tolist(),
            df['hypothesis'].tolist(),
            max_length = CFG.sequence_length,
            truncation=True,
            padding='max_length',
            add_special_tokens=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        self.labels = torch.tensor(df['label'])


    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.tokens["input_ids"][idx],
            "attention_mask": self.tokens["attention_mask"][idx],
            "label": self.labels[idx]
        }


In [9]:
dataset = SequenceDataset(train_data, tokenizer)

In [10]:
train_dataloader = DataLoader(
    dataset,
    batch_size=CFG.batch_size,
    num_workers=0
)

In [12]:
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(encoder.parameters(), lr=1e-5)
encoder.to(device)

epochs = CFG.epochs

encoder.train()
for epoch in tqdm(range(1, epochs + 1)):
  train_loss = 0.0
  progress_bar = tqdm(train_dataloader, desc="Epoch {:1d}".format(epoch), leave=False, disable=False)

  for batch in progress_bar:
      optimizer.zero_grad()
      input_ids = batch['input_ids'].to(device)
      attention_mask = batch['attention_mask'].to(device)
      labels = batch['label'].to(device)



      outputs = encoder(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
      loss = outputs[0]
      train_loss += loss.item()

      torch.nn.utils.clip_grad_norm_(encoder.parameters(), 1.0)
      loss.backward()
      optimizer.step()

      progress_bar.set_postfix({'training_loss': '{:3f}'.format(loss.item() / len(batch))})

  train_loss_avg = train_loss / len(train_dataloader)
  tqdm.write(f'\nEpoch {epoch}')
  tqdm.write(f'Training loss: {train_loss_avg}')


  0%|          | 0/10 [00:00<?, ?it/s]
Epoch 1:   0%|          | 0/379 [00:00<?, ?it/s][A
Epoch 1:   0%|          | 0/379 [00:02<?, ?it/s, training_loss=1.867973][A
Epoch 1:   0%|          | 1/379 [00:02<17:02,  2.71s/it, training_loss=1.867973][A
Epoch 1:   0%|          | 1/379 [00:04<17:02,  2.71s/it, training_loss=1.686862][A
Epoch 1:   1%|          | 2/379 [00:04<14:12,  2.26s/it, training_loss=1.686862][A
Epoch 1:   1%|          | 2/379 [00:06<14:12,  2.26s/it, training_loss=1.240240][A
Epoch 1:   1%|          | 3/379 [00:06<13:20,  2.13s/it, training_loss=1.240240][A
Epoch 1:   1%|          | 3/379 [00:08<13:20,  2.13s/it, training_loss=0.977687][A
Epoch 1:   1%|          | 4/379 [00:08<12:55,  2.07s/it, training_loss=0.977687][A
Epoch 1:   1%|          | 4/379 [00:10<12:55,  2.07s/it, training_loss=0.841028][A
Epoch 1:   1%|▏         | 5/379 [00:10<12:40,  2.03s/it, training_loss=0.841028][A
Epoch 1:   1%|▏         | 5/379 [00:12<12:40,  2.03s/it, training_loss=0.7767

KeyboardInterrupt: 