<a href="https://colab.research.google.com/github/zhestyatsky/bachelor-degree-research/blob/main/src/xlm_roberta_r-bert_like.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%writefile requirements.txt
pytorch-lightning
scikit-learn
transformers
sentencepiece

Overwriting requirements.txt


In [2]:
!pip install --upgrade -r requirements.txt

Requirement already up-to-date: pytorch-lightning in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 1)) (1.1.4)
Requirement already up-to-date: scikit-learn in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 2)) (0.24.0)
Requirement already up-to-date: transformers in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 3)) (4.2.1)
Requirement already up-to-date: sentencepiece in /usr/local/lib/python3.6/dist-packages (from -r requirements.txt (line 4)) (0.1.95)


In [3]:
!mkdir data && git clone https://github.com/SapienzaNLP/mcl-wic.git data && cd data && unzip 'SemEval-2021_MCL-WiC_all-datasets.zip' && rm *.zip && ls

mkdir: cannot create directory ‘data’: File exists


In [4]:
import json
import pandas as pd

def read_train_data():
    with open('data/MCL-WiC/training/training.en-en.data') as f:
        df_train = pd.DataFrame(json.load(f))
    with open('data/MCL-WiC/training/training.en-en.gold') as f:
        df_train = df_train.merge(pd.DataFrame(json.load(f)))

    df_train["tag"] = df_train["tag"].replace({"T": 1, "F": 0})
    
    return df_train


df_train = read_train_data()
print(df_train.shape)
df_train.head()

(8000, 10)


Unnamed: 0,id,lemma,pos,sentence1,sentence2,start1,end1,start2,end2,tag
0,training.en-en.0,play,NOUN,In that context of coordination and integratio...,A musical play on the same subject was also st...,69,73,10,14,0
1,training.en-en.1,play,NOUN,In that context of coordination and integratio...,"In schools, when water is needed, it is girls ...",69,73,112,116,0
2,training.en-en.2,esteem,NOUN,We would also like to convey our esteem and co...,"Father Lini said that, because of that, the Un...",33,39,106,112,1
3,training.en-en.3,esteem,NOUN,We would also like to convey our esteem and co...,This attests to the esteem and trust enjoyed b...,33,39,20,26,1
4,training.en-en.4,holder,NOUN,This growth is the direct result of the increa...,A person may be either the holder of an option...,74,81,27,33,1


In [5]:
import glob


def read_dev_english_data():
    df_dev = pd.DataFrame()
    for file in glob.glob('data/MCL-WiC/dev/multilingual/dev.en-en.data'):
        with open(file, 'r') as f:
          df_dev = df_dev.append(pd.DataFrame(json.load(f)))

    df_dev_tags = pd.DataFrame()
    for file in glob.glob('data/MCL-WiC/dev/multilingual/dev.en-en.gold'):
        with open(file, 'r') as f:
            df_dev_tags = df_dev_tags.append(pd.DataFrame(json.load(f)))

    df_dev = df_dev.merge(df_dev_tags)
    df_dev["tag"] = df_dev["tag"].replace({"T": 1, "F": 0})
    
    return df_dev

df_dev = read_dev_english_data()
print(df_dev.shape)
df_dev.head()

(1000, 10)


Unnamed: 0,id,lemma,pos,sentence1,sentence2,start1,end1,start2,end2,tag
0,dev.en-en.0,superior,NOUN,No clause in a contract shall be interpreted a...,While fully aware that bishops and major super...,78,87,41,50,0
1,dev.en-en.1,superior,NOUN,No clause in a contract shall be interpreted a...,"In Senegal too, the customs officer and his su...",78,87,44,53,1
2,dev.en-en.2,acquaintance,NOUN,Such acquaintance is a right and not an obliga...,The complaints tend to be lodged against acqua...,5,17,41,54,0
3,dev.en-en.3,acquaintance,NOUN,Such acquaintance is a right and not an obliga...,Sexual violence by non-partners refers to viol...,5,17,74,86,0
4,dev.en-en.4,baggage,NOUN,Where any baggage of any passenger contains fi...,In my baggage I had a Hungarian grammar book a...,10,17,6,13,1


In [6]:
from sklearn.model_selection import train_test_split

df_val, df_test = train_test_split(df_dev, test_size=0.2, random_state=1)
print(df_val.shape, df_test.shape)
print(df_val.head()) 
print(df_test.head())

(800, 10) (200, 10)
                id      lemma   pos  ... start2 end2 tag
382  dev.en-en.382       fish  NOUN  ...     28   32   1
994  dev.en-en.994      crash  NOUN  ...     47   52   0
982  dev.en-en.982     filler  NOUN  ...     28   35   0
47    dev.en-en.47  liquidate  VERB  ...     68   78   1
521  dev.en-en.521       roll  NOUN  ...     69   73   1

[5 rows x 10 columns]
                id     lemma   pos  ... start2 end2 tag
507  dev.en-en.507  Georgian   ADJ  ...     15   23   0
818  dev.en-en.818    strain  VERB  ...      0    6   0
452  dev.en-en.452  inclined   ADJ  ...    133  141   1
368  dev.en-en.368   venture  NOUN  ...     39   47   0
242  dev.en-en.242       gag  VERB  ...     62   68   0

[5 rows x 10 columns]


In [7]:
train_labels = df_train["tag"].tolist()
val_labels = df_val["tag"].tolist()
test_labels = df_test["tag"].tolist()

train_sentences = [(s1, s2) for s1, s2 in zip(df_train["sentence1"], df_train["sentence2"])]
val_sentences = [(s1, s2) for s1, s2 in zip(df_val["sentence1"], df_val["sentence2"])]
test_sentences = [(s1, s2) for s1, s2 in zip(df_test["sentence1"], df_test["sentence2"])]

train_word_ranges = [((int(s1), int(e1)), (int(s2), int(e2))) for s1, e1, s2, e2 in zip(df_train["start1"], df_train["end1"], df_train["start2"], df_train["end2"])]
val_word_ranges = [((int(s1), int(e1)), (int(s2), int(e2))) for s1, e1, s2, e2 in zip(df_val["start1"], df_val["end1"], df_val["start2"], df_val["end2"])]
test_word_ranges = [((int(s1), int(e1)), (int(s2), int(e2))) for s1, e1, s2, e2 in zip(df_test["start1"], df_test["end1"], df_test["start2"], df_test["end2"])]

print(train_sentences[0])
print(train_word_ranges[0])

('In that context of coordination and integration, Bolivia holds a key play in any process of infrastructure development.', 'A musical play on the same subject was also staged in Kathmandu for three days.')
((69, 73), (10, 14))


In [8]:
import torch
from torch.utils.data import Dataset, RandomSampler, DataLoader
from transformers import XLMRobertaTokenizerFast

INDICES_PADDING_LEN = 6
INDICES_PADDING_VALUE = 0

class XLMRDataset(Dataset):
    def __init__(self, sentences, word_ranges, labels, max_tokens):
        self.sentences = sentences
        self.word_ranges = word_ranges
        self.labels = labels
        self.tokenizer = XLMRobertaTokenizerFast.from_pretrained("xlm-roberta-large")
        self.max_tokens = max_tokens
    
    def _tokenize(self, sentence):
        return self.tokenizer(sentence,
                              add_special_tokens=True,
                              max_length=self.max_tokens,
                              padding="max_length",
                              truncation=True,
                              return_offsets_mapping=True)
        
    def _get_input_ids_indices_for_word(self, offset_mapping, word_start, word_end):
        indices = []
        for idx, (start, end) in enumerate(offset_mapping):
            if start != end and word_start <= start and end <= word_end:
                indices.append(idx)
            elif word_start < start:
                break

        indices.extend([INDICES_PADDING_VALUE for i in range(INDICES_PADDING_LEN - len(indices))])
        return torch.tensor(indices)
    
    def __len__(self):
        return len(self.sentences)
    
    def __getitem__(self, index):
        first_sentence, second_sentence = self.sentences[index]
        (first_word_start, first_word_end), (second_word_start, second_word_end) = self.word_ranges[index]

        first_input = self._tokenize(first_sentence)
        second_input = self._tokenize(second_sentence)
        
        input_ids = (torch.tensor(first_input["input_ids"]), torch.tensor(second_input["input_ids"]))
        attention_masks = (torch.tensor(first_input["attention_mask"]), torch.tensor(second_input["attention_mask"]))

        first_word_ids_indices = self._get_input_ids_indices_for_word(first_input["offset_mapping"], first_word_start, first_word_end)
        second_word_ids_indices = self._get_input_ids_indices_for_word(second_input["offset_mapping"], second_word_start, second_word_end)

        word_ids_indices = (first_word_ids_indices, second_word_ids_indices)

        return input_ids, attention_masks, word_ids_indices, torch.tensor(self.labels[index], dtype=torch.float)

MAX_TOKENS = 118
BATCH_SIZE = 8
EPOCHS = 4

xlm_train_data = XLMRDataset(train_sentences, train_word_ranges, train_labels, MAX_TOKENS)
xlm_train_sampler = RandomSampler(xlm_train_data)
xlm_train_loader = DataLoader(xlm_train_data, batch_size=BATCH_SIZE, sampler=xlm_train_sampler)

xlm_val_data = XLMRDataset(val_sentences, val_word_ranges, val_labels, MAX_TOKENS)
xlm_val_loader = DataLoader(xlm_val_data, batch_size=BATCH_SIZE)

xlm_test_data = XLMRDataset(test_sentences, test_word_ranges, test_labels, MAX_TOKENS)
xlm_test_loader = DataLoader(xlm_test_data, batch_size=BATCH_SIZE)

TOTAL_STEPS = (len(xlm_train_data) // BATCH_SIZE + 1) * EPOCHS

In [9]:
import os
import torch
import random
import numpy as np


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(42)

In [10]:
#def get_max_tokens(dataset):
#    tokens = 0
#    for item in dataset:
#        attention_masks = item[1]
#        tokens = max(tokens, attention_masks[0].tolist().index(0), attention_masks[1].tolist().index(0))
#    return tokens


#def get_max_offset_mappings(dataset):
#    mappings = 0
#    for item in dataset:
#        word_ids_indices = item[2]
#        mappings = max(mappings, word_ids_indices[0].tolist().index(INDICES_PADDING_VALUE, 1), word_ids_indices[1].tolist().index(INDICES_PADDING_VALUE, 1))
#    return mappings


#print(get_max_tokens(xlm_train_data), get_max_tokens(xlm_val_data), get_max_tokens(xlm_test_data))
#print(get_max_offset_mappings(xlm_train_data), get_max_offset_mappings(xlm_val_data), get_max_offset_mappings(xlm_test_data))

In [11]:
from pytorch_lightning import LightningModule
from pytorch_lightning import Trainer
from pytorch_lightning.metrics import Accuracy
from pytorch_lightning.callbacks import EarlyStopping  
from transformers import XLMRobertaModel, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score 
from torch import nn


def batched_index_select(t, dim, inds):
    dummy = inds.unsqueeze(2).expand(inds.size(0), inds.size(1), t.size(2))
    out = t.gather(dim, dummy) # b x e x f
    return out


def get_mask(indices, embedding_size):
    mask = (indices != INDICES_PADDING_VALUE)
    mask.unsqueeze_(-1)
    mask = mask.expand(mask.shape[0], mask.shape[1], embedding_size)
    LARGE_VALUE = 2**32
    return torch.where(mask == True, 0, LARGE_VALUE)


def get_tokens_embeddings(batch, indices):
    return batched_index_select(batch, 1, indices) - get_mask(indices, batch.shape[2])


class XLMRClassifier(LightningModule):
    def __init__(self, model_path="xlm-roberta-large"):
        super(XLMRClassifier, self).__init__()

        self.model = XLMRobertaModel.from_pretrained(model_path)
        self.embedding_dim = self.model.get_input_embeddings().embedding_dim

        self.first_cls_linear = nn.Linear(self.embedding_dim, self.embedding_dim)
        self.second_cls_linear = nn.Linear(self.embedding_dim, self.embedding_dim)

        self.words_linear = nn.Linear(self.embedding_dim, self.embedding_dim)
        self.final_linear = nn.Linear(4*self.embedding_dim, 1)

        self.dropout = nn.Dropout(p=0.1)
        self.sigmoid = nn.Sigmoid()
        self.loss = nn.BCELoss()

        self.save_hyperparameters() 
        self.valid_accuracy = Accuracy()
        self.test_accuracy = Accuracy()

    def forward(self, input_ids, attention_mask, word_indices):
        first_sentence_outputs = self.model(input_ids[0], attention_mask[0]).last_hidden_state
        second_sentence_outputs = self.model(input_ids[1], attention_mask[1]).last_hidden_state

        first_tokens_embeddings = get_tokens_embeddings(first_sentence_outputs, word_indices[0])
        second_tokens_embeddings = get_tokens_embeddings(second_sentence_outputs, word_indices[1])

        first_word_embedding = torch.max(first_tokens_embeddings, 1)[0]
        second_word_embedding = torch.max(second_tokens_embeddings, 1)[0]

        first_cls_embedding = first_sentence_outputs[:, 0, :]
        second_cls_embedding = second_sentence_outputs[:, 0, :]

        first_word_outputs = self.words_linear(self.dropout(torch.tanh(first_word_embedding)))
        second_word_outputs = self.words_linear(self.dropout(torch.tanh(second_word_embedding)))

        first_cls_outputs = self.first_cls_linear(self.dropout(torch.tanh(first_cls_embedding)))
        second_cls_outputs = self.second_cls_linear(self.dropout(torch.tanh(second_cls_embedding)))

        embeddings = torch.cat((first_word_outputs, second_word_outputs, first_cls_outputs, second_cls_outputs), 1)

        outputs = self.sigmoid(self.final_linear(self.dropout(torch.tanh(embeddings)))).view(-1)
        return outputs

    def training_step(self, batch, _):
        inputs, attn, word_indices, labels = batch
        outputs = self(inputs, attn, word_indices)
        return self.loss(outputs, labels)
    
    def validation_step(self, batch, _):
        inputs, attn, word_indices,labels = batch
        outputs = self(inputs, attn, word_indices)
        logits = (outputs > 0.5).float()

        self.valid_accuracy.update(logits, labels)
        self.log("val_acc", self.valid_accuracy)

        loss = self.loss(outputs, labels)
        self.log("val_loss", loss, prog_bar=True)

    def validation_epoch_end(self, outs):
        self.log("val_acc_epoch", self.valid_accuracy.compute(), prog_bar=True)

    def test_step(self, batch, _):
        inputs, attn, word_indices, labels = batch
        outputs = self(inputs, attn, word_indices)
        logits = (outputs > 0.5).float()

        self.test_accuracy.update(logits, labels)
        self.log("test_acc", self.test_accuracy)

        loss = self.loss(outputs, labels)
        self.log("test_loss", loss, prog_bar=True)

    def test_epoch_end(self, outs):
        self.log("test_acc_epoch", self.test_accuracy.compute(), prog_bar=True)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=1e-5)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=TOTAL_STEPS)
        return [optimizer], [scheduler]


clf = XLMRClassifier()

early_stop_callback = EarlyStopping(
    monitor="val_loss",
    min_delta=0.0,
    patience=8,
    verbose=True,
    mode="min" 
)

trainer = Trainer(
    gpus=1,
    checkpoint_callback=False,
    accumulate_grad_batches=10,
    max_epochs=EPOCHS,
    callbacks=[early_stop_callback],
    val_check_interval=0.5)

trainer.fit(clf, xlm_train_loader, xlm_val_loader)

GPU available: True, used: True
TPU available: None, using: 0 TPU cores
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name              | Type            | Params
------------------------------------------------------
0 | model             | XLMRobertaModel | 559 M 
1 | first_cls_linear  | Linear          | 1.0 M 
2 | second_cls_linear | Linear          | 1.0 M 
3 | words_linear      | Linear          | 1.0 M 
4 | final_linear      | Linear          | 4.1 K 
5 | dropout           | Dropout         | 0     
6 | sigmoid           | Sigmoid         | 0     
7 | loss              | BCELoss         | 0     
8 | valid_accuracy    | Accuracy        | 0     
9 | test_accuracy     | Accuracy        | 0     
------------------------------------------------------
563 M     Trainable params
0         Non-trainable params
563 M     Total params


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validation sanity check', layout=Layout…



HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Training', layout=Layout(flex='2'), max…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Validating', layout=Layout(flex='2'), m…




1

In [12]:
trainer.test(clf, xlm_test_loader)

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Testing', layout=Layout(flex='2'), max=…


--------------------------------------------------------------------------------
DATALOADER:0 TEST RESULTS
{'test_acc': tensor(0.5200, device='cuda:0'),
 'test_acc_epoch': tensor(0.5200, device='cuda:0'),
 'test_loss': tensor(0.6930, device='cuda:0')}
--------------------------------------------------------------------------------


[{'test_acc': 0.5199999809265137,
  'test_acc_epoch': 0.5199999809265137,
  'test_loss': 0.6930343508720398}]

In [None]:
while True:
    x = 0