In [2]:
import torch
import pandas as pd
import numpy as np
import math
import os
import json
import random
from transformers import XLMRobertaTokenizer, XLMRobertaForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModel, AutoTokenizer

In [6]:
MODEL_NAME_OR_PATH = "markussagen/xlm-roberta-longformer-base-4096"

tokenizer = XLMRobertaTokenizer.from_pretrained(MODEL_NAME_OR_PATH)



In [7]:
model = XLMRobertaForSequenceClassification.from_pretrained(
    MODEL_NAME_OR_PATH, 
    num_labels = 5,
)

Downloading:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Some weights of the model checkpoint at markussagen/xlm-roberta-longformer-base-4096 were not used when initializing XLMRobertaForSequenceClassification: ['roberta.encoder.layer.7.attention.self.query_global.bias', 'roberta.encoder.layer.2.attention.self.value_global.weight', 'roberta.encoder.layer.8.attention.self.key_global.weight', 'roberta.encoder.layer.2.attention.self.key_global.bias', 'roberta.encoder.layer.2.attention.self.query_global.bias', 'roberta.encoder.layer.0.attention.self.query_global.weight', 'roberta.encoder.layer.9.attention.self.value_global.weight', 'roberta.encoder.layer.10.attention.self.key_global.bias', 'roberta.encoder.layer.4.attention.self.value_global.weight', 'roberta.encoder.layer.7.attention.self.query_global.weight', 'roberta.encoder.layer.2.attention.self.query_global.weight', 'roberta.encoder.layer.5.attention.self.value_global.bias', 'lm_head.layer_norm.bias', 'roberta.encoder.layer.5.attention.self.key_global.weight', 'roberta.encoder.layer.2.atte

In [8]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [11]:
df_train = pd.read_csv(r'C:\Users\youss\Downloads\Train_data.csv')
df_test = pd.read_csv(r'C:\Users\youss\Downloads\Test_data.csv')

In [12]:
df_train['Overall'] = df_train['Overall'].astype(int)

In [13]:
L_RATE = 5e-6
MAX_LEN = 4096

NUM_EPOCHS = 2
BATCH_SIZE = 4

In [14]:
class CustomTextDataset(Dataset):
    def __init__(self, txt1, txt2, labels):
        self.labels = labels
        self.text1 = txt1
        self.text2 = txt2
    def __len__(self):
        return len(self.labels)
    def __getitem__(self, idx):
        label = self.labels[idx]
        text1 = self.text1[idx]
        text2 = self.text2[idx]
        encoded_dict = tokenizer.encode_plus(
            str(text1), str(text2),
            add_special_tokens = True,
            max_length = MAX_LEN,     
            pad_to_max_length = True,
            return_attention_mask = True,   
            return_tensors = 'pt' # return pytorch tensors
       )
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]
        target = torch.tensor(label, dtype=torch.long)

        sample = {"Padded Token List": padded_token_list, "Attention Mask": att_mask, "Class": target}
        return sample

In [15]:
class CustomTestDataset(Dataset):
    def __init__(self, txt1, txt2):
        self.text1 = txt1
        self.text2 = txt2
    def __len__(self):
        return len(self.text1)
    def __getitem__(self, idx):
        text1 = self.text1[idx]
        text2 = self.text2[idx]
        encoded_dict = tokenizer.encode_plus(
            str(text1), str(text2),
            add_special_tokens = True,
            max_length = MAX_LEN,     
            pad_to_max_length = True,
            return_attention_mask = True,   
            return_tensors = 'pt' # return pytorch tensors
       )
        padded_token_list = encoded_dict['input_ids'][0]
        att_mask = encoded_dict['attention_mask'][0]

        sample = {"Padded Token List": padded_token_list, "Attention Mask": att_mask}
        return sample

In [16]:
TrainData = CustomTextDataset(df_train['text1'], df_train['text2'], df_train['Overall'])
TestData = CustomTestDataset(df_test['text1'], df_test['text2'])

In [17]:
DL_DS = DataLoader(TrainData, batch_size=BATCH_SIZE, shuffle=True)
DL_DS1 = DataLoader(TestData, batch_size=BATCH_SIZE, shuffle=False)

In [18]:
model.to(device)

XLMRobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(250002, 768, padding_idx=1)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (L

In [19]:
batch = next(iter(DL_DS))

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [20]:
b_input_ids = batch['Padded Token List'].to(device)
b_input_mask = batch['Attention Mask'].to(device)
b_labels = batch['Class'].to(device)

Padded Token List torch.Size([4, 4096])
Attention Mask torch.Size([4, 4096])
Class torch.Size([4])


In [21]:
optimizer = torch.optim.AdamW(model.parameters(),
              lr = L_RATE, 
              eps = 1e-8 
            )

In [22]:
%%time


# Set the seed.
seed_val = 101

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Store the average loss after each epoch so we can plot them.
loss_values = []


# For each epoch...
for epoch in range(0, NUM_EPOCHS):
    
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch + 1, NUM_EPOCHS))
    

    stacked_val_labels = []
    targets_list = []

    # ========================================
    #               Training
    # ========================================
    
    print('Training...')
    
    # put the model into train mode
    model.train()
    
    # This turns gradient calculations on and off.
    torch.set_grad_enabled(True)


    # Reset the total loss for this epoch.
    total_train_loss = 0

    for i, batch in enumerate(DL_DS):
        
        train_status = 'Batch ' + str(i) + ' of ' + str(len(DL_DS))
        
        print(train_status, end='\r')


        b_input_ids = batch['Padded Token List'].to(device)
        b_input_mask = batch['Attention Mask'].to(device)
        b_labels = batch['Class'].to(device)

        model.zero_grad()        


        outputs = model(b_input_ids, 
                    attention_mask=b_input_mask,
                    labels=b_labels)
        
        # Get the loss from the outputs tuple: (loss, logits)
        loss = outputs[0]
        
        # Convert the loss from a torch tensor to a number.
        # Calculate the total loss.
        total_train_loss = total_train_loss + loss.item()
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        
        
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        
        
        
        # Use the optimizer to update the weights.
        
        
        optimizer.step() 
        
    
    print('Train loss:' ,total_train_loss)

    # Save the Model
    torch.save(model.state_dict(), 'model_long.pt')
    


Training...
Batch 0 of 1241

RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:81] data. DefaultCPUAllocator: not enough memory: you tried to allocate 3221225472 bytes.

In [None]:
for j, batch in enumerate(DL_DS1):
        
        inference_status = 'Batch ' + str(j+1) + ' of ' + str(len(DL_DS1))
        
        print(inference_status, end='\r')

        b_input_ids = batch['Padded Token List'].to(device)
        b_input_mask = batch['Attention Mask'].to(device)


        outputs = model(b_input_ids, 
                attention_mask=b_input_mask)
        
        
        # Get the preds
        preds = outputs[0]


        # Move preds to the CPU
        preds = preds.detach().cpu().numpy()
        
        # Stack the predictions.

        if j == 0:  # first batch
            stacked_preds = preds

        else:
            stacked_preds = np.vstack((stacked_preds, preds))

In [None]:
preds = np.argmax(stacked_preds, axis=1)

In [None]:
new = pd.DataFrame()
new['pair_id'] = df_test['pair_id']
new['Overall'] = preds.tolist()

In [None]:
new.to_csv('xlmroberta_submission.csv', index=False)