<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_xlm/test_sample_XLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Show which GPU (if any) is attached to this runtime, plus driver and memory info.
! nvidia-smi

In [None]:
# Clone the PyTorch-Architectures repository.
# NOTE(review): presumably this is what provides the local `config_xlm` and
# `model` modules imported later in the notebook — confirm.
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git

In [None]:
# Switch the working directory to the XLM sub-package of the cloned repository.
# The path is relative, so this only works from the directory the clone ran in.
%cd PyTorch-Architectures/modeling_xlm/

In [None]:
! pip install datasets
! pip install transformers

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import XLMTokenizer
from config_xlm import XLMConfig
from model import XLMForSequenceClassification
from datasets import load_dataset

# Load the 'rotten_tomatoes' dataset; its 'train' and 'validation' splits
# are consumed by the following cells.
dataset = load_dataset('rotten_tomatoes')

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [6]:
# For each split, collect the 'text' field and then the 'label' field of every
# example into two parallel Python lists (same order as the split itself).
train_texts, train_labels = (
    [example[field] for example in dataset['train']]
    for field in ('text', 'label')
)

valid_texts, valid_labels = (
    [example[field] for example in dataset['validation']]
    for field in ('text', 'label')
)

In [None]:
# Tokenizer: vocabulary/merges come from the 'xlm-mlm-en-2048' checkpoint.
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

# Model: constructed from a default XLMConfig (not via from_pretrained),
# then placed on the selected device.
config = XLMConfig()
model = XLMForSequenceClassification(config)
model = model.to(device)

In [8]:
# Add up the element count of every parameter tensor registered on the model.
total_params = 0
for param in model.parameters():
    total_params += param.numel()
print("Total Parameters = ", total_params)

Total Parameters =  106974210


In [11]:
class CustomDataset(Dataset):
  """Map-style dataset of pre-tokenized (text, label) pairs.

  Every text is tokenized once, at construction time, truncated/padded to
  ``seq_length`` and cached in ``self.train_texts``; ``__getitem__`` only
  converts the cached lists to tensors.

  Args:
    tokenizer: Callable invoked as
      ``tokenizer(text, max_length=..., truncation=True, padding='max_length')``
      that returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
    texts: Iterable of raw strings.
    labels: Iterable of labels, paired with ``texts`` by position
      (``zip`` stops at the shorter of the two).
    seq_length: Length every example is truncated/padded to.
  """

  def __init__(self, tokenizer, texts, labels, seq_length=64):
    self.tokenizer = tokenizer
    self.texts = texts
    self.labels = labels
    self.seq_length = seq_length
    self.train_texts = []
    self.build()

  def __len__(self):
    """Return the number of cached, tokenized examples."""
    return len(self.train_texts)

  def __getitem__(self, idx):
    """Return example ``idx`` as a dict of ``torch.long`` tensors.

    Keys: ``'ids'`` (token ids), ``'mask'`` (attention mask) and
    ``'tgt'`` (label).
    """
    example = self.train_texts[idx]
    return {
        'ids': torch.tensor(example['input_ids'], dtype=torch.long),
        # The mask must come from the tokenizer's attention mask; it was
        # previously built from the token ids by mistake.
        'mask': torch.tensor(example['attention_mask'], dtype=torch.long),
        'tgt': torch.tensor(example['labels'], dtype=torch.long),
    }

  def build(self):
    """Tokenize every (text, label) pair and (re)populate ``self.train_texts``.

    Uses the tokenizer passed to the constructor rather than a module-level
    global, and rebuilds the cache from scratch so that calling ``build()``
    again does not duplicate examples.
    """
    examples = []
    for text, label in zip(self.texts, self.labels):
      tokens = self.tokenizer(
          text,
          max_length=self.seq_length,
          truncation=True,
          padding='max_length',
      )
      examples.append({
          'input_ids': tokens['input_ids'],
          'attention_mask': tokens['attention_mask'],
          'labels': label,
      })
    self.train_texts = examples

In [12]:
# Tokenize both splits up front (seq_length is left at the class default).
train_dataset = CustomDataset(
    tokenizer=tokenizer,
    texts=train_texts,
    labels=train_labels,
)
valid_dataset = CustomDataset(
    tokenizer=tokenizer,
    texts=valid_texts,
    labels=valid_labels,
)

In [18]:
# Hyper-parameters for the run.
BATCH_SIZE, LR, EPOCHS = 8, 0.001, 5

# Adam over every model parameter.
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# Only the training loader shuffles; both use 4 worker processes.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4,
)
valid_loader = DataLoader(
    valid_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)

# Number of batches per epoch for each loader.
print("Length of Train DataLoader: ", len(train_loader))
print("Length of Valid DataLoader: ", len(valid_loader))

Length of Train DataLoader:  1067
Length of Valid DataLoader:  134


In [20]:
# Smoke test of a single training step: both loops `break` after the first
# batch, so exactly one optimizer update runs and one loss value is printed.
for epoch in range(EPOCHS):
  model.train()
  for idx, sample in enumerate(train_loader):
    # Batches are dicts with the keys produced by CustomDataset.__getitem__
    # ('ids', 'mask', 'tgt'); move each tensor onto the target device.
    ids = sample['ids'].to(device)
    mask = sample['mask'].to(device)
    labels = sample['tgt'].to(device)
    
    # Clear gradients left over from any previous step before the new backward pass.
    optimizer.zero_grad()
    # NOTE(review): despite its name, `logits` holds the model's full return
    # value and element 0 is used as the loss. The model is defined in the
    # project-local `model` module — presumably it returns (loss, ...) when
    # `labels` is passed; confirm.
    logits = model(input_ids=ids, attention_mask=mask, labels=labels)
    loss = logits[0]
    print(loss.item())

    loss.backward()
    optimizer.step()
    break  # stop after the first batch
  break  # stop after the first epoch

0.6931471824645996
