In [None]:
# Uncomment the next line to delete an existing checkout before cloning again.
# ! rm -rf PyTorch-Architectures/
# Clone the PyTorch-Architectures repository into the current working directory;
# a later cell changes into its modeling_xlm/ sub-directory.
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git

In [None]:
# Work from inside the cloned repo's modeling_xlm/ directory (presumably so the
# local `config_xlm` and `model` modules imported later can be found).
# The path is relative, so re-running this cell without restarting the kernel
# will fail because the working directory has already changed.
%cd PyTorch-Architectures/modeling_xlm/

In [None]:
! pip install datasets
! pip install transformers

In [None]:
from os import path
import time
import csv
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import XLMTokenizer
from config_xlm import XLMConfig
from model import XLMWithLMHeadModel
from datasets import load_dataset
from transformers import top_k_top_p_filtering

# Seed PyTorch's random number generators before anything stochastic runs
# (model initialisation, batch shuffling, multinomial sampling) so that repeated
# runs are repeatable as far as PyTorch's RNG is concerned.
SEED = 42
torch.manual_seed(SEED)

# CNN/DailyMail (config '3.0.0'); only the 'article' field of the 'train' and
# 'validation' splits is used further down.
dataset = load_dataset('cnn_dailymail', '3.0.0')

# Run on the first CUDA device when one is available, otherwise on the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [None]:
# Tokenizer: vocabulary/merges of the pretrained 'xlm-mlm-en-2048' checkpoint.
tokenizer = XLMTokenizer.from_pretrained('xlm-mlm-en-2048')

# Model: default XLMConfig with the number of layers overridden to 6.
# No pretrained weights are loaded in this cell; the model is built from the
# config alone and then moved to the selected device.
config = XLMConfig()
config.n_layers = 6
model = XLMWithLMHeadModel(config)
model = model.to(device)

In [None]:
# Number of scalar entries in every parameter tensor of the model, summed up.
param_sizes = [weights.numel() for weights in model.parameters()]
total_params = sum(param_sizes)
print("Total Parameters = ", total_params)

In [None]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes every text once, at construction time.

    Args:
        texts: iterable of raw strings to encode.
        tokenizer: callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length', truncation=True)``
            and expected to return a mapping with ``'input_ids'`` and
            ``'attention_mask'`` entries.
        seq_length: length every example is padded / truncated to.

    Each item is a dict with two ``torch.long`` tensors:
        ``'ids'``  -- the token ids,
        ``'mask'`` -- the attention mask returned by the tokenizer.
    """

    def __init__(self, texts, tokenizer, seq_length=64):
        self.tokenizer = tokenizer
        self.texts = texts
        self.seq_length = seq_length
        self.train_list = []
        self.build()

    def __len__(self):
        """Number of encoded examples."""
        return len(self.train_list)

    def __getitem__(self, index):
        """Return the ``index``-th example as ``{'ids': ..., 'mask': ...}``."""
        encoded = self.train_list[index]
        return {
            'ids': torch.tensor(encoded['input_ids'], dtype=torch.long),
            # The mask must come from the tokenizer's attention mask; building
            # it from the token ids would make it a copy of 'ids'.
            'mask': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }

    def build(self):
        """(Re)encode ``self.texts`` into ``self.train_list``.

        Uses the tokenizer passed to the constructor rather than a global
        name, and starts from an empty list so that calling ``build`` again
        does not append duplicates.
        """
        self.train_list = [
            self.tokenizer(
                text,
                max_length=self.seq_length,
                padding='max_length',  # replaces the deprecated pad_to_max_length=True
                truncation=True,
            )
            for text in self.texts
        ]

In [None]:
# Uncomment to delete the cached *.pt dataset files and force re-tokenization in the next cell.
# ! rm -rf *.pt

In [None]:
# Re-use the tokenized datasets from a previous run when both cache files exist;
# otherwise tokenize from scratch and write the caches.
# The cache does not record the slice sizes or sequence lengths used to build
# it, so delete the *.pt files after changing any of those settings.
if path.exists('train_dataset.pt') and path.exists('valid_dataset.pt'):
  print("Datasets is already present!")
  # The files hold whole pickled CustomDataset objects, not plain tensors, so
  # they cannot be read with the restricted `weights_only=True` loader that is
  # the default in recent PyTorch releases. They are written by this notebook
  # itself; never load *.pt files from an untrusted source this way.
  train_dataset = torch.load('train_dataset.pt', weights_only=False)
  valid_dataset = torch.load('valid_dataset.pt', weights_only=False)

else:
  # First 10000 training and first 1000 validation articles.
  texts_train = dataset['train']['article'][:10000]

  texts_valid = dataset['validation']['article'][:1000]

  start_time = time.time()
  train_dataset = CustomDataset(texts_train, tokenizer, seq_length=256)
  # NOTE(review): the validation set falls back to CustomDataset's default
  # seq_length (64) while training uses 256 -- confirm this is intentional.
  valid_dataset = CustomDataset(texts_valid, tokenizer)
  torch.save(train_dataset, 'train_dataset.pt')
  torch.save(valid_dataset, 'valid_dataset.pt')
  print("Dataset Conversion Done!!")
  # Reported in minutes.
  print("Time Taken = ", (time.time() - start_time)/60)

In [None]:
# Training hyper-parameters.
BATCH_SIZE = 16
LR = 2e-04
EPOCHS = 15

# Only the training batches are shuffled; validation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
print("Length of Train DataLoader: ", len(train_loader))
print("Length of Valid DataLoader: ", len(valid_loader))

# Adam over every parameter of the model.
optimizer = torch.optim.Adam(params=model.parameters(), lr=LR)

In [None]:
def compute_loss(model, data_loader, device):
  """Return the mean of the per-batch losses of `model` over `data_loader`.

  The model is switched to eval mode (and left in it) and gradients are
  disabled while iterating.

  Args:
    model: callable accepting `input_ids`, `attention_mask` and `labels`
      keyword arguments and returning a sequence whose first element is the
      loss tensor.
    data_loader: iterable of batches, each a mapping with 'ids' and 'mask'
      tensors. It does not need to support `len()`.
    device: device the batch tensors are moved to before the forward pass.

  Returns:
    The average batch loss as a float, or NaN when `data_loader` yields no
    batches (instead of failing with a ZeroDivisionError).
  """
  model.eval()
  total_loss = 0.0
  num_batches = 0
  with torch.no_grad():
    for sample in data_loader:
      ids = sample['ids'].to(device)
      mask = sample['mask'].to(device)
      # The input ids are also passed as the labels.
      labels = sample['ids'].to(device)

      outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
      total_loss += outputs[0].item()
      # Count batches while iterating so loaders without __len__ work too.
      num_batches += 1
  if num_batches == 0:
    return float('nan')
  return total_loss / num_batches

# Train for EPOCHS epochs; after each epoch report the average loss on the
# training and validation loaders plus the time elapsed since training began.
start_time = time.time()
for epoch in range(EPOCHS):
  model.train()
  for idx, sample in enumerate(train_loader):
    ids = sample['ids'].to(device)
    mask = sample['mask'].to(device)
    # The input ids are also passed as the labels.
    # NOTE(review): padded positions are part of `labels`; whether the model
    # excludes them from the loss is not visible here -- confirm.
    labels = sample['ids'].to(device)

    optimizer.zero_grad()

    # With labels supplied, the first element of the model output is the loss.
    outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
    loss = outputs[0]

    # LOGGING
    if idx % 100 == 0:
      print("Batch: %04d/%04d || Epoch: %03d/%03d" % (idx, len(train_loader), epoch+1, EPOCHS))

    loss.backward()
    optimizer.step()

  # compute_loss switches the model to eval mode and disables gradients
  # itself; model.train() at the top of the loop re-enables training mode.
  train_loss = compute_loss(model, train_loader, device)
  valid_loss = compute_loss(model, valid_loader, device)
  print("Train Loss: %.3f" % (train_loss))
  print("Valid Loss: %.3f" % (valid_loss))
  # Minutes since training started (cumulative, not per-epoch).
  elapsed_epoch_time = (time.time() - start_time) / 60
  print("Epoch Elapsed Time: %d mins" % (elapsed_epoch_time))
total_training_time = (time.time() - start_time) / 60
# Report the total that was just computed; the previous code printed the stale
# per-epoch value (and would fail with a NameError when EPOCHS is 0).
print("Total Training Time: %d mins" % (total_training_time))

In [None]:
# Testing the language model: sample ONE next token for a short prompt.
# text = "he is"
text = "What is"
# NOTE(review): tokenizer.encode may add special tokens around the prompt, in
# which case the prediction below is taken after the final special token --
# confirm that this is intended.
input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
# All-ones attention mask with the same integer dtype and device as the ids,
# matching the long masks the model receives during training.
mask = torch.ones_like(input_ids)
# Column positions of mask tokens in the prompt; not used further in this cell.
mask_token_index = torch.where(input_ids == tokenizer.mask_token_id)[1]
model.eval()
with torch.no_grad():

  # No labels are passed here, so the first output element is used as the logits.
  outputs = model(input_ids=input_ids, attention_mask=mask)
  logits = outputs[0]
  # Keep only the scores at the last position of the sequence.
  logits = logits[:, -1, :]
  # Restrict sampling to the 5 highest-scoring tokens (top_p=1.0 disables
  # nucleus filtering), then draw one token from the renormalised distribution.
  filtered_next_token_logits = top_k_top_p_filtering(logits, top_k=5, top_p=1.0)
  probs = F.softmax(filtered_next_token_logits, dim=-1)
  next_token = torch.multinomial(probs, num_samples=1)
  generated = torch.cat([input_ids, next_token], dim=-1)

  resulting_string = tokenizer.decode(generated.tolist()[0], skip_special_tokens=True)
  print(resulting_string)

In [None]:
# Final Notebook --> For the causal language-modeling objective, the model still needs a lot more training.