<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_openai/test_sample_OpenAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the repository; presumably it provides the local `model`, `utils` and
# `config_openai` modules imported further down (verify against the repo layout).
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git

In [7]:
# Switch the working directory to the cloned model folder. The path is relative,
# so run this once per fresh kernel; a second run from inside the folder would fail.
%cd PyTorch-Architectures/modeling_openai/

/content/PyTorch-Architectures/modeling_openai


In [None]:
# Third-party dependencies used by the cells below.
# `%pip` (instead of `!pip`) installs into the environment of the running kernel,
# whereas `!pip` runs whichever `pip` the subshell finds first on PATH.
# TODO: pin the versions these results were produced with, for reproducibility.
%pip install transformers
%pip install ftfy
%pip install spacy
%pip install datasets

In [21]:
import time
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import OpenAIGPTTokenizer
from model import OpenAIGPTLMHeadModel
from utils import Conv1D
from config_openai import OpenAIGPTConfig
# Compute device: the first CUDA GPU when one is visible, otherwise the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

# Model configuration built with the defaults of the local `config_openai` module.
config = OpenAIGPTConfig()

# Tokenizer loaded from the "openai-gpt" pretrained files.
tokenizer = OpenAIGPTTokenizer.from_pretrained("openai-gpt")
# pad_token is not set by default
tokenizer.pad_token = '[PAD]'

In [9]:
from datasets import load_dataset

# CNN/DailyMail corpus, configuration '3.0.0'. The cells below read the
# 'article' column of its 'train' and 'validation' splits.
dataset = load_dataset(path='cnn_dailymail', name='3.0.0')

Reusing dataset cnn_dailymail (/root/.cache/huggingface/datasets/cnn_dailymail/3.0.0/3.0.0/0128610a44e10f25b4af6689441c72af86205282d26399642f7db38fa7535602)


In [10]:
##############################################
# PyTorch Dataset
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes every text once, at construction time.

    Each text is passed to ``tokenizer`` with ``padding='max_length'`` and
    ``truncation=True`` so that all encodings are requested at the same length.

    Args:
        texts: Sequence of strings to encode. A single ``str`` is treated as
            one text rather than being iterated character by character.
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length', truncation=True)``.
            Its result must support ``['input_ids']`` and ``['attention_mask']``.
        max_length: Length each text is padded/truncated to. Defaults to 128,
            the value that used to be hard-coded.
    """

    def __init__(self, texts, tokenizer, max_length=128):
        # Guard against a bare string, which would otherwise yield one sample per character.
        if isinstance(texts, str):
            texts = [texts]
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.list_texts = []
        self.build()

    def __len__(self):
        """Return the number of encoded texts."""
        return len(self.list_texts)

    def __getitem__(self, index):
        """Return ``{'ids': LongTensor, 'mask': LongTensor}`` for the text at ``index``."""
        encoding = self.list_texts[index]
        return {
            'ids': torch.tensor(encoding['input_ids'], dtype=torch.long),
            'mask': torch.tensor(encoding['attention_mask'], dtype=torch.long),
        }

    def build(self):
        """(Re)encode ``self.texts`` into ``self.list_texts``.

        The list is rebuilt from scratch, so calling this again does not
        duplicate samples.
        """
        self.list_texts = [
            self.tokenizer(text, max_length=self.max_length, padding='max_length', truncation=True)
            for text in self.texts
        ]
################################################

In [11]:
# Number of articles taken from each split.
NUM_TRAIN_TEXTS = 20000
NUM_VALID_TEXTS = 2000

# Slice the rows first and select the column afterwards: this yields the same
# list of strings but only reads the requested rows, whereas selecting the
# column first may materialize every article of the split before slicing.
texts_train = dataset['train'][:NUM_TRAIN_TEXTS]['article']
texts_valid = dataset['validation'][:NUM_VALID_TEXTS]['article']

# Tokenization happens eagerly inside CustomDataset; the printed value is in minutes.
start_time = time.time()
train_dataset = CustomDataset(texts_train, tokenizer)
valid_dataset = CustomDataset(texts_valid, tokenizer)
print("Time Taken = ", (time.time() - start_time) / 60)

Time Taken =  4.11614770491918


In [12]:
# Loader settings shared by the training and validation loaders.
BATCH_SIZE = 8
NUM_WORKERS = 4

# Only the training data is shuffled; validation keeps its original order.
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)
valid_loader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=BATCH_SIZE, num_workers=NUM_WORKERS)

# len(DataLoader) is the number of batches, not the number of samples, so the
# labels say "Batches" (the previous "Samples" label was misleading).
print("Training Batches = ", len(train_loader))
print("Validation Batches = ", len(valid_loader))

Training Samples =  2500
Validation Samples =  250


In [22]:
def _init_weights(module):
  """Initialise one sub-module in place; meant to be used with ``Module.apply``.

  Linear / Embedding / Conv1D weights are drawn from N(0, 0.02) and the biases
  of Linear / Conv1D layers (when present) are zeroed. LayerNorm layers get a
  zero bias and unit weight. Any other module is left untouched.
  """
  if isinstance(module, (nn.Linear, nn.Embedding, Conv1D)):
    nn.init.normal_(module.weight, mean=0.0, std=0.02)
    if isinstance(module, (nn.Linear, Conv1D)) and module.bias is not None:
      nn.init.zeros_(module.bias)
    return
  if isinstance(module, nn.LayerNorm):
    nn.init.zeros_(module.bias)
    nn.init.ones_(module.weight)


# Build the model from the config, initialise every sub-module, then move it to the device.
model = OpenAIGPTLMHeadModel(config)
model = model.apply(_init_weights).to(device)

# Count every parameter element of the model.
total_params = 0
for param in model.parameters():
  total_params += param.numel()
print("Total Parameters = ", total_params)

Total Parameters =  105094656


In [15]:
# Training hyper-parameters.
EPOCHS = 5
LEARNING_RATE = 3e-5

# The optimizer is bound to the parameters of the current `model` object, so
# re-run this cell whenever the model cell is re-executed.
optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

In [16]:
def compute_loss(model, data_loader, device):
  """Return the mean of the per-batch losses of ``model`` over ``data_loader``.

  The model is switched to eval mode (and left there) and gradients are
  disabled for the duration of the pass, so callers do not need to wrap the
  call themselves.

  Args:
    model: Callable as ``model(input_ids=..., attention_mask=..., labels=...)``
      whose first output is a scalar loss tensor.
    data_loader: Iterable of batches, each a mapping with ``'ids'`` and
      ``'mask'`` tensors. It does not need to support ``len()``.
    device: Device the batch tensors are moved to before the forward pass.

  Returns:
    The average of the batch losses as a float (every batch weighs the same,
    regardless of its size), or NaN when ``data_loader`` yields no batches.
  """
  total_loss = 0.0
  num_batches = 0
  model.eval()
  # Evaluation never needs autograd; disabling it here avoids building graphs
  # even if the caller forgets to.
  with torch.no_grad():
    for sample in data_loader:
      ids = sample['ids'].to(device)
      mask = sample['mask'].to(device)

      # NOTE(review): the labels include padding positions; whether the model
      # excludes them from the loss is not visible from here - confirm.
      outputs = model(input_ids=ids, attention_mask=mask, labels=ids)
      total_loss += outputs[0].item()
      num_batches += 1

  # An average over zero batches is undefined; report NaN instead of raising
  # ZeroDivisionError.
  if num_batches == 0:
    return float('nan')
  return total_loss / num_batches

# Wall-clock reference for every elapsed-time print below (values are minutes since this point).
start_time = time.time()
num_batches = len(train_loader)
for epoch in range(EPOCHS):
  # ---- training pass ----
  model.train()
  for step, batch in enumerate(train_loader):
    ids = batch['ids'].to(device)
    mask = batch['mask'].to(device)

    # The input ids are also passed as labels; the loss is the model's first output.
    loss = model(input_ids=ids, attention_mask=mask, labels=ids)[0]

    # LOGGING
    if step % 500 == 0:
      print("Batch: %04d/%04d || Epoch: %03d/%03d || Loss: %.3f" % (step, num_batches, epoch, EPOCHS, loss.item()))

    # Clear stale gradients, back-propagate, then update the weights.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

  # ---- validation pass (no gradients needed) ----
  model.eval()
  with torch.no_grad():
    valid_loss = compute_loss(model, valid_loader, device)
  print("Validation Average Loss: ", valid_loss)

  # Time since `start_time`, i.e. cumulative over the epochs run so far.
  epoch_elapsed_time = (time.time() - start_time) / 60
  print("Epoch Elapsed Time: ", epoch_elapsed_time)
total_elapsed_time = (time.time() - start_time) / 60
print("Total Training Elapsed Time: ", total_elapsed_time)

Batch: 0000/2500 || Epoch: 000/005 || Loss: 10.787
Batch: 0500/2500 || Epoch: 000/005 || Loss: 6.876
Batch: 1000/2500 || Epoch: 000/005 || Loss: 6.932
Batch: 1500/2500 || Epoch: 000/005 || Loss: 6.204
Batch: 2000/2500 || Epoch: 000/005 || Loss: 6.247
Validation Average Loss:  6.36370386505127
Epoch Elapsed Time:  8.491429289182028
Batch: 0000/2500 || Epoch: 001/005 || Loss: 6.227
Batch: 0500/2500 || Epoch: 001/005 || Loss: 5.949
Batch: 1000/2500 || Epoch: 001/005 || Loss: 6.295
Batch: 1500/2500 || Epoch: 001/005 || Loss: 5.695
Batch: 2000/2500 || Epoch: 001/005 || Loss: 5.676
Validation Average Loss:  6.002708009719848
Epoch Elapsed Time:  17.0772571961085
Batch: 0000/2500 || Epoch: 002/005 || Loss: 5.313
Batch: 0500/2500 || Epoch: 002/005 || Loss: 5.680
Batch: 1000/2500 || Epoch: 002/005 || Loss: 5.416
Batch: 1500/2500 || Epoch: 002/005 || Loss: 5.358
Batch: 2000/2500 || Epoch: 002/005 || Loss: 5.644
Validation Average Loss:  5.781677068710327
Epoch Elapsed Time:  25.670752584934235
B

In [18]:
# Testing the model TODO (Needs more training as of now)
import torch.nn.functional as F  # not used in this cell; kept in case later cells rely on `F`
text = ["there is a"]
# Use dedicated names: rebinding `dataset` here would overwrite the corpus
# loaded earlier and break a re-run of the cell that slices it.
prompt_dataset = CustomDataset(text, tokenizer)
prompt_loader = DataLoader(dataset=prompt_dataset, shuffle=False, batch_size=1)
model.eval()
with torch.no_grad():
  # shuffle=False and batch_size=1 keep the batches aligned one-to-one with `text`,
  # so each prediction is printed next to its own prompt.
  for prompt, sample in zip(text, prompt_loader):
    ids = sample['ids'].to(device)
    mask = sample['mask'].to(device)
    outputs = model(input_ids=ids, attention_mask=mask)
    logits = outputs[0]
    # Flatten to (positions, vocab); with batch_size=1 the row index is the token position.
    logits = logits.view(-1, logits.size(-1))
    # The next-token prediction lives at the last non-padding position. It is
    # derived from the attention mask instead of a hard-coded index so prompts
    # of any length work. Assumes the tokenizer pads on the right - TODO confirm.
    last_position = max(int(mask.sum().item()) - 1, 0)
    predicted_token_id = int(torch.argmax(logits[last_position]).item())
    print("Original text= ", prompt, "\npredicted_next_token --> ", tokenizer.decode([predicted_token_id]))

Original text=  there is a 
predicted_next_token -->  lot
