In [1]:
# Fetch the PyTorch-Architectures repository into the current working directory.
# The following cell changes into its modeling_gpt2/ folder; the `model` module
# imported further down is presumably resolved from there.
!git clone https://github.com/vishal-burman/PyTorch-Architectures.git

Cloning into 'PyTorch-Architectures'...
remote: Enumerating objects: 363, done.[K
remote: Counting objects: 100% (363/363), done.[K
remote: Compressing objects: 100% (221/221), done.[K
remote: Total 794 (delta 195), reused 270 (delta 117), pack-reused 431[K
Receiving objects: 100% (794/794), 8.41 MiB | 17.94 MiB/s, done.
Resolving deltas: 100% (471/471), done.


In [2]:
# Make the GPT-2 directory of the cloned repository the working directory for
# the rest of the notebook.
%cd PyTorch-Architectures/modeling_gpt2/

/kaggle/working/PyTorch-Architectures/modeling_gpt2


In [3]:
! pip install transformers
! pip install datasets

Collecting datasets
  Downloading datasets-1.1.2-py3-none-any.whl (147 kB)
[K     |████████████████████████████████| 147 kB 2.1 MB/s eta 0:00:01
[?25hCollecting pyarrow>=0.17.1
  Downloading pyarrow-2.0.0-cp37-cp37m-manylinux2014_x86_64.whl (17.7 MB)
[K     |████████████████████████████████| 17.7 MB 19.2 MB/s eta 0:00:01
Collecting xxhash
  Downloading xxhash-2.0.0-cp37-cp37m-manylinux2010_x86_64.whl (243 kB)
[K     |████████████████████████████████| 243 kB 44.3 MB/s eta 0:00:01
Installing collected packages: pyarrow, xxhash, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 0.16.0
    Uninstalling pyarrow-0.16.0:
      Successfully uninstalled pyarrow-0.16.0
Successfully installed datasets-1.1.2 pyarrow-2.0.0 xxhash-2.0.0


In [8]:
import time

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from transformers import GPT2Tokenizer
from model import GPT2Classify

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# IMDB dataset loaded through the `datasets` library.
dataset = load_dataset('imdb')

# Pretrained GPT-2 tokenizer with '[PAD]' registered as its pad token.
# NOTE(review): presumably needed because the tokenizer is later called with
# padding='max_length'; confirm GPT2Classify's embedding covers the added id.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Instantiate the classifier with its default configuration and move it to `device`.
model = GPT2Classify()
model = model.to(device)

Reusing dataset imdb (/root/.cache/huggingface/datasets/imdb/plain_text/1.0.0/90099cb476936b753383ba2ae6ab2eae419b2e87f71cd5189cb9c8e5814d12a3)


In [5]:
# Count every scalar element across all parameter tensors of the model.
total_params = 0
for param in model.parameters():
    total_params += param.numel()
print("Total Parameters = ", total_params)

Total Parameters =  60650496


In [23]:
class TextDataset(Dataset):
    """Map-style dataset of pre-tokenized texts with their labels.

    Every text is tokenized once at construction time (see ``build``) and the
    encodings are kept in memory; ``__getitem__`` only converts them to tensors.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(text, max_length=..., padding='max_length', truncation=True)``
            that returns a mapping with ``'input_ids'`` and ``'attention_mask'``.
        list_texts: Iterable of raw text strings.
        list_labels: Iterable of labels, paired positionally with ``list_texts``.
        max_seq_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, tokenizer, list_texts, list_labels, max_seq_length=128):
        self.tokenizer = tokenizer
        self.list_texts = list_texts
        self.list_labels = list_labels
        self.max_seq_length = max_seq_length
        # One dict per example: {'input_ids', 'attention_mask', 'label'}.
        self.list_train = []
        self.build()

    def __len__(self):
        """Return the number of tokenized examples."""
        return len(self.list_train)

    def __getitem__(self, index):
        """Return the example at ``index`` as a dict of tensors.

        Returns:
            dict with ``'ids'`` (token ids, long), ``'mask'`` (attention mask,
            long) and ``'label'`` (tensor built from the stored label).
        """
        example = self.list_train[index]
        return {
            'ids': torch.tensor(example['input_ids'], dtype=torch.long),
            # The mask must come from the tokenizer's attention mask; using the
            # token ids here would make the model attend according to vocabulary
            # ids instead of real-token/padding positions.
            'mask': torch.tensor(example['attention_mask'], dtype=torch.long),
            'label': torch.tensor(example['label']),
        }

    def build(self):
        """Tokenize every (text, label) pair and append it to ``self.list_train``."""
        for text, label in zip(self.list_texts, self.list_labels):
            tokens = self.tokenizer(
                text,
                max_length=self.max_seq_length,
                padding='max_length',
                truncation=True,
            )
            self.list_train.append({
                'input_ids': tokens['input_ids'],
                'attention_mask': tokens['attention_mask'],
                'label': label,
            })

In [24]:
# Hold out part of the official training split for validation.
# The split is shuffled with a fixed seed before being cut: a plain tail slice
# takes a contiguous run of examples, and the IMDB split does not appear to be
# stored shuffled (NOTE(review): examples look grouped by label - verify), so
# an unshuffled slice would give a validation set dominated by a single class.
VALID_SIZE = 2500   # same hold-out size as the former [22500:] slice
SPLIT_SEED = 42     # fixed so the split is reproducible across runs
train_valid = dataset['train'].train_test_split(
    test_size=VALID_SIZE, shuffle=True, seed=SPLIT_SEED
)

texts_train = train_valid['train']['text']
labels_train = train_valid['train']['label']

texts_valid = train_valid['test']['text']
labels_valid = train_valid['test']['label']

# The official test split is used unchanged.
texts_test = dataset['test']['text']
labels_test = dataset['test']['label']

In [25]:
train_dataset = TextDataset(tokenizer, texts_train, labels_train)
valid_dataset = TextDataset(tokenizer, texts_valid, labels_valid)
train_dataset = TextDataset(tokenizer, texts_test, labels_test)

In [26]:
batch_size = 16
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=batch_size * 2)
test_loader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=batch_size * 2)

print("Length of Train loader: ", len(train_loader))
print("Length of Valid loader: ", len(valid_loader))
print("Length of Test loader: ", len(test_loader))

Length of Train loader:  1563
Length of Valid loader:  79
Length of Test loader:  79


In [29]:
# Sanity check: pull a single batch from the training loader and show the
# shapes of its token ids, attention mask and labels.
for sample in train_loader:
    ids, mask, labels = sample['ids'], sample['mask'], sample['label']
    print(ids.shape, mask.shape, labels.shape)
    break  # one batch is enough

torch.Size([16, 128]) torch.Size([16, 128]) torch.Size([16])


In [30]:
# AdamW over all model parameters with a small fine-tuning learning rate.
learning_rate = 1e-5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=learning_rate)

In [None]:
EPOCHS = 5

def compute_accuracy(model, data_loader, device):
    """Return the classification accuracy (in percent) of `model` on `data_loader`.

    Args:
        model: Classifier called as
            ``model(input_ids=..., attention_mask=..., labels=...)``.
        data_loader: Iterable of batches shaped like the ones TextDataset
            yields (dicts with 'ids', 'mask' and 'label' tensors).
        device: Device the batch tensors are moved to before the forward pass.

    Returns:
        float: ``100 * correct / total``; ``0.0`` when the loader yields no examples.
    """
    correct, total = 0, 0
    was_training = model.training
    model.eval()
    with torch.no_grad():
        for sample in data_loader:
            ids = sample['ids'].to(device)
            mask = sample['mask'].to(device)
            labels = sample['label'].to(device)

            outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
            # NOTE(review): assumes the model returns (loss, logits, ...) when
            # labels are supplied - the training loop reads the loss from
            # outputs[0]; confirm the logits position against GPT2Classify.
            logits = outputs[1]
            predictions = torch.argmax(logits, dim=-1)

            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    # Leave the model in the mode the caller had it in.
    if was_training:
        model.train()
    return 100.0 * correct / total if total else 0.0

start_time = time.time()
for epoch in range(EPOCHS):
    model.train()
    for batch_idx, sample in enumerate(train_loader):
        ids = sample['ids'].to(device)
        mask = sample['mask'].to(device)
        labels = sample['label'].to(device)

        # The model computes the loss itself when labels are passed; it is the
        # first element of the returned outputs.
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # LOGGING
        if batch_idx % 200 == 0:
            print("Batch: %04d/%04d || Epoch: %04d/%04d" % (batch_idx, len(train_loader), epoch+1, EPOCHS))

    # Evaluate after every epoch without tracking gradients.
    model.eval()
    with torch.no_grad():
        train_accuracy = compute_accuracy(model, train_loader, device)
        valid_accuracy = compute_accuracy(model, valid_loader, device)
        print("Train Accuracy: %.2f || Valid Accuracy: %.2f" % (train_accuracy, valid_accuracy))
    # Minutes elapsed since training started (cumulative, not per epoch).
    elapsed_time = (time.time() - start_time) / 60
    print("Epoch Elapsed Time: ", elapsed_time)
elapsed_time = (time.time() - start_time) / 60
print("Total Training Time: ", elapsed_time)