<a href="https://colab.research.google.com/github/vishal-burman/PyTorch-Architectures/blob/master/modeling_bert/test_sample_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! nvidia-smi

In [None]:
! pip install transformers

In [None]:
#! rm -rf PyTorch-Architectures/
! git clone https://github.com/vishal-burman/PyTorch-Architectures.git

In [4]:
! cp /content/drive/'My Drive'/dataset.csv .

In [5]:
! cp dataset.csv PyTorch-Architectures/modeling_bert/

In [6]:
%cd PyTorch-Architectures/modeling_bert/

/content/PyTorch-Architectures/modeling_bert


In [8]:
import time
import csv
import sys
import pdb
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from model import BertClassify
from transformers import BertTokenizer
from config_bert import BertConfig
config = BertConfig()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [9]:
#########################################
# Sample data code
class CustomDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.tokenizer = tokenizer
        self.texts = texts
        self.labels = labels
        self.train_list = []
        self.label_list = []
        self.build()

    def __len__(self):
        return len(self.train_list)

    def __getitem__(self, index):
        ids = self.train_list[index]['input_ids']
        mask = self.train_list[index]['attention_mask']
        target = self.label_list[index]

        return{
                'ids': torch.tensor(ids, dtype=torch.long),
                'mask': torch.tensor(ids, dtype=torch.long),
                'target': torch.tensor(target, dtype=torch.long).unsqueeze(0)
                }

    def build(self):
        for t, l in zip(self.texts, self.labels):
            self.train_list.append(tokenizer(t, max_length=128, pad_to_max_length=True, truncation=True))
            self.label_list.append(l)
##########################################


In [10]:
texts = []
labels = []
with open("dataset.csv", "r") as file_1:
    reader = csv.reader(file_1)
    for line in reader:
        texts.append(line[0].strip())
        labels.append(line[1].strip())

texts = texts[1:]
labels = labels[1:]

labels = [1 if label == "positive" else 0 for label in labels]

texts_train = texts[:45000]
labels_train = labels[:45000]

texts_valid = texts[45000:]
labels_valid = labels[45000:]

start_time = time.time()
train_dataset = CustomDataset(texts_train, labels_train, tokenizer)
valid_dataset = CustomDataset(texts_valid, labels_valid, tokenizer)
print("Dataset Conversion Done!!")
print("Time Taken = ", (time.time() - start_time)/60)



Dataset Conversion Done!!
Time Taken =  4.4791104634602865


In [11]:
BATCH_SIZE = 64
LEARNING_RATE = 5e-05
EPOCHS = 5

In [12]:
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=BATCH_SIZE)
valid_loader = DataLoader(dataset=valid_dataset, shuffle=False, batch_size=BATCH_SIZE)
print("Total train batches = ", len(train_loader))
print("Total valid batches = ", len(valid_loader))

Total train batches =  704
Total valid batches =  79


In [14]:
def init_weights(module):
  if isinstance(module, (nn.Linear, nn.Embedding)):
    module.weight.data.normal_(mean=0.0, std=0.02)
  elif isinstance(module, nn.LayerNorm):
      module.bias.data.zero_()
      module.weight.data.fill_(1.0)
  if isinstance(module, nn.Linear) and module.bias is not None:
      module.bias.data.zero_()

model = BertClassify(config)
model.apply(init_weights)
model = model.to(device)
pytorch_total_params = sum(p.numel() for p in model.parameters())
print("Total Parameters = ", pytorch_total_params)

optimizer = torch.optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

Total Parameters =  66956546


In [15]:
def compute_accuracy(model, data_loader, device):
    correct_pred, num_examples = 0, 0
    model.eval()
    for idx, sample in enumerate(data_loader):
        ids = sample['ids'].to(device)
        mask = sample['mask'].to(device)
        target = sample['target'].to(device)

        output = model(input_ids=ids, attention_mask=mask)
        logits = output[0]
        probas = F.softmax(logits, dim=1)
        _, predicted_labels = torch.max(probas, 1)
        num_examples += target.size(0)
        correct_pred += (predicted_labels.unsqueeze(1) == target).sum()
    return correct_pred.float()/num_examples*100
        

start_time = time.time()
for epoch in range(EPOCHS):
    model.train()
    for idx, sample in enumerate(train_loader):
        ids = sample['ids'].to(device)
        mask = sample['mask'].to(device)
        target = sample['target'].to(device)
        
        optimizer.zero_grad()
        output = model(input_ids=ids, attention_mask=mask, labels=target)
        loss = output[0]

        # LOGGING
        if idx % 100 == 0:
            print("Epoch: %05d || Batch: %05d || Loss: %.3f" % (epoch+1, idx, loss.item()))
        
        loss.backward()
        optimizer.step()

    model.eval()
    with torch.set_grad_enabled(False):

        train_acc = compute_accuracy(model, train_loader, device)
        valid_acc = compute_accuracy(model, valid_loader, device)

        print("Train Accuracy = ", train_acc)
        print("Valid Accuracy = ", valid_acc)

    elapsed_time = (time.time() - start_time) / 60
    print("Elapsed Time: ", elapsed_time)

Epoch: 00001 || Batch: 00000 || Loss: 0.716
Epoch: 00001 || Batch: 00100 || Loss: 0.684
Epoch: 00001 || Batch: 00200 || Loss: 0.703
Epoch: 00001 || Batch: 00300 || Loss: 0.694
Epoch: 00001 || Batch: 00400 || Loss: 0.671
Epoch: 00001 || Batch: 00500 || Loss: 0.665
Epoch: 00001 || Batch: 00600 || Loss: 0.674
Epoch: 00001 || Batch: 00700 || Loss: 0.682
Train Accuracy =  tensor(53.9356, device='cuda:0')
Valid Accuracy =  tensor(53.2400, device='cuda:0')
Elapsed Time:  6.72454807360967
Epoch: 00002 || Batch: 00000 || Loss: 0.690
Epoch: 00002 || Batch: 00100 || Loss: 0.648
Epoch: 00002 || Batch: 00200 || Loss: 0.672
Epoch: 00002 || Batch: 00300 || Loss: 0.652
Epoch: 00002 || Batch: 00400 || Loss: 0.686
Epoch: 00002 || Batch: 00500 || Loss: 0.630
Epoch: 00002 || Batch: 00600 || Loss: 0.665
Epoch: 00002 || Batch: 00700 || Loss: 0.650
Train Accuracy =  tensor(62.9733, device='cuda:0')
Valid Accuracy =  tensor(57.6200, device='cuda:0')
Elapsed Time:  13.445187520980834
Epoch: 00003 || Batch: 000

In [None]:
# Further improvements can be done!!