In [1]:
import importlib.util
import subprocess
import sys

import pip  # no longer used by the installer below; kept in case other cells rely on it

# Make sure the third-party dependencies are importable before the real imports run.
# `imp` was removed in Python 3.12 and `pip.main` is not available in pip >= 10,
# so the lookup uses importlib and the install goes through `python -m pip`.
for package in ["sentencepiece", "transformers", "torch"]:
  if importlib.util.find_spec(package) is not None:
    print(f"Found {package}")
    continue
  # Use the interpreter that runs this kernel so the package lands in the same environment.
  result = subprocess.run([sys.executable, "-m", "pip", "install", package], check=False)
  if result.returncode != 0:
    # Best effort, like the original: report the failure and let the import below raise.
    print(f"Could not install {package} (pip exited with status {result.returncode})")

import requests
from zipfile import ZipFile
from io import BytesIO
from typing import *
import pandas as pd
import numpy as np
from os import listdir, path

import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader, SequentialSampler, RandomSampler, random_split
from torch.nn.utils.rnn import pad_sequence
from transformers import BertTokenizer, BertForSequenceClassification, AdamW


Found sentencepiece
Found transformers
Found torch


In [2]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cuda:0


In [3]:
class DatasetFromZip(object):
    """Base class for a dataset that is distributed as a zip archive behind a URL.

    Subclasses are expected to implement `load_as_pandas` and `generate_data`.
    """

    def __init__(self, url: str):
        """Remember the download location; nothing is fetched until `extract`."""
        self.url = url
    
    def extract(self, output_path: str = "", timeout: float = 60):
        """Download the archive at `self.url` and unpack it into `output_path`.

        Args:
            output_path: Directory to extract into; the default "" is passed
                to `ZipFile.extractall` unchanged.
            timeout: Seconds `requests` waits for the server before giving up.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            zipfile.BadZipFile: If the downloaded payload is not a zip archive.
        """
        self.output_path = output_path
        response = requests.get(self.url, timeout=timeout)
        # Fail on 4xx/5xx here instead of handing an error page to ZipFile.
        response.raise_for_status()
        # The whole archive is held in memory; the context manager closes the
        # zip handle once extraction is done.
        with ZipFile(BytesIO(response.content)) as zipped:
            zipped.extractall(output_path)
    
    def load_as_pandas(self):
        """Return the dataset as a DataFrame; must be provided by subclasses."""
        raise NotImplementedError

    def generate_data(self):
        """Build model-ready data; must be provided by subclasses."""
        raise NotImplementedError


class MNLIDataset(DatasetFromZip):
    """MultiNLI 1.0 training file prepared as BERT sentence-pair inputs.

    Typical use: `extract()` to fetch/locate the files, then either
    `load_as_pandas()` for inspection or `generate_data()` to build the
    train/validation `DataLoader`s stored on `train_data` / `val_data`.
    """

    def __init__(self):
        super().__init__(url = "https://cims.nyu.edu/~sbowman/multinli/multinli_1.0.zip")
        self.class_names = ["entailment", "neutral", "contradiction"]
        self.class_map = dict(enumerate(self.class_names)) #idx to name
        self.class_map_inv = {v: k for k, v in self.class_map.items()} #name to idx
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
        # Kept as one-element lists so they can be concatenated with token id lists.
        self.cls_token = [self.tokenizer.cls_token_id]
        self.sep_token = [self.tokenizer.sep_token_id]

    def extract(self, output_path: str = ""):
        """Download and unpack the archive unless it is already present.

        The archive is (re)fetched when `<output_path>/multinli_1.0` is missing
        or holds fewer than 9 entries, which is treated as an incomplete
        extraction. Afterwards `data_path`, `data_files` and `train_path` are set.
        """
        self.output_path = output_path
        data_path = path.join(output_path, "multinli_1.0")
        # Look inside the requested output directory, not the working directory.
        if not path.isdir(data_path) or len(listdir(data_path)) < 9:
            super().extract(output_path)

        self.data_path = data_path
        self.data_files = listdir(self.data_path)
        self.train_path = path.join(self.data_path, "multinli_1.0_train.txt")

    def _read_rows(self) -> list:
        """Read the training file and return its lines split on tabs.

        Lines are split on "\\n" only (not `splitlines`) so that other
        separator characters inside a sentence do not break a row apart.
        """
        with open(self.train_path, "r", encoding="utf-8") as f:
            return [row.split("\t") for row in f.read().split("\n")]

    def load_as_pandas(self, train_data_only: bool = False) -> pd.DataFrame:
        """
        Note: Loads Entire Dataset in memory. Uses about 1.5GB of RAM

        The first row of the file is used as the column header. Rows where both
        sentences are missing, and rows with an empty `gold_label`, are dropped.
        With `train_data_only=True` only the `gold_label`, `sentence1` and
        `sentence2` columns are returned.
        """
        df = pd.DataFrame(self._read_rows())
        df = df.rename(columns=df.iloc[0]).drop(df.index[0])
        # Keeps a row as long as at least one of the two sentences is present.
        df = df[~df.sentence1.isna() | ~df.sentence2.isna()]
        df = df[df.gold_label != ""].reset_index(drop=True)
        if train_data_only:
            return df[["gold_label", "sentence1", "sentence2"]]
        return df

    def __bert_encode(self, text: str, special_tokens: bool = False) -> list:
        """Tokenize `text` into BERT ids, by default without [CLS]/[SEP]."""
        return self.tokenizer.encode(text, add_special_tokens = special_tokens)
        
    def __pad(self, seq: list) -> torch.Tensor:
        """Stack variable-length 1-D tensors into one batch-first padded tensor."""
        return pad_sequence(seq, batch_first=True)

    def generate_data(self, val_split_perc: float, batch_size: int):
        """
        Generates both Train and Val Datasets via torch Dataloader

        Every labelled row becomes `[CLS] premise [SEP] hypothesis [SEP]` with
        matching segment ids and attention mask. The examples are split at
        random into train/validation parts and wrapped in `DataLoader`s stored
        on `self.train_data` and `self.val_data`.

        Args:
            val_split_perc: Fraction of the examples used for validation.
            batch_size: Batch size for both loaders.

        Raises:
            ValueError: If the file contains no row with a known label.
        """
        token_ids, mask_ids, seg_ids, y = [], [], [], []

        # rows[0] is the header line.
        for row in self._read_rows()[1:]:
            # Label, premise and hypothesis live in columns 0, 5 and 6; shorter
            # rows (e.g. the empty one after a trailing newline) are skipped.
            if len(row) < 7: continue
            label, sent1, sent2 = row[0], row[5], row[6]
            if label not in self.class_names: continue

            sent1, sent2 = self.__bert_encode(sent1), self.__bert_encode(sent2)

            pair = self.cls_token + sent1 + self.sep_token + sent2 + self.sep_token
            premise_len, hypoth_len = len(sent1), len(sent2)
            # Segment 0 covers [CLS] + premise + [SEP]; segment 1 covers hypothesis + [SEP].
            segment_id = torch.tensor([0] * (premise_len + 2) + [1] * (hypoth_len + 1))
            attention_mask_id = torch.tensor([1] * (premise_len + hypoth_len + 3))

            token_ids.append(torch.tensor(pair))
            seg_ids.append(segment_id)
            mask_ids.append(attention_mask_id)
            y.append(self.class_map_inv[label])

        if not y:
            raise ValueError(f"No labelled examples found in {self.train_path}")

        # pad_sequence fills with 0, so padded positions get mask 0 and token id 0
        # (assumes the tokenizer's pad id is 0 - TODO confirm if the tokenizer changes).
        dataset = TensorDataset(self.__pad(token_ids), self.__pad(mask_ids), self.__pad(seg_ids), torch.tensor(y))
        datalen = len(dataset)
        val_num = int(datalen * val_split_perc)
        train_num = datalen - val_num
        train_data, val_data = random_split(dataset, [train_num, val_num])

        self.train_data = DataLoader(train_data, shuffle=True, batch_size=batch_size)
        # Evaluation does not benefit from shuffling; a fixed order keeps batches stable.
        self.val_data = DataLoader(val_data, shuffle=False, batch_size=batch_size)



In [4]:
# Data pipeline settings.
BATCH_SIZE = 16  # can't go higher because of GPU memory limits
SPLIT_PERC = 0.2  # fraction of the examples passed as the validation split

# Fetch the corpus if needed, then build the train/validation loaders.
builder = MNLIDataset()
builder.extract()
builder.generate_data(
    batch_size=BATCH_SIZE,
    val_split_perc=SPLIT_PERC,
)


In [None]:
# Pretrained BERT with a 3-way classification head, moved to the selected device.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=3,
)
model.to(device)

In [6]:
def multi_acc(y_pred, y_test):
  """Return the fraction of rows whose highest-scoring class equals the label.

  Args:
    y_pred: Tensor of per-class scores; the argmax is taken over dim 1.
    y_test: Tensor of integer class labels, one per row of `y_pred`.

  Returns:
    A scalar float tensor (call `.item()` for a Python float).
  """
  # argmax is unchanged by log_softmax (it is monotonic), so the raw scores are used directly.
  return (y_pred.argmax(dim=1) == y_test).float().mean()

# Split the parameters into two groups: biases and LayerNorm weights are
# excluded from weight decay, everything else is decayed.
param_optimizer = list(model.named_parameters())
# Hugging Face BERT names its LayerNorm parameters `LayerNorm.weight` / `LayerNorm.bias`;
# 'gamma' and 'beta' are kept only to cover checkpoints that use the legacy names.
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
# The per-group key must be 'weight_decay': that is the name the optimizer reads.
# An unknown key such as 'weight_decay_rate' is ignored and the default is used instead.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

# This variable contains all of the hyperparameter information our training loop needs
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)

def count_parameters(model):
    """Return how many scalar values `model` holds in parameters that require gradients."""
    trainable = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        trainable += param.numel()
    return trainable

# Report the number of trainable parameters, formatted with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,484,547 trainable parameters


In [None]:
import time


def _run_epoch(model, loader, optimizer=None):
  """Run one pass over `loader` and return `(mean_loss, mean_accuracy)` per batch.

  With an `optimizer` the model is put in training mode and updated after every
  batch; without one it is put in eval mode and gradients are disabled.
  Each batch is a `(token_ids, attention_mask, segment_ids, labels)` tuple.
  Returns `(nan, nan)` when the loader yields no batches.
  """
  is_training = optimizer is not None
  model.train(is_training)
  total_loss, total_acc, n_batches = 0.0, 0.0, 0

  with torch.set_grad_enabled(is_training):
    for pair_token_ids, mask_ids, seg_ids, y in loader:
      pair_token_ids = pair_token_ids.to(device)
      mask_ids = mask_ids.to(device)
      seg_ids = seg_ids.to(device)
      labels = y.to(device)

      if is_training:
        optimizer.zero_grad()

      outputs = model(pair_token_ids,
                      token_type_ids=seg_ids,
                      attention_mask=mask_ids,
                      labels=labels)
      # Look the fields up by name so extra output fields cannot break unpacking.
      loss, prediction = outputs["loss"], outputs["logits"]
      acc = multi_acc(prediction, labels)

      if is_training:
        loss.backward()
        optimizer.step()

      total_loss += loss.item()
      total_acc += acc.item()
      n_batches += 1

  if n_batches == 0:
    return float("nan"), float("nan")
  return total_loss / n_batches, total_acc / n_batches


def train(model, train_loader, val_loader, optimizer, EPOCHS):
  """Fine-tune `model` for `EPOCHS` epochs, validating after each one.

  After every epoch the mean per-batch loss and accuracy on both loaders are
  printed, followed by the epoch's wall-clock duration as HH:MM:SS.ss.
  The model is left in eval mode when the function returns.

  Args:
    model: Callable returning a mapping with "loss" and "logits" when given
      token ids plus `token_type_ids`, `attention_mask` and `labels`.
    train_loader: Iterable of training batches.
    val_loader: Iterable of validation batches.
    optimizer: Optimizer that updates the parameters of `model`.
    EPOCHS: Number of passes over `train_loader`.
  """
  for epoch in range(EPOCHS):
    start = time.time()
    train_loss, train_acc = _run_epoch(model, train_loader, optimizer)
    val_loss, val_acc = _run_epoch(model, val_loader)
    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

# Number of full passes over the training loader.
EPOCHS = 5
train(
    model=model,
    train_loader=builder.train_data,
    val_loader=builder.val_data,
    optimizer=optimizer,
    EPOCHS=EPOCHS,
)

