In [1]:
import pandas as pd
import re
import torch

from torch.utils.data import Dataset, TensorDataset, DataLoader, SequentialSampler, RandomSampler
from torch.nn.utils.rnn import pad_sequence

import pickle
import os
import numpy as np

In [2]:
# Run on the first CUDA GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print(device)

cpu


In [3]:
# Load the MultiNLI examples; lines=True reads the file as one JSON object per line.
df = pd.read_json("../data/multinli.jsonl", lines=True)


In [4]:
# Keep only the label and the two sentences, then take a small subset by position:
# the first 1000 rows for training and the following 200 rows for validation.
data = df[['gold_label', 'sentence1', 'sentence2']]
train_df = data.iloc[:1000]
val_df = data.iloc[1000:1200]

In [5]:
# Display the training split as sliced, before any cleaning.
train_df

Unnamed: 0,gold_label,sentence1,sentence2
0,neutral,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...
1,entailment,you know during the season and i guess at at y...,You lose the things to the following level if ...
2,entailment,One of our number will carry out your instruct...,A member of my team will execute your orders w...
3,entailment,How do you know? All this is their information...,This information belongs to them.
4,neutral,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.
...,...,...,...
995,contradiction,so let's see what well what kind of music do y...,Tell me about all the music you love listening...
996,contradiction,"They--hey, what's that?"" He was looking up, a...","He looked down at the ground, while Hanson loo..."
997,contradiction,Or does cold iron ruin your conjuring here? S...,Sather Karf has no questions regarding cold ir...
998,contradiction,5 million Americans living in households with ...,5 millun Americans make too much money


In [6]:
# Discard every row that has a missing value in any column, for both splits.
train_df, val_df = (split.dropna() for split in (train_df, val_df))

In [7]:
# Cast both sentence columns to str in a single step. DataFrame.astype returns a new
# frame, so no column is assigned into a frame that was derived from a slice/filter
# (which can trigger SettingWithCopyWarning) and re-running the cell is harmless.
train_df = train_df.astype({'sentence1': str, 'sentence2': str})

In [8]:
# Cast both sentence columns to str in a single step. DataFrame.astype returns a new
# frame, so no column is assigned into a frame that was derived from a slice/filter
# (which can trigger SettingWithCopyWarning) and re-running the cell is harmless.
val_df = val_df.astype({'sentence1': str, 'sentence2': str})

In [9]:
# Keep only the pairs in which both sentences contain at least one
# whitespace-separated token.
train_df = train_df.loc[
    lambda frame: frame['sentence1'].str.split().str.len().gt(0)
    & frame['sentence2'].str.split().str.len().gt(0)
]
val_df = val_df.loc[
    lambda frame: frame['sentence1'].str.split().str.len().gt(0)
    & frame['sentence2'].str.split().str.len().gt(0)
]

In [10]:
# Display the training split again after the NaN / empty-sentence filtering.
train_df

Unnamed: 0,gold_label,sentence1,sentence2
0,neutral,Conceptually cream skimming has two basic dime...,Product and geography are what make cream skim...
1,entailment,you know during the season and i guess at at y...,You lose the things to the following level if ...
2,entailment,One of our number will carry out your instruct...,A member of my team will execute your orders w...
3,entailment,How do you know? All this is their information...,This information belongs to them.
4,neutral,yeah i tell you what though if you go price so...,The tennis shoes have a range of prices.
...,...,...,...
995,contradiction,so let's see what well what kind of music do y...,Tell me about all the music you love listening...
996,contradiction,"They--hey, what's that?"" He was looking up, a...","He looked down at the ground, while Hanson loo..."
997,contradiction,Or does cold iron ruin your conjuring here? S...,Sather Karf has no questions regarding cold ir...
998,contradiction,5 million Americans living in households with ...,5 millun Americans make too much money


In [11]:
# Display the validation split after the NaN / empty-sentence filtering.
val_df

Unnamed: 0,gold_label,sentence1,sentence2
1000,entailment,The Pacific War actually began 70 minutes befo...,"70 minutes prior to the Pearl Harbor attack, t..."
1001,entailment,"The king himself died here in 1598, to be buri...",The king died in 1598 and was buried in a fami...
1002,entailment,The resources required for the installation of...,Installing control technologies in order to re...
1003,neutral,"About 2,500 victims of the Revolutionary guill...",The French Revolution was the last time the Co...
1004,contradiction,Brittany's countryside is wilder and less civi...,Brittany's countryside is boring and mundane.
...,...,...,...
1195,neutral,"First, the Parc de la Villette offers a range ...","The park offers many activities, but participa..."
1196,entailment,Plans are in place to turn the house into a mu...,There are plans to turn the house into a museum.
1197,entailment,Jon ran his rapier through the horse's flank a...,Jon stuck a rapier in the groin of the man.
1198,entailment,Today it's a delightful resort of both modest ...,These days the hotels and villas comprise a be...


In [12]:
import torch
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn.utils.rnn import pad_sequence
import pickle
import os
from transformers import BertTokenizer

class MNLIDataBert(Dataset):
  """Tokenises MNLI premise/hypothesis pairs for BERT and serves them as DataLoaders.

  Both splits are encoded eagerly by the constructor into `TensorDataset`s whose
  items are (token ids, attention mask, segment ids, label id).
  """

  # Longest encoded pair that is kept, special tokens included; longer pairs are
  # truncated in `load_data`.
  # NOTE(review): 512 is taken to be the position limit of bert-base-uncased - confirm.
  MAX_LEN = 512

  def __init__(self, train_df, val_df):
    """Store the frames, load the tokenizer and encode both splits.

    Args:
      train_df: frame with 'gold_label', 'sentence1' and 'sentence2' columns.
      val_df: frame with the same columns, used for validation.
    """
    self.label_dict = {'entailment': 0, 'contradiction': 1, 'neutral': 2}

    self.train_df = train_df
    self.val_df = val_df

    self.base_path = '../content/'
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
    self.train_data = None
    self.val_data = None
    self.init_data()

  def init_data(self):
    """Encode the training and validation frames into `train_data` / `val_data`.

    The encoded datasets are rebuilt on every call and are not cached to disk:
    saving them took too much RAM.
    """
    self.train_data = self.load_data(self.train_df)
    self.val_data = self.load_data(self.val_df)

  @staticmethod
  def _truncate_pair(premise_id, hypothesis_id, budget):
    """Drop trailing ids from the longer list until both lists fit in `budget` ids."""
    premise_id, hypothesis_id = list(premise_id), list(hypothesis_id)
    excess = len(premise_id) + len(hypothesis_id) - budget
    for _ in range(max(excess, 0)):
      if len(premise_id) >= len(hypothesis_id):
        premise_id.pop()
      else:
        hypothesis_id.pop()
    return premise_id, hypothesis_id

  def load_data(self, df):
    """Encode every row of `df` as `[CLS] premise [SEP] hypothesis [SEP]`.

    Args:
      df: frame with 'sentence1' (premise), 'sentence2' (hypothesis) and
        'gold_label' columns.

    Returns:
      TensorDataset of (token ids, attention mask, segment ids, labels). The first
      three tensors are zero-padded to the longest pair in `df`, which is at most
      `MAX_LEN` ids long.

    Raises:
      KeyError: if a label is not a key of `label_dict`.
    """
    token_ids = []
    mask_ids = []
    seg_ids = []
    y = []

    cls_id = self.tokenizer.cls_token_id
    sep_id = self.tokenizer.sep_token_id
    # Three positions are reserved for [CLS] and the two [SEP] tokens.
    content_budget = self.MAX_LEN - 3

    for premise, hypothesis, label in zip(df['sentence1'], df['sentence2'], df['gold_label']):
      premise_id = self.tokenizer.encode(premise, add_special_tokens=False)
      hypothesis_id = self.tokenizer.encode(hypothesis, add_special_tokens=False)
      # Enforce MAX_LEN: previously the limit was declared but never applied, so an
      # over-long pair was passed on at full length.
      premise_id, hypothesis_id = self._truncate_pair(premise_id, hypothesis_id, content_budget)

      pair_token_ids = [cls_id] + premise_id + [sep_id] + hypothesis_id + [sep_id]
      premise_len = len(premise_id)
      hypothesis_len = len(hypothesis_id)

      # Segment 0 covers [CLS] + premise + [SEP]; segment 1 covers hypothesis + [SEP].
      segment_ids = torch.tensor([0] * (premise_len + 2) + [1] * (hypothesis_len + 1))
      # 1 for every real token; pad_sequence below fills the padded positions with 0.
      attention_mask_ids = torch.tensor([1] * (premise_len + hypothesis_len + 3))

      token_ids.append(torch.tensor(pair_token_ids))
      seg_ids.append(segment_ids)
      mask_ids.append(attention_mask_ids)
      y.append(self.label_dict[label])

    token_ids = pad_sequence(token_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    seg_ids = pad_sequence(seg_ids, batch_first=True)
    y = torch.tensor(y)
    dataset = TensorDataset(token_ids, mask_ids, seg_ids, y)
    print(len(dataset))
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    """Build the DataLoaders for the two encoded splits.

    Args:
      batch_size: number of examples per batch for both loaders.
      shuffle: whether the training loader reshuffles its examples. The validation
        loader always keeps its original order so that evaluation is repeatable.

    Returns:
      (train_loader, val_loader)
    """
    train_loader = DataLoader(
      self.train_data,
      shuffle=shuffle,
      batch_size=batch_size
    )

    val_loader = DataLoader(
      self.val_data,
      shuffle=False,
      batch_size=batch_size
    )

    return train_loader, val_loader

In [13]:
# Tokenise both splits up front; load_data prints the size of each encoded dataset.
mnli_dataset = MNLIDataBert(train_df, val_df)

100%|█████████████████████████████████████████████████████████████████████████████████████| 231508/231508 [00:01<00:00, 153457.74B/s]


1000
200


In [None]:
# Build the training and validation DataLoaders with 16 examples per batch.
train_loader, val_loader = mnli_dataset.get_data_loaders(batch_size=16)

In [None]:
from transformers import BertForSequenceClassification, AdamW

# Load pretrained bert-base-uncased with a 3-way classification head (one output per
# entry of the dataset's label_dict) and move it to the selected device.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)

In [14]:
# Split the model parameters into two optimizer groups: biases and LayerNorm weights
# are exempt from weight decay, every other parameter is decayed.
param_optimizer = list(model.named_parameters())
# Hugging Face BERT names the layer-norm scale `LayerNorm.weight`; the previous
# 'gamma'/'beta' patterns are legacy TensorFlow names.
# NOTE(review): confirm against model.named_parameters() that nothing is called gamma/beta.
no_decay = ['bias', 'LayerNorm.weight']
# The per-group key must be `weight_decay`: that is the name the optimizer reads.
# The earlier `weight_decay_rate` key was stored on the group but never used, so
# no weight decay was applied to any parameter.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [None]:
# This variable contains all of the hyperparameter information our training loop needs:
# the two parameter groups built above, a learning rate of 2e-5, and Adam bias
# correction switched off (correct_bias=False).
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)



In [None]:
def count_parameters(model):
    """Return the total number of elements across the model's trainable parameters.

    Only parameters with `requires_grad` set are counted.
    """
    trainable = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable += param.numel()
    return trainable

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')



In [None]:
def multi_acc(y_pred, y_test):
  """Return the fraction of rows whose highest-scoring class matches the label.

  Args:
    y_pred: 2-D tensor of per-class scores, one row per example.
    y_test: 1-D tensor of integer class labels, one per row of `y_pred`.

  Returns:
    0-dim float tensor: correct predictions divided by `y_test.size(0)`.
  """
  # log_softmax is monotonic within each row, so it cannot change which class has the
  # highest score; taking argmax on the raw scores skips that redundant computation.
  predicted = y_pred.argmax(dim=1)
  n_correct = (predicted == y_test).sum().float()
  return n_correct / float(y_test.size(0))


In [None]:
import time

# Number of epochs `train` runs when the caller does not pass `epochs`.
EPOCHS = 5


def _run_epoch(model, loader, run_device, optimizer=None):
  """Run one pass over `loader`, updating the weights only when `optimizer` is given.

  Args:
    model: callable as model(ids, token_type_ids=..., attention_mask=..., labels=...)
      whose output yields the loss at index 0 and the class scores at index 1.
    loader: iterable of (token ids, attention mask, segment ids, labels) batches.
    run_device: device every batch is moved to before the forward pass.
    optimizer: when given, the model is put in training mode and stepped after
      each batch; when None, the model is put in eval mode and no gradients are tracked.

  Returns:
    (mean loss per example, accuracy); both are NaN when `loader` yields no examples.
  """
  training = optimizer is not None
  model.train(training)
  loss_sum, n_correct, n_seen = 0.0, 0, 0

  with torch.set_grad_enabled(training):
    for pair_token_ids, mask_ids, seg_ids, y in loader:
      pair_token_ids = pair_token_ids.to(run_device)
      mask_ids = mask_ids.to(run_device)
      seg_ids = seg_ids.to(run_device)
      labels = y.to(run_device)

      if training:
        optimizer.zero_grad()

      outputs = model(pair_token_ids,
                      token_type_ids=seg_ids,
                      attention_mask=mask_ids,
                      labels=labels)
      # Index access works for both tuple outputs and dict-like model outputs, and
      # does not depend on the output holding exactly two entries.
      loss, prediction = outputs[0], outputs[1]

      if training:
        loss.backward()
        optimizer.step()

      # Weight every batch by its size so a smaller final batch does not skew the
      # epoch averages (assumes the returned loss is a mean over the batch).
      batch_size = labels.size(0)
      loss_sum += loss.item() * batch_size
      n_correct += (prediction.argmax(dim=1) == labels).sum().item()
      n_seen += batch_size

  if n_seen == 0:
    return float('nan'), float('nan')
  return loss_sum / n_seen, n_correct / n_seen


def train(model, train_loader, val_loader, optimizer, epochs=None):
  """Fine-tune `model`, printing metrics and elapsed time after every epoch.

  Each epoch performs one optimisation pass over `train_loader` followed by one
  gradient-free evaluation pass over `val_loader`. Batches are moved to the device
  that holds the model's parameters.

  Args:
    model: the model to train (see `_run_epoch` for the calling convention).
    train_loader: batches used to update the weights.
    val_loader: batches used only for evaluation.
    optimizer: optimizer bound to the parameters of `model`.
    epochs: number of epochs to run; defaults to the module-level `EPOCHS`.
  """
  n_epochs = EPOCHS if epochs is None else epochs
  first_param = next(model.parameters(), None)
  run_device = first_param.device if first_param is not None else torch.device("cpu")

  for epoch in range(n_epochs):
    start = time.time()
    train_loss, train_acc = _run_epoch(model, train_loader, run_device, optimizer)
    val_loss, val_acc = _run_epoch(model, val_loader, run_device)
    end = time.time()
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)

    print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_acc: {train_acc:.4f} | val_loss: {val_loss:.4f} val_acc: {val_acc:.4f}')
    print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

In [None]:
# Fine-tune the model; metrics and elapsed time are printed after each epoch.
train(model, train_loader, val_loader, optimizer)


In [None]:
from transformers import BertForSequenceClassification, AdamW

# NOTE(review): this repeats the earlier model-loading cell. Running it after
# train(...) rebinds `model` to freshly loaded pretrained weights, while the
# `optimizer` created earlier was built from the previous model's parameters.
# Consider removing this cell - TODO confirm it is a leftover.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 433/433 [00:00<00:00, 72532.20B/s]
  3%|██▏                                                                           | 12324864/440473133 [01:21<1:02:38, 113901.82B/s]