In [1]:
!pip install torchtext==0.10.0



In [2]:
!pip install transformers



In [3]:
import pandas as pd
import numpy as np
import os
import re
import torch
import torch.nn as nn
import torch.nn.functional as F
import json
from transformers import DataProcessor, InputExample, InputFeatures
from transformers import BertModel

In [4]:
from transformers import BertTokenizer

# Checkpoint name used to load the tokenizer in this cell.
bert_model_type = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(bert_model_type)

# Special tokens as strings; the preprocessing helpers below insert them by hand.
cls_token = tokenizer.cls_token
sep_token = tokenizer.sep_token
pad_token = tokenizer.pad_token
unk_token = tokenizer.unk_token
print(cls_token, sep_token, pad_token, unk_token)

# The same special tokens as vocabulary ids.
cls_token_idx = tokenizer.cls_token_id
sep_token_idx = tokenizer.sep_token_id
pad_token_idx = tokenizer.pad_token_id
unk_token_idx = tokenizer.unk_token_id
print(cls_token_idx, sep_token_idx, pad_token_idx, unk_token_idx)

# Single-letter ANLI labels mapped to class indices.
label_conversion = {'n':0, #neutral
'e':1, #entailment
'c':2} #contradiction

# `model_max_length` is the tokenizer's documented maximum input size. It replaces a lookup in
# the legacy per-checkpoint `max_model_input_sizes` map. The cap at 512 guards against
# tokenizer configs that leave the limit unset.
# NOTE(review): confirm that current transformers releases no longer ship `max_model_input_sizes`.
BERT_MAX_POSITIONS = 512
max_input_length = min(tokenizer.model_max_length, BERT_MAX_POSITIONS)
print(max_input_length)

# BATCH_SIZE = 16
BATCH_SIZE = 1

def read_jsonl(path):
    """Read a JSON Lines file and return its records as a list.

    Args:
        path: Path of the .jsonl file. It is opened as UTF-8; a leading BOM is stripped.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
    """
    with open(path, "r", encoding="utf-8-sig") as f:
        # Iterate the file lazily and skip blank lines (e.g. a trailing newline),
        # which json.loads would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]

def create_examples(filename):
    """Build one InputExample per record of the jsonl file at `filename`.

    Each record's 'context' becomes text_a, its 'hypothesis' becomes text_b and its
    'label' is passed through. The guid is "anli-bert-tf-<row index>".
    """
    return [
        InputExample(
            guid="%s-%s" % ("anli-bert-tf", row_number),
            text_a=record['context'],
            text_b=record['hypothesis'],
            label=record['label'],
        )
        for row_number, record in enumerate(read_jsonl(filename))
    ]

def get_sent1_token_type(sent):
    """Return a list of 0s with one entry per element of `sent` (first-segment token type ids).

    Returns an empty list when `sent` has no length (for example None or a float NaN).
    """
    try:
        return [0] * len(sent)
    except TypeError:
        # len() raises TypeError for unsized objects; other errors are no longer swallowed.
        return []
def get_sent2_token_type(sent):
    """Return a list of 1s with one entry per element of `sent` (second-segment token type ids).

    Returns an empty list when `sent` has no length (for example None or a float NaN).
    """
    try:
        return [1] * len(sent)
    except TypeError:
        # len() raises TypeError for unsized objects; other errors are no longer swallowed.
        return []

def pad_sequence(sequence, max_seq_length=max_input_length, pad_token=pad_token):
    '''
    Return `sequence` cut to at most `max_seq_length` items and right-filled with
    `pad_token` until it holds exactly `max_seq_length` items.
    '''
    clipped = sequence[:max_seq_length]
    filler = [pad_token] * (max_seq_length - len(clipped))
    return clipped + filler

def pad_attention_mask(attention_mask, max_seq_length=max_input_length):
    '''
    Return `attention_mask` cut to at most `max_seq_length` entries and right-filled
    with 0 until it holds exactly `max_seq_length` entries.
    '''
    kept = attention_mask[:max_seq_length]
    n_missing = max_seq_length - len(kept)
    return kept + [0] * n_missing


def pad_token_type(token_type, max_seq_length=max_input_length):
    '''
    Return `token_type` cut to at most `max_seq_length` entries and right-filled
    with 1 until it holds exactly `max_seq_length` entries.
    '''
    head = token_type[:max_seq_length]
    # Padding positions receive segment id 1, the same value the second sentence uses.
    tail = [1] * (max_seq_length - len(head))
    return head + tail

def split_and_cut(sentence):
    """Strip `sentence`, split it on single spaces and keep the first `max_input_length` pieces."""
    return sentence.strip().split(" ")[:max_input_length]

def convert_list_to_str(token):
    """Concatenate str() of every element of `token` with no separator."""
    return ''.join(map(str, token))
    
def convert_to_int(token):
    """Return a list holding int() of every element of `token`."""
    return list(map(int, token))


def preprocess_data_for_bert(path, max_seq_length=max_input_length, tokenizer=tokenizer):
    '''
    Preprocesses the anli jsonl data for BERT.

    Args:
        path: jsonl file whose records carry 'context', 'hypothesis' and 'label'.
        max_seq_length: length every sequence, attention mask and token-type list is
            cut or padded to.
        tokenizer: tokenizer providing tokenize(), convert_tokens_to_ids() and the
            cls/sep/pad special tokens.

    Returns:
        A DataFrame with one row per example and the columns
        'label', 'sequence', 'attention_mask', 'token_type', 'sentence1', 'sentence2'.
        Each cell holds a numpy array.
    '''
    columns = ['label', 'sequence', 'attention_mask', 'token_type', 'sentence1', 'sentence2']
    rows = []
    for example in create_examples(path):
        # [CLS] premise [SEP] hypothesis [SEP], using the special tokens of the tokenizer
        # that was passed in rather than the notebook-level globals.
        sent1 = [tokenizer.cls_token] + tokenizer.tokenize(example.text_a) + [tokenizer.sep_token]
        sent2 = tokenizer.tokenize(example.text_b) + [tokenizer.sep_token]
        tokens = sent1 + sent2

        # Every helper receives max_seq_length explicitly so the argument is honoured.
        attention_mask = pad_attention_mask([1] * len(tokens), max_seq_length)
        token_type = pad_token_type(
            get_sent1_token_type(sent1) + get_sent2_token_type(sent2), max_seq_length)
        input_ids = tokenizer.convert_tokens_to_ids(
            pad_sequence(tokens, max_seq_length, tokenizer.pad_token))

        rows.append([np.array(label_conversion[example.label]), np.array(input_ids),
                     np.array(attention_mask), np.array(token_type),
                     np.array(sent1), np.array(sent2)])

    # Build the frame once; assigning df.loc[i] per example re-allocates the frame on every row.
    return pd.DataFrame(rows, columns=columns)

# !ls

# Preprocess the R1 training file into a DataFrame of padded BERT inputs.
df_T = preprocess_data_for_bert("./data/anli_v1.0/anli_v1.0/R1/train.jsonl")
# df_T = preprocess_data_for_bert("./data/train.jsonl")

# NOTE(review): this expression sits mid-cell, so Jupyter will not render it; move it to its own cell to see the preview.
df_T.head()

def convert_to_tuples(df):
    '''
    Turn each row of `df` into a (label, sequence, attention_mask, token_type) tuple
    and return the tuples as a list, in row order.
    '''
    return [
        (record['label'], record['sequence'], record['attention_mask'], record['token_type'])
        for _, record in df.iterrows()
    ]

from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

class BertAnliProcessor():
    """Processor for the ANLI data set.

    The training split is preprocessed eagerly in the constructor. The dev and test
    splits are preprocessed the first time their dataloader is requested. Each
    preprocessed split is also written to "<split>_processed.csv" inside `data_dir`.
    """

    def __init__(self, data_dir):
        """
        Args:
            data_dir: directory holding train.jsonl (and dev.jsonl / test.jsonl if the
                matching loaders are used). A trailing slash is optional.
        """
        self.data_dir = data_dir
        self.train_data = self._load_split("train")
        # Filled lazily so that missing dev/test files do not break construction.
        self.dev_data = None
        self.test_data = None

    def _load_split(self, split):
        """Preprocess "<split>.jsonl", save it as "<split>_processed.csv" and return the frame."""
        frame = preprocess_data_for_bert(os.path.join(self.data_dir, split + ".jsonl"))
        frame.to_csv(os.path.join(self.data_dir, split + "_processed.csv"), index=False)
        return frame

    def get_train_dataloader(self):
        """
        Formats the train data into a shuffled DataLoader.
        tuple : (label, sequence, attention_mask, token_type)
        """
        ds = convert_to_tuples(self.train_data)
        loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=True)
        return loader

    def get_dev_dataloader(self):
        """
        Formats the dev data into a DataLoader (loaded on first use, not shuffled).
        tuple : (label, sequence, attention_mask, token_type)
        """
        if self.dev_data is None:
            self.dev_data = self._load_split("dev")
        ds = convert_to_tuples(self.dev_data)
        # Evaluation does not need a random order.
        loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False)
        return loader


    def get_test_dataloader(self):
        """
        Formats the test data into a DataLoader (loaded on first use, not shuffled).
        tuple : (label, sequence, attention_mask, token_type)
        """
        if self.test_data is None:
            self.test_data = self._load_split("test")
        ds = convert_to_tuples(self.test_data)
        # Evaluation does not need a random order.
        loader = DataLoader(ds, batch_size=BATCH_SIZE, shuffle=False)
        return loader

# Build the processor for the R1 directory; its constructor preprocesses train.jsonl
# and writes train_processed.csv into the same directory.
obj = BertAnliProcessor('./data/anli_v1.0/anli_v1.0/R1/')

# Shuffled training DataLoader; later cells iterate this module-level `loader`.
loader = obj.get_train_dataloader()

# Pull a single batch to inspect what the loader yields.
x = next(iter(loader))
# for batch_idx, label, sequence, attention_mask, token_type in enumerate(loader):
#     print(label)
#     print(sequence)
#     print(attention_mask)
#     print(token_type)
#     # print(batch_idx)
#     # print(sequence)
#     # print(attn_mask)
#     # print(token_type)

print(x)


[CLS] [SEP] [PAD] [UNK]
101 102 0 100
512
[tensor([2]), tensor([[  101,  2198,  2726,  5671,  1006,  2089,  1022,  1010, 12522,  1516,
          2255,  2403,  1010,  6166,  1007,  2001,  1037,  9137,  1011,  2301,
          3761,  1010,  5160,  1998,  3648,  2013,  3448,  1012,  2002,  2411,
          3615,  2000,  2044,  1996,  2137,  2942,  2162,  2004,  1000,  3648,
          5671,  1000,  1010,  2130,  2044,  2010,  2602,  2000,  3519,  1012,
          2002,  2001,  1996,  2034,  5542,  1997,  2198,  2940,  1012,   102,
          5671,  2001,  1037,  3648,  1999,  5374,   102,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,    

# Pre-Processing


Converting the dataset into the form required by the pre-trained BERT-Base Model.

In [5]:
# using the same tokenizer used in pre-training
# (`output_hidden_states` is a model option and has no meaning for a tokenizer, so it is not passed here)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [6]:
# defining the maximum length of the input sentences
# `model_max_length` is the tokenizer's documented limit; the cap at 512 guards against
# tokenizer configs that leave the limit unset.
# NOTE(review): confirm that current transformers releases no longer ship `max_model_input_sizes`.
max_input_length = min(tokenizer.model_max_length, 512)
# defining the maximum length of each sentence
max_sentence_length = 128

In [7]:
def tokenize_sentences(sentence):
  """Return the tokens that the notebook-level `tokenizer` produces for `sentence`."""
  return tokenizer.tokenize(sentence)

In [8]:
def reduce_sentence_length(sentence):
  """Strip `sentence`, split it on single spaces and keep the first `max_input_length` pieces."""
  return sentence.strip().split(" ")[:max_input_length]

In [9]:
def trim_sentence(sentence, max_length=None):
  """Keep at most `max_length` whitespace-separated words of `sentence`.

  Args:
    sentence: text to trim; runs of whitespace are collapsed to single spaces.
    max_length: word limit. Defaults to the notebook-level `max_sentence_length`.

  Returns:
    The kept words joined by single spaces.
  """
  if max_length is None:
    max_length = max_sentence_length
  # Slicing alone covers both short and long inputs, so no separate length check
  # (previously hard-coded to 128) is needed.
  return " ".join(sentence.split()[:max_length])

Token type ids tell the model which sentence each token belongs to: tokens of the first sentence in the input get 0, and tokens of the second sentence get 1.

In [10]:
def token_type_ids_sent_01(sentence):
  """Return a list of 0s, one per element of `sentence` (token type ids of sentence 1).

  Returns an empty list when `sentence` has no length (for example None or a float NaN).
  """
  try:
    return [0] * len(sentence)
  except TypeError:
    # len() raises TypeError for unsized objects; other errors are no longer swallowed.
    return []

In [11]:
def token_type_ids_sent_02(sentence):
  """Return a list of 1s, one per element of `sentence` (token type ids of sentence 2).

  Returns an empty list when `sentence` has no length (for example None or a float NaN).
  """
  try:
    return [1] * len(sentence)
  except TypeError:
    # len() raises TypeError for unsized objects; other errors are no longer swallowed.
    return []

The attention mask tells the model which positions hold real tokens and which hold padding added during batch preparation. Before padding, it is simply a sequence of 1s with the same length as the input tokens.

In [12]:
def attention_mask_sentence(sentence):
  """Return a list of 1s, one per element of `sentence` (the unpadded attention mask).

  Returns an empty list when `sentence` has no length (for example None or a float NaN).
  """
  try:
    return [1] * len(sentence)
  except TypeError:
    # len() raises TypeError for unsized objects; other errors are no longer swallowed.
    return []

In [13]:
def combine_sequence(sequence):
  """Join the strings in `sequence` into one space-separated string."""
  separator = " "
  return separator.join(sequence)

def combine_mask(mask):
  """Join str() of every entry of `mask` into one space-separated string."""
  return " ".join(map(str, mask))

In [14]:
# Trim both sentence columns of every split to the maximum sentence length.
for column in ('sentence1', 'sentence2'):
  for frame in (df_train, df_dev, df_test):
    frame[column] = frame[column].apply(trim_sentence)

NameError: name 'df_train' is not defined

In [None]:
# Wrap sentence 1 as "[CLS] ... [SEP] " and append " [SEP]" to sentence 2, for every split.
for frame in (df_train, df_dev, df_test):
  frame['t_sentence1'] = cls_token + ' ' + frame['sentence1'] + ' ' + sep_token + ' '
  frame['t_sentence2'] = frame['sentence2'] + ' ' + sep_token

In [None]:
# applying the BertTokenizer to the newly generated sentences
# Each split is tokenized from its own columns. The dev split previously took
# 'b_sentence2' from df_train, which paired dev premises with training hypotheses.
for frame in (df_train, df_dev, df_test):
  frame['b_sentence1'] = frame['t_sentence1'].apply(tokenize_sentences)
  frame['b_sentence2'] = frame['t_sentence2'].apply(tokenize_sentences)

In [None]:
# Token type ids per sentence: 0s for sentence 1, 1s for sentence 2, for every split.
for column, source, builder in (
    ('sentence1_token_type', 'b_sentence1', token_type_ids_sent_01),
    ('sentence2_token_type', 'b_sentence2', token_type_ids_sent_02),
):
  for frame in (df_train, df_dev, df_test):
    frame[column] = frame[source].apply(builder)

In [None]:
# Obtain the full sequence by concatenating the tokenized sentences of every split.
for frame in (df_train, df_dev, df_test):
  frame['sequence'] = frame['b_sentence1'] + frame['b_sentence2']

In [None]:
# Generate the attention mask (one 1 per sequence token) for every split.
for frame in (df_train, df_dev, df_test):
  frame['attention_mask'] = frame['sequence'].apply(attention_mask_sentence)

In [None]:
# combining the token type of both sentences
# Each split combines its own columns. The dev split previously added df_train's
# sentence-2 ids, giving token types whose length did not match the dev sequences.
for frame in (df_train, df_dev, df_test):
  frame['token_type'] = frame['sentence1_token_type'] + frame['sentence2_token_type']

Dropping the rows with NaN Sequence

In [None]:
# Show the (rows, columns) shape of the dev frame.
df_dev.shape

(491, 12)

In [None]:
# Diagnostic: print every dev 'sequence' value that is not iterable.
# NOTE(review): presumably this surfaces NaN entries — confirm against the data.
from collections.abc import Iterable
testing_sequence = df_dev['sequence'].to_list()
for i in testing_sequence:
  if not isinstance(i, Iterable):
    print(i)

NameError: name 'df_dev' is not defined

In [None]:
# Drop dev rows whose 'sequence' is missing.
# NOTE(review): this rebinds df_dev, so re-running earlier cells will not restore the dropped rows.
df_dev = df_dev.dropna(subset = ['sequence'])
df_dev.shape

(488, 12)

In [None]:
# Turn each token list into one space-separated string so the torchtext Field can re-split it.
for frame in (df_train, df_dev, df_test):
  frame['sequence'] = frame['sequence'].apply(combine_sequence)

NameError: name 'df_train' is not defined

In [None]:
# Serialise each attention mask as a space-separated string of its entries.
for frame in (df_train, df_dev, df_test):
  frame['attention_mask'] = frame['attention_mask'].apply(combine_mask)

In [None]:
# Serialise each token-type list as a space-separated string of its entries.
for frame in (df_train, df_dev, df_test):
  frame['token_type'] = frame['token_type'].apply(combine_mask)

In [None]:
# Keep only the label and the three model-input columns in every split.
required_columns = ['gold_label', 'sequence', 'attention_mask', 'token_type']
df_train = df_train[required_columns]
df_dev = df_dev[required_columns]
df_test = df_test[required_columns]

In [None]:
# Write every processed split to its CSV file, without the index column.
for frame, target in (
    (df_train, 'snli_1.0/snli_1.0_train.csv'),
    (df_dev, 'snli_1.0/snli_1.0_dev.csv'),
    (df_test, 'snli_1.0/snli_1.0_test.csv'),
):
  frame.to_csv(target, index=False)

In [None]:
# List the snli_1.0 directory to check that the CSV files were written.
!ls snli_1.0

Icon		  snli_1.0_dev.jsonl  snli_1.0_test.jsonl  snli_1.0_train.jsonl
README.txt	  snli_1.0_dev.txt    snli_1.0_test.txt    snli_1.0_train.txt
snli_1.0_dev.csv  snli_1.0_test.csv   snli_1.0_train.csv


In [None]:
# Preview the first rows of the processed training frame.
df_train.head()

NameError: name 'df_train' is not defined

In [None]:
# NOTE(review): an earlier cell already defines convert_to_int; this definition replaces it when run.
def convert_to_int(ids):
  """Return a list holding int() of every element of `ids` (attention-mask / token-type values)."""
  return list(map(int, ids))

Create PyTorch Tensor using torchtext field

In [None]:
# importing the saved data from csv file
# The list-valued columns were saved as space-separated strings, so they come back as
# plain text; the torchtext Fields below split and convert them again.
df_train = pd.read_csv('snli_1.0/snli_1.0_train.csv')
df_dev = pd.read_csv('snli_1.0/snli_1.0_dev.csv')
df_test = pd.read_csv('snli_1.0/snli_1.0_test.csv')

In [None]:
from torchtext.legacy import data

In [None]:
# text field for sequence
# No vocabulary is built: the text is split on spaces by reduce_sentence_length and the
# tokens are mapped to ids by the BERT tokenizer.
TEXT = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = reduce_sentence_length,
                  preprocessing = tokenizer.convert_tokens_to_ids,
                  pad_token = pad_token_idx,
                  unk_token = unk_token_idx)
# label field for label 
LABEL = data.LabelField()
# text field for attention mask
# Values are parsed from their space-separated string form into ints; padded with pad_token_idx.
ATTENTION = data.Field(batch_first = True,
                       use_vocab = False,
                       tokenize = reduce_sentence_length,
                       preprocessing = convert_to_int,
                       pad_token = pad_token_idx)
# text field for token type ids
# Padding positions get token type 1.
TTYPE = data.Field(batch_first = True,
                  use_vocab = False,
                  tokenize = reduce_sentence_length,
                  preprocessing = convert_to_int,
                  pad_token = 1)

NameError: name 'data' is not defined

In [None]:
# (attribute name, Field) pairs passed to TabularDataset; the CSV files are written with
# the columns in this same order.
fields = [('label', LABEL), ('sequence', TEXT), ('attention_mask', ATTENTION), ('token_type', TTYPE)]

In [None]:
# Load the three processed CSV files as torchtext datasets, skipping the header row.
train_data, valid_data, test_data = data.TabularDataset.splits(
                                        path = 'snli_1.0',
                                        train = 'snli_1.0_train.csv',
                                        validation = 'snli_1.0_dev.csv',
                                        test = 'snli_1.0_test.csv',
                                        format = 'csv',
                                        fields = fields,
                                        skip_header = True)
# Number of training examples; the epoch-loop cell uses it to compute total_steps.
train_data_len = len(train_data)

NameError: name 'data' is not defined

In [None]:
# building the vocabulary for the labels
# predict_inference reads LABEL.vocab.itos to turn a class index back into a label string.
LABEL.build_vocab(train_data)

NameError: name 'train_data' is not defined

In [None]:
# using bucketiterator for preparing batches for training
# NOTE(review): BATCH_SIZE is rebound here to 16; the first cell of the notebook sets it to 1.
BATCH_SIZE = 16
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Examples are ordered by sequence length via sort_key; batches are placed on `device`.
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_key = lambda x: len(x.sequence),
    sort_within_batch = False, 
    device = device)
    


NameError: name 'data' is not defined

# Model Training


Using the pre-trained Bert_Model

In [None]:
# NOTE(review): BertModel is already imported in the notebook's import cell; this re-import is redundant.
from transformers import BertModel
# output_hidden_states=True is required because BERTNLIModel.forward reads index 2 of the
# encoder output as the per-layer hidden states.
bert_model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Using BERT architecture along with one linear layer for the output prediction

In [None]:
class CLUBv2(nn.Module):  # CLUB: Mutual Information Contrastive Learning Upper Bound
    """Parameter-free CLUB-style regulariser computed from `y_samples` only.

    `x_dim` and `lr` are accepted but not stored, and every method ignores `x_samples`.
    """

    def __init__(self, x_dim, y_dim, lr=1e-3, beta=0):
        super().__init__()
        self.hiddensize = y_dim
        self.version = 2
        self.beta = beta

    def mi_est_sample(self, x_samples, y_samples):
        """Sampled estimate: compare each row of `y_samples` with one randomly drawn row."""
        n_rows = y_samples.shape[0]
        # One partner index per row, drawn with replacement along the first axis.
        partner = torch.randint(n_rows, (n_rows,)).long()

        pos_term = torch.zeros_like(y_samples)
        neg_term = - (y_samples - y_samples[partner]) ** 2 / 2.
        # return upper_bound/2.
        return (pos_term.sum(dim=-1) - neg_term.sum(dim=-1)).mean()

    def mi_est(self, x_samples, y_samples):  # [nsample, 1]
        """Exact estimate: average the squared difference over all pairs of rows."""
        pos_term = torch.zeros_like(y_samples)

        # [1,nsample,dim] - [nsample,1,dim] broadcasts to every pairwise difference.
        pairwise_sq = (y_samples.unsqueeze(0) - y_samples.unsqueeze(1)) ** 2
        neg_term = - pairwise_sq.mean(dim=1) / 2.   # [nsample, dim]
        return (pos_term.sum(dim=-1) - neg_term.sum(dim=-1)).mean()

    def loglikeli(self, x_samples, y_samples):
        """Always 0: there is no learned likelihood in this variant."""
        return 0

    def update(self, x_samples, y_samples, steps=None):
        """Return the sampled estimate scaled by `self.beta`."""
        # no performance improvement, not enabled
        # The annealed value is computed but deliberately unused; the result is
        # always scaled by the un-annealed self.beta.
        if not steps or steps > 1000:
            beta = self.beta
        else:
            beta = self.beta * steps / 1000  # beta annealing

        estimate = self.mi_est_sample(x_samples, y_samples)
        return estimate * self.beta

# Regulariser instance used by the training loop; the loop adds club.update(...) to the
# cross-entropy loss. beta=5e-3 is the weight update() applies to the estimate.
club = CLUBv2(x_dim=10,y_dim = 10, beta=5e-3).to(device)



In [None]:
import torch.nn as nn
from packaging import version


class BERTNLIModel(nn.Module):
    """BERT encoder followed by a single linear classification layer.

    `hidden_dim` is accepted for interface compatibility but is not used.
    """

    def __init__(self, bert_model, hidden_dim, output_dim):
        super().__init__()
        self.bert = bert_model
        # Width of the encoder output, read from the encoder's own config.
        encoder_width = bert_model.config.to_dict()['hidden_size']
        self.out = nn.Linear(encoder_width, output_dim)


    def forward(self, sequence, attn_mask, token_type):
        """Return (logits, first hidden state, last hidden state).

        Index 1 of the encoder output feeds the classifier (presumably BERT's pooled
        output — confirm against the BertModel output ordering). Index 2 must be the
        sequence of per-layer hidden states, so the encoder has to be created with
        output_hidden_states=True.
        """
        encoder_outputs = self.bert(input_ids = sequence, attention_mask = attn_mask, token_type_ids = token_type)
        logits = self.out(encoder_outputs[1])
        layer_states = encoder_outputs[2]  # need to set config.output_hidden = True

        return (logits, layer_states[0], layer_states[-1])
    


In [None]:
# loading the model
# HIDDEN_DIM is passed through to BERTNLIModel, which accepts it but does not use it.
HIDDEN_DIM = 512
# OUTPUT_DIM = len(LABEL.vocab)
# Three classes, matching the three entries of label_conversion.
OUTPUT_DIM = 3
model = BERTNLIModel(bert_model, HIDDEN_DIM, OUTPUT_DIM).to(device)

In [None]:
def count_parameters(model):
  """Return the total number of elements across the parameters of `model` that require gradients."""
  trainable = 0
  for param in model.parameters():
    if param.requires_grad:
      trainable += param.numel()
  return trainable

# Report the trainable-parameter count with thousands separators.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,484,547 trainable parameters


Using NVIDIA Apex, a PyTorch extension for mixed-precision and distributed training

In [None]:
# %%writefile setup.sh

# git clone https://github.com/NVIDIA/apex
# cd apex
# pip install -v --disable-pip-version-check --no-cache-dir ./

In [None]:
# !sh setup.sh

# !pip install torch==1.4+cu100 torchvision==0.5.0+cu100 -f https://download.pytorch.org/whl/torch_stable.html



Defining the loss function and optimizer for our model

In [None]:
# Import only the name this cell uses; a wildcard import hides where names come from.
from transformers.optimization import AdamW
# from apex import amp
import torch.optim as optim
# AdamW over all model parameters, with bias correction switched off.
optimizer = AdamW(model.parameters(),lr=2e-5,eps=1e-6,correct_bias=False)
# def get_scheduler(optimizer, warmup_steps):
#     scheduler = get_constant_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps)
#     return scheduler

In [None]:
# using the cross entropy loss
# It is applied to the classifier logits and the integer class labels.
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
# fp16 = False

# if fp16:
#     try:
#         from apex import amp
#     except ImportError:
#         raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
#     model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

###################################33 ONLINE CODE ########################################
# USE_APEX = True

# import os, sys, shutil
# import time
# import gc
# from contextlib import contextmanager
# from pathlib import Path
# import random
# import numpy as np, pandas as pd
# from tqdm import tqdm, tqdm_notebook

# @contextmanager
# def timer(name):
#     t0 = time.time()
#     yield
#     print(f'[{name}] done in {time.time() - t0:.0f} s')


# if USE_APEX:
#             with timer('install Nvidia apex'):
#                 # Installing Nvidia Apex
#                 os.system('git clone https://github.com/NVIDIA/apex; cd apex; pip install -v --no-cache-dir' + 
#                           ' --global-option="--cpp_ext" --global-option="--cuda_ext" ./')
#                 os.system('rm -rf apex/.git') # too many files, Kaggle fails
#                 from apex import amp
#                 model, optimizer = amp.initialize(model, optimizer, opt_level='O1', verbosity=0)

##########################################################################################


# optimizer = AdamW(model.parameters(),lr=2e-5,eps=1e-6,correct_bias=False)
# # define model as bert model
# model = BERTNLIModel(bert_model, HIDDEN_DIM, OUTPUT_DIM,).to(device)


In [None]:
def accuracy(pred, y):
    """Return, as a 0-d tensor, the fraction of rows of `pred` whose argmax over dim 1 equals `y`."""
    predicted_class = pred.argmax(dim = 1)
    hits = (predicted_class == y).float()
    return hits.sum() / len(y)

In [None]:
# Clipping threshold read by train() in its mixed-precision branch.
max_grad_norm = 1
# Running totals that the manual training loop adds to.
epoch_loss = 0
epoch_acc = 0
# Switch the model to training mode.
model.train()

BERTNLIModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)

In [None]:
# One pass over `loader`. The loss is cross-entropy plus the CLUB regulariser computed
# on the first and last hidden states.
CHECKPOINT_PATH = "infobertr3"
CHECKPOINT_EVERY = 100  # optimisation steps between periodic checkpoints

steps_done = 0
try:
    for batch_idx, samples in enumerate(loader):
        label, sequence, attention_mask, token_type  = samples
        # print(label)
        # print(sequence)
        # print(attention_mask)
        # print(token_type)
        optimizer.zero_grad() # clear gradients first
        # torch.cuda.empty_cache() # releases all unoccupied cached memory
        sequence = sequence.to(device)
        attn_mask = attention_mask.to(device)
        token_type = token_type.to(device)
        label = label.to(device)
        predictions , firststate, laststate = model(sequence, attn_mask, token_type)
        # print(firststate.size())
        # print(laststate.size())
        loss1 = criterion(predictions, label)
        # NOTE(review): CLUBv2.mi_est_sample pairs each row of the first axis with a randomly
        # drawn row. If that axis is the batch and BATCH_SIZE is 1, every row is paired with
        # itself and this term is 0 — confirm the intended batch size.
        loss2= club.update(firststate, laststate)
        loss = loss1 + loss2

        acc = accuracy(predictions, label)

        loss.backward()
        optimizer.step()
        print(loss)
        # scheduler.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        steps_done += 1
        # Writing the full state dict after every step made disk I/O dominate training;
        # checkpoint periodically instead.
        if steps_done % CHECKPOINT_EVERY == 0:
            torch.save(model.state_dict(), CHECKPOINT_PATH)
finally:
    # Also save when the loop ends or is interrupted (e.g. KeyboardInterrupt), but only
    # after at least one step so an early failure cannot overwrite an existing checkpoint.
    if steps_done:
        torch.save(model.state_dict(), CHECKPOINT_PATH)

tensor(0.9201, grad_fn=<AddBackward0>)


KeyboardInterrupt: 

In [None]:

def train(model, iterator, optimizer, criterion, scheduler, mixed_precision=False):
  """Run one training epoch and return (mean batch loss, mean batch accuracy).

  Args:
    model: callable as model(sequence, attn_mask, token_type). It may return the logits
      directly or a tuple whose first item is the logits.
    iterator: iterable of batches exposing .sequence, .attention_mask, .token_type
      and .label.
    optimizer: optimizer stepped once per batch.
    criterion: loss called as criterion(logits, label).
    scheduler: learning-rate scheduler stepped once per batch, or None to skip.
    mixed_precision: when True, scale the loss with NVIDIA Apex `amp` and clip the
      master gradients to the notebook-level `max_grad_norm`. Apex must be installed
      and amp.initialize must already have been applied.

  Raises:
    ValueError: if `iterator` yields no batches.
  """
  epoch_loss = 0
  epoch_acc = 0
  num_batches = 0
  model.train()
  for batch in iterator:
    optimizer.zero_grad() # clear gradients first
    torch.cuda.empty_cache() # releases all unoccupied cached memory
    label = batch.label
    outputs = model(batch.sequence, batch.attention_mask, batch.token_type)
    # BERTNLIModel returns (logits, first_state, last_state); plain logits are accepted too.
    predictions = outputs[0] if isinstance(outputs, (tuple, list)) else outputs
    loss = criterion(predictions, label)
    # Fraction of correct argmax predictions in this batch.
    acc = (predictions.argmax(dim = 1) == label).float().mean()
    if mixed_precision:
      from apex import amp  # optional dependency, needed only on this path
      with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
      torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
    else:
      loss.backward()
    # The step and the bookkeeping run on both paths; they used to sit inside the else
    # branch only, so the mixed-precision path never updated the weights.
    optimizer.step()
    if scheduler is not None:
      scheduler.step()
    epoch_loss += loss.item()
    epoch_acc += acc.item()
    num_batches += 1
  if num_batches == 0:
    raise ValueError("train() received an iterator with no batches")
  return epoch_loss / num_batches, epoch_acc / num_batches

# Model Testing

In [None]:
# def evaluate(model, iterator, criterion):
#     #print(iterator)
#     epoch_loss = 0
#     epoch_acc = 0
#     model.eval()
#     with torch.no_grad():
#         for batch in iterator:
#             sequence = batch.sequence
#             attn_mask = batch.attention_mask
#             token_type = batch.token_type
#             labels = batch.label
#             predictions = model(sequence, attn_mask, token_type)
#             loss = criterion(predictions, labels)
#             acc = accuracy(predictions, labels)
#             epoch_loss += loss.item()
#             epoch_acc += acc.item()
#     return epoch_loss / len(iterator), epoch_acc / len(iterator)

# Number of batches in `loader`.
print(len(loader))

def evaluate(model, data_loader=None, loss_fn=None):
    """Evaluate `model` and return (mean batch loss, mean batch accuracy).

    Args:
        model: callable as model(sequence, attn_mask, token_type). It may return the
            logits directly or a tuple whose first item is the logits.
        data_loader: iterable of batches, each either a
            (label, sequence, attention_mask, token_type) tuple or an object exposing
            those four as attributes. Defaults to the notebook-level `loader`.
        loss_fn: loss called as loss_fn(logits, label). Defaults to the notebook-level
            `criterion`.

    The model is left in eval mode, as before.

    Raises:
        ValueError: if `data_loader` yields no batches.
    """
    if data_loader is None:
        data_loader = loader
    if loss_fn is None:
        loss_fn = criterion
    # Run on whatever device the model's parameters live on.
    first_param = next(iter(model.parameters()), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    num_batches = 0
    with torch.no_grad():
        for samples in data_loader:
            if hasattr(samples, 'sequence'):
                # torchtext-style batch object
                label, sequence, attention_mask, token_type = (
                    samples.label, samples.sequence, samples.attention_mask, samples.token_type)
            else:
                label, sequence, attention_mask, token_type = samples
            sequence = sequence.to(target_device)
            attn_mask = attention_mask.to(target_device)
            token_type = token_type.to(target_device)
            label = label.to(target_device)
            outputs = model(sequence, attn_mask, token_type)
            predictions = outputs[0] if isinstance(outputs, (tuple, list)) else outputs
            loss = loss_fn(predictions, label)
            # Fraction of correct argmax predictions in this batch.
            acc = (predictions.argmax(dim = 1) == label).float().mean()
            epoch_loss += loss.item()
            epoch_acc += acc.item()
            num_batches += 1
    if num_batches == 0:
        raise ValueError("evaluate() received a data loader with no batches")
    return epoch_loss / num_batches, epoch_acc / num_batches

# evaluate the model
# NOTE(review): evaluate(model) iterates the module-level `loader`, which was built from
# get_train_dataloader(); the "Test" figures printed below are therefore measured on
# training data unless `loader` is rebound to a held-out split first.
test_loss, test_acc = evaluate(model)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

        

501
Test Loss: 0.526 | Test Acc: 85.03%


In [None]:
import math
N_EPOCHS = 1

# Warm-up length as a fraction of the total number of optimisation steps.
warmup_percent = 0.2
# train_data_len is set in the TabularDataset cell; this cell fails if that cell has not run.
total_steps = math.ceil(N_EPOCHS * train_data_len * 1./BATCH_SIZE)
warmup_steps = int(total_steps*warmup_percent)
# scheduler = get_scheduler(optimizer, warmup_steps)

best_valid_loss = float('inf')

# NOTE(review): `scheduler` is passed to train() below, but the line that would define it
# is commented out above — define it before running this loop.
# NOTE(review): evaluate is called here with three arguments — check that the evaluate
# definition currently in scope accepts (model, iterator, criterion).
for epoch in range(N_EPOCHS):
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, scheduler)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # Keep the weights of the epoch with the lowest validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'infobert-nli.pt')
    
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

NameError: name 'train_data_len' is not defined

In [None]:
# Restore the weights saved by the training loop. map_location remaps the stored tensors
# to the current device, so a checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load('infobertr3', map_location=device))

# test_loss, test_acc = evaluate(model, test_iterator, criterion)

# print(f'Test Loss: {test_loss:.3f} |  Test Acc: {test_acc*100:.2f}%')


# # make evaluatre function
# def evaluate(model, iterator, criterion):
#     epoch_loss = 0
#     epoch_acc = 0
#     model.eval()
#     with torch.no_grad():
#         for batch in iterator:
#             sequence = batch.sequence
#             attn_mask = batch.attention_mask
#             token_type = batch.token_type
#             labels = batch.label
#             predictions = model(sequence, attn_mask, token_type)
#             loss = criterion(predictions, labels)
#             acc = accuracy(predictions, labels)
#             epoch_loss += loss.item()
#             epoch_acc += acc.item()
#     return epoch_loss / len(iterator), epoch_acc / len(iterator)


def evaluate(model, data_loader=None, loss_fn=None):
    """Evaluate `model` and return (mean batch loss, overall accuracy).

    Args:
        model: callable as model(sequence, attn_mask, token_type). It may return the
            logits directly or a tuple whose first item is the logits.
        data_loader: iterable of batches, each either a
            (label, sequence, attention_mask, token_type) tuple or an object exposing
            those four as attributes. Defaults to the notebook-level `loader`.
        loss_fn: loss called as loss_fn(logits, label). Defaults to the notebook-level
            `criterion`.

    Accuracy is correct predictions divided by the number of examples seen. It is also
    printed. The model is left in eval mode.

    Raises:
        ValueError: if `data_loader` yields no examples.
    """
    if data_loader is None:
        data_loader = loader
    if loss_fn is None:
        loss_fn = criterion
    # Run on whatever device the model's parameters live on.
    first_param = next(iter(model.parameters()), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    model.eval()
    # The running totals are initialised here; they used to be incremented without ever
    # being assigned, which raised UnboundLocalError on the first batch.
    epoch_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0
    with torch.no_grad():
        for samples in data_loader:
            if hasattr(samples, 'sequence'):
                # torchtext-style batch object
                label, sequence, attention_mask, token_type = (
                    samples.label, samples.sequence, samples.attention_mask, samples.token_type)
            else:
                label, sequence, attention_mask, token_type = samples
            sequence = sequence.to(target_device)
            attn_mask = attention_mask.to(target_device)
            token_type = token_type.to(target_device)
            label = label.to(target_device)
            outputs = model(sequence, attn_mask, token_type)
            predictions = outputs[0] if isinstance(outputs, (tuple, list)) else outputs
            correct += (predictions.argmax(dim = 1) == label).sum().item()
            total += label.size(0)
            epoch_loss += loss_fn(predictions, label).item()
            num_batches += 1
    if total == 0:
        raise ValueError("evaluate() received a data loader with no examples")
    overall_accuracy = correct / total
    # Per-step progress is no longer printed: it read the notebook globals `epoch` and
    # `N_EPOCHS`, which exist only after the epoch-loop cell has run.
    print(f'Accuracy of the model on the evaluated set: {overall_accuracy*100:.2f}%')
    return epoch_loss / num_batches, overall_accuracy

# evaluate the model
# NOTE(review): test_iterator is created only in the torchtext BucketIterator cell; this
# call raises NameError if that cell has not run. The call also expects evaluate to
# accept (model, iterator, criterion) and to return a (loss, accuracy) pair.
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')


NameError: name 'test_iterator' is not defined

In [None]:

# function to get the results on custom inputs
def predict_inference(premise, hypothesis, model, device):
    """Predict the NLI label for one premise/hypothesis pair.

    Args:
        premise: premise text.
        hypothesis: hypothesis text.
        model: callable as model(input_ids, attention_mask, token_type_ids). It may
            return the logits directly or a tuple whose first item is the logits.
        device: torch device the input tensors are moved to.

    Returns:
        The label string for the highest-scoring class. When a torchtext `LABEL` field
        with a built vocabulary exists in the notebook, its vocabulary is used.
        Otherwise the index is looked up in `label_conversion` and returned as
        'neutral', 'entailment' or 'contradiction'.
        NOTE(review): which mapping is correct depends on how the loaded weights were
        trained — confirm before relying on the returned string.
    """
    # appending the 'cls' and 'sep' tokens
    premise = cls_token + ' ' + premise + ' ' + sep_token
    hypothesis = hypothesis + ' ' + sep_token

    # tokenize the premise and hypothesis using bert tokenizer
    tokenize_premise = tokenize_sentences(premise)
    tokenize_hypothesis = tokenize_sentences(hypothesis)

    # token type ids: 0 for premise tokens, 1 for hypothesis tokens
    indexes_type = (token_type_ids_sent_01(tokenize_premise)
                    + token_type_ids_sent_02(tokenize_hypothesis))

    # converting the combined sequence of tokens into token ids
    indexes = tokenizer.convert_tokens_to_ids(tokenize_premise + tokenize_hypothesis)

    # attention mask: one 1 per real token (the dedicated helper replaces the token-type helper)
    attention_mask = attention_mask_sentence(indexes)

    # creating the pytorch tensors of indexes, indexes_type, attention_mask
    indexes = torch.LongTensor(indexes).unsqueeze(0).to(device)
    indexes_type = torch.LongTensor(indexes_type).unsqueeze(0).to(device)
    attention_mask = torch.LongTensor(attention_mask).unsqueeze(0).to(device)

    # Predict in eval mode (dropout off) without tracking gradients, then restore the
    # model's previous train/eval mode.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(indexes, attention_mask, indexes_type)
    finally:
        model.train(was_training)

    # BERTNLIModel returns (logits, first_state, last_state); plain logits are accepted too.
    logits = outputs[0] if isinstance(outputs, (tuple, list)) else outputs
    prediction = logits.argmax(dim=-1).item()

    label_field = globals().get('LABEL')
    if label_field is not None and hasattr(label_field, 'vocab'):
        return label_field.vocab.itos[prediction]

    # Fallback for the torchtext-free path: invert label_conversion.
    full_names = {'n': 'neutral', 'e': 'entailment', 'c': 'contradiction'}
    for code, index in label_conversion.items():
        if index == prediction:
            return full_names.get(code, code)
    raise ValueError("predicted class index %d has no known label" % prediction)

In [None]:
# Custom example 1: run a single premise/hypothesis pair through the model.
premise = 'A black race car starts up in front of a crowd of people.'
hypothesis = 'A man is driving down a lonely road.'

predict_inference(premise, hypothesis, model, device)

'neutral'

In [None]:
# Custom example 2.
# NOTE(review): this pair reads as an entailment, yet the recorded output is 'neutral' —
# check the model weights and the label mapping.
premise = 'A soccer game with multiple males playing.'
hypothesis = 'Some men are playing a sport.'

predict_inference(premise, hypothesis, model, device)

'neutral'

In [None]:
# Custom example 3: run a single premise/hypothesis pair through the model.
premise = 'A smiling costumed woman is holding an umbrella.'
hypothesis = 'A happy woman in a fairy costume holds an umbrella.'

predict_inference(premise, hypothesis, model, device)

'neutral'