# Data preparation

All cells from here to the next header must be executed before running any model

In [19]:
import pandas as pd
import numpy as np
df_train=pd.read_csv('train.csv')

In [20]:
df_train = df_train[df_train['language']=='English']

In [21]:
df_test=pd.read_csv('test.csv')
df_test = df_test[df_test['language']=='English'].reset_index()

In [22]:
df_train

Unnamed: 0,id,premise,hypothesis,lang_abv,language,label
0,5130fd2cb5,and these comments were considered in formulat...,The rules developed in the interim were put to...,en,English,0
1,5b72532a0b,These are issues that we wrestle with in pract...,Practice groups are not permitted to work on t...,en,English,2
3,5622f0c60b,you know they can't really defend themselves l...,They can't defend themselves because of their ...,en,English,0
7,fdcd1bd867,From Cockpit Country to St. Ann's Bay,From St. Ann's Bay to Cockpit Country.,en,English,2
8,7cfb3d272c,"Look, it's your skin, but you're going to be i...",The boss will fire you if he sees you slacking...,en,English,1
...,...,...,...,...,...,...
12115,2b78e2a914,The results of even the most well designed epi...,All studies have the same amount of uncertaint...,en,English,2
12116,7e9943d152,But there are two kinds of the pleasure of do...,But there are two kinds of the pleasure of doi...,en,English,0
12117,5085923e6c,The important thing is to realize that it's wa...,"It cannot be moved, now or ever.",en,English,2
12118,fc8e2fd1fe,At the west end is a detailed model of the who...,The model temple complex is at the east end.,en,English,2


In [23]:
def one_hot_target(t):
    v=np.zeros(3,dtype='int')
    v[t]=1
    return v

In [24]:
df_train['label']=df_train['label'].apply(one_hot_target)

In [25]:
df_train['premise'].str.split().apply(len).max()   #longest sequence of tokens in premises

196

In [26]:
df_train['hypothesis'].str.split().apply(len).max()   #longest sequence of tokens in hypothesis

46

In [27]:
(df_train['premise'].str.split().apply(len)+df_train['hypothesis'].str.split().apply(len)).max() #longest premise+hyp sequence

216

In [28]:
maxlen=216  #set max sequence len

In [29]:
len(df_train)*.1

687.0

In [30]:
#Valuation dataset
df_val = df_train.sample(687)

In [31]:
df_train=df_train[~df_train.index.isin(df_val.index)]

In [32]:
df_val = df_val[['premise','hypothesis','label']]

In [33]:
df_train = df_train[['premise','hypothesis','label']].reset_index()

In [34]:
df_val.reset_index(inplace=True)

In [35]:
import torch
import torch.nn as nn
import os
import matplotlib.pyplot as plt
import copy
import torch.optim as optim
import random
import numpy as np
import pandas as pd
from torch.utils.data import DataLoader, Dataset
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModel
from datasets import load_dataset, load_metric
from torch.optim.lr_scheduler import LinearLR

In [36]:
from torch.utils.data import Dataset
class CustomDataset(Dataset):

    def __init__(self, data, maxlen, with_labels=True, bert_model='albert-base-v2'):

        self.data = data  # pandas dataframe
        #Initialize the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(bert_model)  

        self.maxlen = maxlen
        self.with_labels = with_labels 

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):

        # Selecting sentence1 and sentence2 at the specified index in the data frame
        sent1 = str(self.data.loc[index, 'premise'])
        sent2 = str(self.data.loc[index, 'hypothesis'])

        # Tokenize the pair of sentences to get token ids, attention masks and token type ids
        encoded_pair = self.tokenizer(sent1, sent2, 
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,  # Truncate to max_length
                                      max_length=self.maxlen,  
                                      return_tensors='pt')  # Return torch.Tensor objects
        if bert_model == "distilbert-base-uncased":
            token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
            attn_masks = encoded_pair['attention_mask'].squeeze(0)
            if self.with_labels:  # True if the dataset has labels
                label = self.data.loc[index, 'label']
                return token_ids, attn_masks,  label  
            else:
                return token_ids, attn_masks
        else:
            token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
            attn_masks = encoded_pair['attention_mask'].squeeze(0)  # binary tensor with "0" for padded values and "1" for the other values
            token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor with "0" for the 1st sentence tokens & "1" for the 2nd sentence tokens

        if self.with_labels:  # True if the dataset has labels
            label = self.data.loc[index, 'label']
            return token_ids, attn_masks, token_type_ids, label  
        else:
            return token_ids, attn_masks, token_type_ids

In [37]:
def set_seed(seed):
    """ Set all seeds to make results reproducible """
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    

def evaluate_loss(net, device, criterion, dataloader):
    net.eval()

    mean_loss = 0
    count = 0

    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            count += 1

    return mean_loss / count

In [38]:
#our metric
def accuracy(net, device, dataloader):
    net.eval()

    right = 0
    count = 0

    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            
            logits = net(seq, attn_masks, token_type_ids)
            pred = torch.argmax(logits,dim=-1)
            
            right += torch.sum(pred==torch.argmax(labels,dim=-1)).item()                 #number of correct predictions in the batch
            count += seq.size()[0]                         #batch size

    return right / count

In [46]:
#set some useful variables
device = torch.device("cuda:0")
bert_model = "albert-base-v2"  # 'albert-base-v2', 'albert-large-v2', 'albert-xlarge-v2', 'albert-xxlarge-v2', 'bert-base-uncased', ...
maxlen = 216  # maximum length of the tokenized input sentence pair : if greater than "maxlen", the input is truncated and else if smaller, the input is padded
bs = 64  # batch size
iters_to_accumulate = 1  # the gradient accumulation adds gradients over an effective batch of size : bs * iters_to_accumulate. If set to "1", you get the usual batch size
lr = 2e-5  # learning rate
epochs = 4  # number of training epochs

# The Bert models

All code from here to the next header must be executed before running one of the Bert models

In [43]:
def train_bert(net, criterion, opti, lr, train_loader, val_loader, epochs):

    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the training loss 5 times per epoch
    iters = []
    train_losses = []
    val_losses = []

    scaler = GradScaler()

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):
            opti.zero_grad()
            # Convert to cuda 
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
    
            
            logits = net(seq, attn_masks, token_type_ids)

           
            loss = criterion(logits.squeeze(-1), labels.float())
            
            loss.backward()
            opti.step()
            
            
            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                running_loss = 0.0


        val_loss = evaluate_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    # Saving the model
    path_to_model='models{}_fineTune_lr_{}_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()

In [41]:
class SentencePairClassifier(nn.Module):

    def __init__(self, bert_model="albert-base-v2", freeze_bert=False):
        super(SentencePairClassifier, self).__init__()
        #  Instantiating BERT-based model object
        self.bert_layer = AutoModel.from_pretrained(bert_model,output_hidden_states = True)

        #  Fix the hidden-state size of the encoder outputs (If you want to add other pre-trained models here, search for the encoder output size)
        if bert_model == "albert-base-v2":  # 12M parameters
            hidden_size = 768
        elif bert_model == "albert-large-v2":  # 18M parameters
            hidden_size = 1024
        elif bert_model == "albert-xlarge-v2":  # 60M parameters
            hidden_size = 2048
        elif bert_model == "albert-xxlarge-v2":  # 235M parameters
            hidden_size = 4096
        elif bert_model == "bert-base-uncased": # 110M parameters
            hidden_size = 768

        # Freeze bert layers and only train the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        # Classification layer
        self.cls_layer = nn.Linear(hidden_size, 3)

        self.dropout = nn.Dropout(p=0.1)

    @autocast()  # run in mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        '''

        # Feeding the inputs to the BERT-based model to obtain contextualized representations
        pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids)['hidden_states'][-1][:,0,:]
            

        
        logits = self.cls_layer(self.dropout(pooler_output))

        return logits

# Albert with fine tuning

In [None]:
#Run this for training

#  Set all seeds to make reproducible results
set_seed(1)
freeze_bert = False  # if True, freeze the encoder weights and only update the classification layer weights
# training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)

#  training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)

#sorry for this, but was never working when one GPU
net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)

net = nn.DataParallel(net, device_ids = [0,1,2,3])

net.to(device)

criterion = nn.CrossEntropyLoss()
opti = optim.Adam(net.parameters(), lr=lr, weight_decay=1e-4)


train_bert(net, criterion, opti, lr, train_loader, val_loader, epochs, iters_to_accumulate)

We can see that it overfits very quickly, with the training loss continuously decreasing while the validation loss reaches a minimum after 2 epochs and then increasing

In [42]:
accuracy(net, device, train_loader)

100%|███████████████████████████████████████████████████████████████████████████████████| 97/97 [00:24<00:00,  4.00it/s]


0.9123402878861394

In [43]:
accuracy(net, device, val_loader)

100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [00:03<00:00,  3.64it/s]


0.7176128093158661

Note : these figures are not for the best model but after 4 epochs, when it is already largely overfitting

Not so bad!

To run our best model use this code

In [47]:

set_seed(1)
freeze_bert = False  # if True, freeze the encoder weights and only update the classification layer weights
# training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)

#  training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)

#sorry for this, but was never working when one GPU
net = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)

net = nn.DataParallel(net, device_ids = [0,1,2,3])

net.to(device)

criterion = nn.CrossEntropyLoss()
opti = optim.Adam(net.parameters(), lr=lr, weight_decay=1e-4)
net.load_state_dict(torch.load('modelsalbert-base-v2_fineTune_lr_2e-05_val_loss_0.6161_ep_2.pt'))

Reading training data...
Reading validation data...


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.bias', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [48]:
accuracy(net, device, val_loader)

100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [00:05<00:00,  2.00it/s]


0.86608442503639

Impressive! And it is the smallest...

In [106]:
#We check the number of trainable parameters

model_parameters = filter(lambda p: p.requires_grad, model.parameters())
params = sum([np.prod(p.size()) for p in model_parameters])

In [107]:
params

11685891

# Albert without fine tuning

In [39]:
device = torch.device("cpu")
freezebert=True
net2 = SentencePairClassifier(bert_model, freeze_bert=freeze_bert)

net2.to(device)

#  Set all seeds to make reproducible results
set_seed(1)

# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)
# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=bs, num_workers=5)
val_loader = DataLoader(val_set, batch_size=bs, num_workers=5)






criterion = nn.CrossEntropyLoss()
opti = optim.Adam(net2.parameters(), lr=lr, weight_decay=1e-4)


train_bert(net2, criterion, opti, lr, train_loader, val_loader, epochs)

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.bias', 'predictions.decoder.bias', 'predictions.dense.weight', 'predictions.decoder.weight', 'predictions.LayerNorm.bias', 'predictions.LayerNorm.weight', 'predictions.dense.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Reading training data...
Reading validation data...


 20%|████████████████▎                                                                  | 19/97 [07:32<30:18, 23.31s/it]


Iteration 19/97 of epoch 1 complete. Loss : 1.0994238978938053 


 39%|████████████████████████████████▌                                                  | 38/97 [14:50<22:47, 23.18s/it]


Iteration 38/97 of epoch 1 complete. Loss : 0.9854506787500883 


 59%|████████████████████████████████████████████████▊                                  | 57/97 [22:11<15:39, 23.48s/it]


Iteration 57/97 of epoch 1 complete. Loss : 0.9258737783682974 


 78%|█████████████████████████████████████████████████████████████████                  | 76/97 [29:29<08:07, 23.22s/it]


Iteration 76/97 of epoch 1 complete. Loss : 0.8104557457723116 


 98%|█████████████████████████████████████████████████████████████████████████████████▎ | 95/97 [36:43<00:47, 23.54s/it]


Iteration 95/97 of epoch 1 complete. Loss : 0.8107262975291202 


100%|███████████████████████████████████████████████████████████████████████████████████| 97/97 [37:23<00:00, 23.13s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [01:27<00:00,  7.98s/it]



Epoch 1 complete! Validation Loss : 0.7098227522589944
Best validation loss improved from inf to 0.7098227522589944



 20%|████████████████▎                                                                  | 19/97 [06:34<26:28, 20.36s/it]


Iteration 19/97 of epoch 2 complete. Loss : 0.7501355817443446 


 39%|████████████████████████████████▌                                                  | 38/97 [13:30<21:32, 21.91s/it]


Iteration 38/97 of epoch 2 complete. Loss : 0.6424448207805031 


 59%|████████████████████████████████████████████████▊                                  | 57/97 [20:22<14:38, 21.96s/it]


Iteration 57/97 of epoch 2 complete. Loss : 0.6007815677868692 


 78%|█████████████████████████████████████████████████████████████████                  | 76/97 [27:30<07:47, 22.27s/it]


Iteration 76/97 of epoch 2 complete. Loss : 0.5194309824391415 


 98%|█████████████████████████████████████████████████████████████████████████████████▎ | 95/97 [34:38<00:45, 22.62s/it]


Iteration 95/97 of epoch 2 complete. Loss : 0.564765000029614 


100%|███████████████████████████████████████████████████████████████████████████████████| 97/97 [35:14<00:00, 21.80s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [01:25<00:00,  7.80s/it]



Epoch 2 complete! Validation Loss : 0.6291261315345764
Best validation loss improved from 0.7098227522589944 to 0.6291261315345764



 20%|████████████████▎                                                                  | 19/97 [06:35<27:00, 20.77s/it]


Iteration 19/97 of epoch 3 complete. Loss : 0.48355331703236226 


 39%|████████████████████████████████▌                                                  | 38/97 [13:32<21:23, 21.76s/it]


Iteration 38/97 of epoch 3 complete. Loss : 0.4110707248512067 


 59%|████████████████████████████████████████████████▊                                  | 57/97 [20:33<14:19, 21.48s/it]


Iteration 57/97 of epoch 3 complete. Loss : 0.3717924395674153 


 78%|█████████████████████████████████████████████████████████████████                  | 76/97 [27:32<07:32, 21.57s/it]


Iteration 76/97 of epoch 3 complete. Loss : 0.32454180795895426 


 98%|█████████████████████████████████████████████████████████████████████████████████▎ | 95/97 [34:24<00:42, 21.30s/it]


Iteration 95/97 of epoch 3 complete. Loss : 0.3859226687958366 


100%|███████████████████████████████████████████████████████████████████████████████████| 97/97 [35:02<00:00, 21.68s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [01:26<00:00,  7.86s/it]



Epoch 3 complete! Validation Loss : 0.676251389763572


 20%|████████████████▎                                                                  | 19/97 [06:47<27:31, 21.17s/it]


Iteration 19/97 of epoch 4 complete. Loss : 0.3233075690896888 


 39%|████████████████████████████████▌                                                  | 38/97 [13:38<21:32, 21.91s/it]


Iteration 38/97 of epoch 4 complete. Loss : 0.2620254504053216 


 59%|████████████████████████████████████████████████▊                                  | 57/97 [20:19<13:55, 20.88s/it]


Iteration 57/97 of epoch 4 complete. Loss : 0.26093638570685135 


 78%|█████████████████████████████████████████████████████████████████                  | 76/97 [27:18<07:48, 22.32s/it]


Iteration 76/97 of epoch 4 complete. Loss : 0.30649037345459584 


 98%|█████████████████████████████████████████████████████████████████████████████████▎ | 95/97 [34:19<00:44, 22.08s/it]


Iteration 95/97 of epoch 4 complete. Loss : 0.3368364352928965 


100%|███████████████████████████████████████████████████████████████████████████████████| 97/97 [34:58<00:00, 21.63s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [01:31<00:00,  8.36s/it]



Epoch 4 complete! Validation Loss : 0.789167192849246
The model has been saved in modelsalbert-base-v2_lr_2e-05_val_loss_0.62913_ep_2.pt


Ridicuously slow on the CPU, but all GPU were busy...

In [60]:
#Load the best model that we have trained without fine-tuning
model2 = SentencePairClassifier(freeze_bert=True)
model2.load_state_dict(torch.load('modelsalbert-base-v2_lr_2e-05_val_loss_0.62913_ep_2.pt'))

Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.LayerNorm.bias', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.bias', 'predictions.decoder.weight']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

In [63]:
accuracy(model2,'cpu', train_loader)

100%|███████████████████████████████████████████████████████████████████████████████████| 97/97 [13:17<00:00,  8.22s/it]


0.8272683163512858

In [64]:
accuracy(model2,'cpu', val_loader)

100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [01:30<00:00,  8.20s/it]


0.8326055312954876

That is huge

In [108]:
#We check the number of trainable parameters

model2_parameters = filter(lambda p: p.requires_grad, model2.parameters())
params2 = sum([np.prod(p.size()) for p in model2_parameters])

In [109]:
params2

2307

We have a look at examples that it is failing to predict

In [71]:
val_load_all=DataLoader(val_set, batch_size=687, num_workers=5)

In [None]:
model2.eval()
with torch.no_grad():
    for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(val_load_all)):
        pred = model2(seq,attn_masks,token_type_ids)

In [79]:
predic=torch.argmax(pred,dim=-1)

In [81]:
df_val['pred']=predic.numpy()

In [84]:
df_val['true'] = df_val['label'].apply(np.argmax)

In [87]:
wrong_predic=df_val[df_val['true']!=df_val['pred']]

In [99]:
pd.set_option('display.max_colwidth', 1000)

In [101]:
wrong_predic

Unnamed: 0,index,premise,hypothesis,label,pred,true
4,3079,"No, I don't know.","I don't know what she said, no.","[0, 1, 0]",0,1
11,6992,"According to the Natural Resources Conservation Service, this single, voluntary program will provide flexible technical, financial, and educational assistance to farmers and ranchers who face serious threats to soil, water, and related natural resources on agricultural and other lands, including grazing lands, wetlands, forest lands, and wildlife habitats.",Farmers and ranchers must have all of their licenses and permits to qualify.,"[0, 1, 0]",2,1
12,243,well they're so close to an undefeated undefeated season they can taste it and they wanna make history so i don't think they're gonna lack for motivation,"Unless they suffer any losses, they'll remain motivated.","[0, 1, 0]",2,1
21,11207,"Since there is no airport on the island, all visitors must arrive at the port, Skala, where most of the hotels are located and all commercial activity is carried out.",The best way to get onto the island is by plane.,"[0, 0, 1]",1,2
27,6088,"Yes, it does, admitted Tuppence.",Tuppence wasn't very happy about admitting it did.,"[0, 1, 0]",2,1
...,...,...,...,...,...,...
647,5141,and the same is true of the drug hangover you know if you,It's just like a drug hangover but worse.,"[0, 1, 0]",2,1
651,4706,"Sit down, will you?"" Tuppence sat down on the chair facing him.",He asked Tuppence to sit on a red chair.,"[0, 1, 0]",2,1
668,11686,"Under Deng Xiaoping, Beijing actively sought to cultivate a good bilateral relationship.",Beijing sought to create a good relationship with Hong Kong.,"[0, 1, 0]",2,1
669,3336,The sacred is not mysterious to her.,The woman does not know anything sacred.,"[0, 0, 1]",0,2


# Albert + URN

Base model: the (Al)Bert embedding for each word is projected in k(=105) dimensions, before the sentence is coded through the multiplication of the corresponding orthogonal matrices. The coding of both sentences are then concatenated. One or several linear layers classify the output. (Plus: simple ; Minus: what it is learning is basically just another way of encoding a sentence from the same tokens encoding. I am not sure the nice properties of the URN are anyway involved)

In addition the data section, all the cells below should be executed before training or importing any of the Albert+URN models

In [25]:
device = torch.device("cuda:0")

In [49]:
import math
class UnitaryRNN(nn.Module):
    def __init__(self, embedding_size):
        super().__init__()
        self.embedding_size = embedding_size
        n = math.ceil(.5+math.sqrt(2*self.embedding_size+.25))   #we need n(n-1)/2 >= embedding_size
        self.n = n                                               #in fact it is now implemented in such a way that we need
                                                                 #the expression in paranthesis to be an integer
        # for creating the upper tringulars
        self.ix_mat = torch.zeros(n,n).long()
        for i in range(0,n):
            for j in range(i+1,n):
                self.ix_mat[i,j] = (i* (2*n - i - 3))//2 + j - 1 + 1

    def forward(self, text):

        device = text.device
        x = torch.cat([torch.zeros(text.shape[:-1]).to(device).unsqueeze(-1), text], dim=-1)
        tri = torch.index_select(x, -1, self.ix_mat.flatten().to(device)).reshape((*text.shape[:-1],self.n,self.n))
        tri = tri - tri.transpose(-2, -1)
        exp_mat = torch.matrix_exp(tri)
        
        h = torch.zeros((text.shape[0],self.n)).to(device)
        h[:,0] = 1
        #print('h_0',h.size())
        for i in range(text.shape[1]):
            h = torch.einsum('bij,bj->bi',exp_mat[:,i,:,:], h)  #batch matrices multiplication
            #print('h',h.size())
        return h

In [50]:
device = torch.device("cuda:0")
def train(net, criterion, opti, lr, train_loader, val_loader, epochs,device=device):

    best_loss = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the training loss 5 times per epoch
    iters = []
    train_losses = []
    val_losses = []

    scaler = GradScaler()

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):
            opti.zero_grad()
            # Converting to cuda tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
    
            
            logits = net(seq, attn_masks, token_type_ids)

            # Computing loss
            
            loss = criterion(logits.view((labels.size()[0],-1)), labels.float())
            
            loss.backward()
            opti.step()
            
                        


            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                running_loss = 0.0


        val_loss = eval_loss(net, device, criterion, val_loader)  # Compute validation loss
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))

        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    # Saving the model
    path_to_model='models{}_Alb_URN_val_loss_{}_ep_{}.pt'.format(bert_model, lr, round(best_loss, 5), best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()

In [51]:
def eval_loss(net, device, criterion, dataloader):
    net.eval()

    mean_loss = 0
    count = 0

    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.view((labels.size()[0],-1)), labels.float()).item()
            count += 1

    return mean_loss / count

In [52]:
class URN_Albert_base_NL(nn.Module):

    def __init__(self, bert_model="albert-base-v2", freeze_bert=False,embedding_size=105):
        super(URN_Albert_base_NL, self).__init__()
        self.embedding_size = embedding_size
        #  Instantiating BERT-based model object
        self.bert_layer = AutoModel.from_pretrained(bert_model,output_hidden_states = True)

        
        if bert_model == "albert-base-v2":  # 12M parameters
            hidden_size = 768
        elif bert_model == "albert-large-v2":  # 18M parameters
            hidden_size = 1024
        elif bert_model == "albert-xlarge-v2":  # 60M parameters
            hidden_size = 2048
        elif bert_model == "albert-xxlarge-v2":  # 235M parameters
            hidden_size = 4096
        elif bert_model == "bert-base-uncased": # 110M parameters
            hidden_size = 768

        # Freeze bert layers and only train the projection and the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        #URN layer
        self.urn_layer = UnitaryRNN(self.embedding_size)
        
        #sigmoid
        self.sigmoid=nn.Sigmoid()
        
        #Projection layers from dim 768 to 105
        self.lin1 = nn.Linear(768, 64)
        self.proj = nn.Linear(64,105)
        
        # Classification layer
        self.cls_layer = nn.Linear(30, 3)

        self.dropout = nn.Dropout(p=0.1)

    @autocast()  # run in mixed precision (a trick to try and spare time)
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        '''
        device = input_ids.device
        bs = input_ids.size()[0]  #batch size
        end_sentence1 = torch.argmax(token_type_ids,dim=1)-2 #indice of the last word of the first sentence(that is not CLS)
        #print(end_sentence1)
        end_sentence2 = -torch.argmax(torch.flip(token_type_ids,dims=(1,)),dim=1)+214 #trick to get the last one
        pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids)['hidden_states'][-1]
        lis_sent1= []
        lis_sent2= []
        for i in range(bs):          #loop on the batch for slicing and padding
            sent1 = pooler_output[i,1:end_sentence1[i],:] #get the sentence 1 without separators Bert encoding of each word in 768 dim
            sent1_padded = torch.cat((sent1,torch.zeros((220-sent1.size()[0],sent1.size()[1]),device=device)),dim=0)
            sent2 = pooler_output[i,(end_sentence1[i]+3):end_sentence2[i],:]
            sent2_padded = torch.cat((sent2,torch.zeros((64-sent2.size()[0],sent2.size()[1]),device=device)),dim=0)
            lis_sent1.append(sent1_padded)
            lis_sent2.append(sent2_padded)
        
        sent1 = torch.stack(lis_sent1)  #batch of padded Bert encoded sentences
        sent2 = torch.stack(lis_sent2)
        # for the URN layer to work on batches we need to pad with 0s sent1 and sent2
        sent1 = self.sigmoid(self.lin1(sent1))
        sent1_red = self.proj(sent1)  #size (batch_size, 220, 105)
        #sent1_red = torch.cat((sent1_red,torch.zeros((bs,220-sent1_red.size()[1],sent1_red.size()[2]),device=device)),dim=1) #we pad to the longest premisse 196 tokens
        sent2 = self.sigmoid(self.lin1(sent2))
        sent2_red = self.proj(sent2)  #size (batch_size, 64, 105)
        
        sent1_red =self.urn_layer(sent1_red)   #size (batch_size, 15)
        sent2_red =self.urn_layer(sent2_red)
        
        pooler_output = torch.cat((sent1_red,sent2_red),dim=1)  #concatenate the outputs of the URN
        
             
        logits = self.cls_layer(self.dropout(pooler_output))
        
        return logits

# Albert + URN naive

In [None]:
#run this to train a new model
device = torch.device("cuda:0")
#  Set all seeds to make reproducible results
set_seed(1)

# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)

# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=64, num_workers=4)
val_loader = DataLoader(val_set, batch_size=64, num_workers=4)

net11 = URN_Albert_base_NL()

net11 = nn.DataParallel(net11, device_ids = [0,1,2,3])

net11.to(device)

criterion = nn.CrossEntropyLoss()
opti = optim.Adam(net11.parameters(), lr=1e-5, weight_decay=1e-4)
num_warmup_steps = 0 # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader)  # The total number of training steps


train(net11, criterion, opti, lr, train_loader, val_loader, epochs)


Some learning happens!

In [53]:
#run this to load the best model
device = torch.device("cuda:0")
#  Set all seeds to make reproducible results
set_seed(1)

# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)

# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=64, num_workers=4)
val_loader = DataLoader(val_set, batch_size=64, num_workers=4)

net11 = URN_Albert_base_NL()

net11 = nn.DataParallel(net11, device_ids = [0,1,2,3])

net11.to(device)
net11.load_state_dict(torch.load('modelsalbert-base-v2_Alb_URN_val_loss_2e-05_ep_0.7852.pt'))

Reading training data...
Reading validation data...


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.bias', 'predictions.decoder.bias', 'predictions.dense.bias', 'predictions.dense.weight', 'predictions.LayerNorm.weight', 'predictions.decoder.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

# The Model Albert+URN refined

(one of) the problems of training the URN on such a database is that numerous words will never be encountered in the training set. Nevertheless we would like our model to use the cosine similarity of these new worlds with some known ones to generalise the heuristic it may have learned. For example, if the model has learned that in the pair : "All birds fly. Some bird fly." the second is consequence of the first, we want it to identify the new hypothesis : "all ravens fly." as a rightful consequence of the same premisse.
One way to do this would be to encode all words of the consequence but one in orthogonal matrices. so that the encoded sentence would become $O_1,..O_{k-1},w_k,O_{k+1}...O_n$ where O_i are orthogonal matrices, w_k is a vector, n is the number of words in the sentence and k is the position of the untouched (not transformed in an ortho matrix) word in the sentence. Then, to benefit from the property that 2 vectors v and v' $<\prod_{i=k+1}^{n}O_i v,\prod_{i=k+1}^{n}O_i v'> = <v,v'>$ (In reality we will need $t(\prod_{i=k+1}^{n}O_i v)t(\prod_{i=1}^{k-1}O_i)$ to process each word both by the left and the right side of the sentence, but the idea stays the same) w_k needs to be projected onto a space of the same dimension as the orthogonal matrices

In [26]:
#input: batch of sentences (words embedded in 105 dim - text) + batch of of sentences (with words embedded in 15 dim - h)
#output:for each word (in position i) in the sentence, returns $\Pi_{i+1}^n O_i v_i$. In words, the ith word is 'processed through
#the right side of the sentence.

import math
class UnitaryRNNright(nn.Module):
    def __init__(self, embedding_size):
        super().__init__()
        self.embedding_size = embedding_size
        n = math.ceil(.5+math.sqrt(2*self.embedding_size+.25))   #we need n(n-1)/2 >= embedding_size
        self.n = n                                               #in fact it is now implemented in such a way that we need
                                                                 #the expression in paranthesis to be an integer
        # for creating the upper tringulars
        self.ix_mat = torch.zeros(n,n).long()
        for i in range(0,n):
            for j in range(i+1,n):
                self.ix_mat[i,j] = (i* (2*n - i - 3))//2 + j - 1 + 1

    def forward(self, text, h):

        device = text.device
        x = torch.cat([torch.zeros(text.shape[:-1]).to(device).unsqueeze(-1), text], dim=-1)
        tri = torch.index_select(x, -1, self.ix_mat.flatten().to(device)).reshape((*text.shape[:-1],self.n,self.n))
        tri = tri - tri.transpose(-2, -1)
        exp_mat = torch.matrix_exp(tri)
        list_results=[]
        for i in range (text.shape[1]):
            h0 = torch.clone(h[:,i,:]).detach()
            for j in range(i+1,text.shape[1]):
                h0 = torch.einsum('bij,bj->bi',exp_mat[:,j,:,:], h0)  #batch matrices multiplication
                #print('h',h.size())
            list_results.append(h0)
            
        return torch.stack(list_results,dim=1)  #(batch_size,sentence_len, 15)
        

In [27]:
#input: batch of sentences (words embedded in 105 dim - text) + batch of of sentences (with words embedded in 15 dim - h)
#output:for each word (in position i) in the sentence, returns $\t(v_i)t(Pi_0^{i-1}) O_i$. In words, the ith word is 'processed through
#the leftt side of the sentence.

import math
class UnitaryRNNleft(nn.Module):
    def __init__(self, embedding_size):
        super().__init__()
        self.embedding_size = embedding_size
        n = math.ceil(.5+math.sqrt(2*self.embedding_size+.25))   #we need n(n-1)/2 >= embedding_size
        self.n = n                                               #in fact it is now implemented in such a way that we need
                                                                 #the expression in paranthesis to be an integer
        # for creating the upper tringulars
        self.ix_mat = torch.zeros(n,n).long()
        for i in range(0,n):
            for j in range(i+1,n):
                self.ix_mat[i,j] = (i* (2*n - i - 3))//2 + j - 1 + 1

    def forward(self, text, h):

        device = text.device
        x = torch.cat([torch.zeros(text.shape[:-1]).to(device).unsqueeze(-1), text], dim=-1)
        tri = torch.index_select(x, -1, self.ix_mat.flatten().to(device)).reshape((*text.shape[:-1],self.n,self.n))
        tri = tri - tri.transpose(-2, -1)
        exp_mat = torch.matrix_exp(tri)
        list_results=[]
        for i in range (text.shape[1]):
            h0 = torch.clone(h[:,i,:]).detach()
            for j in range(i-1,-1,-1):
                h0 = torch.einsum('bij,bj->bi',exp_mat[:,j,:,:], h0)  #batch matrices multiplication
                    #in fact we retrun the transposed vectors, which is the same 
            list_results.append(h0)
            
        return torch.stack(list_results,dim=1)  #(batch_size,sentence_len, 15)
                                                
        

In [26]:
# A first try that fails to learn
class AlbURN_NL(nn.Module):

    def __init__(self, bert_model="albert-base-v2", freeze_bert=False,embedding_size=105):
        super(AlbURN_NL, self).__init__()
        self.embedding_size = embedding_size
        #  Instantiating BERT-based model object
        self.bert_layer = AutoModel.from_pretrained(bert_model,output_hidden_states = True)

        
        if bert_model == "albert-base-v2":  # 12M parameters
            hidden_size = 768
        elif bert_model == "albert-large-v2":  # 18M parameters
            hidden_size = 1024
        elif bert_model == "albert-xlarge-v2":  # 60M parameters
            hidden_size = 2048
        elif bert_model == "albert-xxlarge-v2":  # 235M parameters
            hidden_size = 4096
        elif bert_model == "bert-base-uncased": # 110M parameters
            hidden_size = 768

        # Freeze bert layers and only train the projection and the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        #URN layer
        self.urn_right = UnitaryRNNright(self.embedding_size)
        self.urn_left = UnitaryRNNleft(self.embedding_size)
        
        #sigmoid
        self.sigmoid=nn.Sigmoid()
        #Projection layer from dim 768 to 105
        self.lin1 = nn.Linear(768, 64)
        
        
        #Projection layer from dim 768 to 105
        
        self.proj1 = nn.Linear(64,105) #get the text input
        
        self.proj2 = nn.Linear(64,15) #get the h input for the right URN
        
        # Classification layer
        # this time we need to classify based on a (batch_size, sent_len, 15) tensor
        # our hope is that at least one of the sent_len vectors is very informative in relation to the classification result
        # but we don't know which one... Maybe here an attention layer could answer the question? (next model)
        # for now: flatten + linear
        
        self.flatten = nn.Flatten()
        self.cls_layer = nn.Linear(284*15, 3)

        self.dropout = nn.Dropout(p=0.1)

    @autocast()  # run in mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        '''
        device = input_ids.device
        bs = input_ids.size()[0]  #batch size
        end_sentence1 = torch.argmax(token_type_ids,dim=1)-2 #indice of the last word of the first sentence(that is not CLS)
        #print(end_sentence1)
        end_sentence2 = -torch.argmax(torch.flip(token_type_ids,dims=(1,)),dim=1)+214 #trick to get the last one
        pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids)['hidden_states'][-1]
        lis_sent1= []
        lis_sent2= []
        for i in range(bs):          #loop on the batch for slicing and padding
            sent1 = pooler_output[i,1:end_sentence1[i],:] #get the sentence 1 without separators Bert encoding of each word in 768 dim
            sent1_padded = torch.cat((sent1,torch.zeros((220-sent1.size()[0],sent1.size()[1]),device=device)),dim=0)
            sent2 = pooler_output[i,(end_sentence1[i]+3):end_sentence2[i],:]
            sent2_padded = torch.cat((sent2,torch.zeros((64-sent2.size()[0],sent2.size()[1]),device=device)),dim=0)
            lis_sent1.append(sent1_padded)
            lis_sent2.append(sent2_padded)
        
        sent1 = torch.stack(lis_sent1)  #batch of padded Bert encoded sentences
        sent2 = torch.stack(lis_sent2)
        # for the URN layer to work on batches we need to pad with 0s sent1 and sent2
        sent1 = self.lin1(sent1)
        text1 = self.sigmoid(sent1)
        text1 = self.proj1(text1)  #size (batch_size, 220, 105)
        #sent1_red = torch.cat((sent1_red,torch.zeros((bs,220-sent1_red.size()[1],sent1_red.size()[2]),device=device)),dim=1) #we pad to the longest premisse 196 tokens
        sent2 = self.lin1(sent2)
        text2 = self.sigmoid(sent2)
        text2 = self.proj1(text2)  #size (batch_size, 64, 105)
        #sent2_red = torch.cat((sent2_red,torch.zeros((bs,64-sent2_red.size()[1],sent2_red.size()[2]),device=device)),dim=1)
        h1 = self.proj2(sent1)  #(bs,220,15)
        h2 = self.proj2(sent2)  #(bs,64,15)
        
        sent1_right_proc =self.urn_right(text1, h1)   #size (batch_size, 15)
        #print('sent1_red',sent1_red.size())
        sent2_right_proc =self.urn_right(text2, h2)
        sent1_proc = self.urn_left(text1, sent1_right_proc) #(bs, 220, 15)
        sent2_proc = self.urn_left(text2, sent2_right_proc) #(bs, 64, 15)
        pooler_output = torch.cat((sent1_proc,sent2_proc),dim=1)  #concatenate the outputs of the URN (bs, 284,15)
        
        pooler_output = self.flatten(pooler_output)
        #print('pooler_out',pooler_output.size())

        
        logits = self.cls_layer(self.dropout(pooler_output))
        
        return logits

In [28]:
#Second version - we disconnect the learning for h and for the orthogonal matrices
#This one learns but slowly
class AlbURN_NL_sep(nn.Module):

    def __init__(self, bert_model="albert-base-v2", freeze_bert=False,embedding_size=105):
        super(AlbURN_NL_sep, self).__init__()
        self.embedding_size = embedding_size
        #  Instantiating BERT-based model object
        self.bert_layer = AutoModel.from_pretrained(bert_model,output_hidden_states = True)

        
        if bert_model == "albert-base-v2":  # 12M parameters
            hidden_size = 768
        elif bert_model == "albert-large-v2":  # 18M parameters
            hidden_size = 1024
        elif bert_model == "albert-xlarge-v2":  # 60M parameters
            hidden_size = 2048
        elif bert_model == "albert-xxlarge-v2":  # 235M parameters
            hidden_size = 4096
        elif bert_model == "bert-base-uncased": # 110M parameters
            hidden_size = 768

        # Freeze bert layers and only train the projection and the classification layer weights
        if freeze_bert:
            for p in self.bert_layer.parameters():
                p.requires_grad = False

        #URN layer
        self.urn_right = UnitaryRNNright(self.embedding_size)
        self.urn_left = UnitaryRNNleft(self.embedding_size)
        
        #sigmoid
        self.sigmoid=nn.Sigmoid()
        #Projection layer from dim 768 to 105
        self.lin1 = nn.Linear(768, 64)
        
        
        #Projection layer from dim 768 to 105
        
        self.proj1 = nn.Linear(64,105) #get the text input
        
        self.proj2 = nn.Linear(768,15) #get the h input for the right URN directly out of Bert
        
        # Classification layer
        # this time we need to classify based on a (batch_size, sent_len, 15) tensor
        # our hope is that at least one of the sent_len vectors is very informative in relation to the classification result
        # but we don't know which one... Maybe here an attention layer could answer the question? (next model)
        # for now: flatten + linear
        
        self.flatten = nn.Flatten()
        self.cls_layer = nn.Linear(284*15, 3)

        self.dropout = nn.Dropout(p=0.1)

    @autocast()  # run in mixed precision
    def forward(self, input_ids, attn_masks, token_type_ids):
        '''
        Inputs:
            -input_ids : Tensor  containing token ids
            -attn_masks : Tensor containing attention masks to be used to focus on non-padded values
            -token_type_ids : Tensor containing token type ids to be used to identify sentence1 and sentence2
        '''
        device = input_ids.device
        bs = input_ids.size()[0]  #batch size
        end_sentence1 = torch.argmax(token_type_ids,dim=1)-2 #indice of the last word of the first sentence(that is not CLS)
        #print(end_sentence1)
        end_sentence2 = -torch.argmax(torch.flip(token_type_ids,dims=(1,)),dim=1)+214 #trick to get the last one
        pooler_output = self.bert_layer(input_ids, attn_masks, token_type_ids)['hidden_states'][-1]
        lis_sent1= []
        lis_sent2= []
        for i in range(bs):          #loop on the batch for slicing and padding
            sent1 = pooler_output[i,1:end_sentence1[i],:] #get the sentence 1 without separators Bert encoding of each word in 768 dim
            sent1_padded = torch.cat((sent1,torch.zeros((220-sent1.size()[0],sent1.size()[1]),device=device)),dim=0)
            sent2 = pooler_output[i,(end_sentence1[i]+3):end_sentence2[i],:]
            sent2_padded = torch.cat((sent2,torch.zeros((64-sent2.size()[0],sent2.size()[1]),device=device)),dim=0)
            lis_sent1.append(sent1_padded)
            lis_sent2.append(sent2_padded)
        
        sent1 = torch.stack(lis_sent1)  #batch of padded Bert encoded sentences
        sent2 = torch.stack(lis_sent2)
        # for the URN layer to work on batches we need to pad with 0s sent1 and sent2
        text1 = self.lin1(sent1)
        text1 = self.sigmoid(text1)
        text1 = self.proj1(text1)  #size (batch_size, 220, 105)
        #sent1_red = torch.cat((sent1_red,torch.zeros((bs,196-sent1_red.size()[1],sent1_red.size()[2]),device=device)),dim=1) #we pad to the longest premisse 196 tokens
        text2 = self.lin1(sent2)
        text2 = self.sigmoid(text2)
        text2 = self.proj1(text2)  #size (batch_size, 64, 105)
        #sent2_red = torch.cat((sent2_red,torch.zeros((bs,64-sent2_red.size()[1],sent2_red.size()[2]),device=device)),dim=1)
        h1 = self.proj2(sent1)  #(bs,220,15)
        h2 = self.proj2(sent2)  #(bs,64,15)
        
        sent1_right_proc =self.urn_right(text1, h1)   #size (batch_size, 15)
        
        sent2_right_proc =self.urn_right(text2, h2)
        sent1_proc = self.urn_left(text1, sent1_right_proc) #(bs, 220, 15)
        sent2_proc = self.urn_left(text2, sent2_right_proc) #(bs, 64, 15)
        pooler_output = torch.cat((sent1_proc,sent2_proc),dim=1)  #concatenate the outputs of the URN (bs, 284,15)
        
        pooler_output = self.flatten(pooler_output)
       

        
        logits = self.cls_layer(self.dropout(pooler_output))
        
        return logits

In [24]:
device = torch.device("cuda:0")
def train(net, criterion, opti, lr, train_loader, val_loader, epochs,device=device):

    best_loss = np.Inf
    best_train_loss = np.Inf
    rl_prec = np.Inf
    best_ep = 1
    nb_iterations = len(train_loader)
    print_every = nb_iterations // 5  # print the training loss 5 times per epoch
    iters = []
    train_losses = []
    val_losses = []

    scaler = GradScaler() #necessary with autograder

    for ep in range(epochs):

        net.train()
        running_loss = 0.0
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(train_loader)):
            opti.zero_grad()
            # Converting to cuda tensors
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
    
            # Enables autocasting for the forward pass (model + loss)
            #with autocast():
                # Obtaining the logits from the model
            logits = net(seq, attn_masks, token_type_ids)

            # Computing loss
            
            loss = criterion(logits.view((labels.size()[0],-1)), labels.float())
            #loss = loss / iters_to_accumulate  # Normalize the loss because it is averaged
            train_losses.append(loss)
            loss.backward()
            opti.step()
            
            
            running_loss += loss.item()

            if (it + 1) % print_every == 0:  # Print training loss information
                print()
                print("Iteration {}/{} of epoch {} complete. Loss : {} "
                      .format(it+1, nb_iterations, ep+1, running_loss / print_every))

                if (val_losses==[]) & (rl_prec>running_loss):
                    net_copy = copy.deepcopy(net)
                    path_to_model='models{}_Alb_URN_NL_ep_0.pt'.format(net.__class__.__name__) #save more often for the server disconnections
                    torch.save(net_copy.state_dict(), path_to_model)
                    print("The model has been saved in {}".format(path_to_model))
                rl_prec = running_loss
                running_loss = 0.0


        val_loss = eval_loss(net, device, criterion, val_loader)  # Compute validation loss
        print()
        print("Epoch {} complete! Validation Loss : {}".format(ep+1, val_loss))
        val_losses.append(val_loss)
        if val_loss < best_loss:
            print("Best validation loss improved from {} to {}".format(best_loss, val_loss))
            print()
            net_copy = copy.deepcopy(net)  # save a copy of the model
            best_loss = val_loss
            best_ep = ep + 1

    # Saving the model
    path_to_model='models{}_Alb_URN_NL_val_loss_{}_ep_{}.pt'.format(net.__class__.__name__, best_ep)
    torch.save(net_copy.state_dict(), path_to_model)
    print("The model has been saved in {}".format(path_to_model))

    del loss
    torch.cuda.empty_cache()
    return train_losses, val_losses

In [25]:
def eval_loss(net, device, criterion, dataloader):
    net.eval()

    mean_loss = 0
    count = 0

    with torch.no_grad():
        for it, (seq, attn_masks, token_type_ids, labels) in enumerate(tqdm(dataloader)):
            seq, attn_masks, token_type_ids, labels = \
                seq.to(device), attn_masks.to(device), token_type_ids.to(device), labels.to(device)
            logits = net(seq, attn_masks, token_type_ids)
            mean_loss += criterion(logits.view((labels.size()[0],-1)), labels.float()).item()
            count += 1

    return mean_loss / count

In [None]:
device = torch.device("cuda:0")
#  Set all seeds to make reproducible results
set_seed(1)

# Creating instances of training and validation set
print("Reading training data...")
train_set = CustomDataset(df_train, maxlen, bert_model)
print("Reading validation data...")
val_set = CustomDataset(df_val, maxlen, bert_model)

# Creating instances of training and validation dataloaders
train_loader = DataLoader(train_set, batch_size=64, num_workers=4)
val_loader = DataLoader(val_set, batch_size=64, num_workers=4)

net13 = AlbURN_NL_sep()

net13 = nn.DataParallel(net13, device_ids = [0,1,2,3])

net13.to(device)
net13.load_state_dict(torch.load('modelsDataParallel_Alb_URN_NL_ep_0.pt'))
criterion = nn.CrossEntropyLoss()
opti = optim.Adam(net13.parameters(), lr=1e-5, weight_decay=1e-4)
num_warmup_steps = 0 # The number of steps for the warmup phase.
num_training_steps = epochs * len(train_loader)  # The total number of training steps


train_l,val_l = train(net13, criterion, opti, lr, train_loader, val_loader, 8)

Reading training data...
Reading validation data...


Some weights of the model checkpoint at albert-base-v2 were not used when initializing AlbertModel: ['predictions.decoder.bias', 'predictions.dense.bias', 'predictions.decoder.weight', 'predictions.bias', 'predictions.LayerNorm.weight', 'predictions.dense.weight', 'predictions.LayerNorm.bias']
- This IS expected if you are initializing AlbertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing AlbertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
 19%|███████████████▍                                                                   | 18/97 [09:04<41:30, 31.52s/it]


Iteration 19/97 of epoch 1 complete. Loss : 1.077095433285362 


 20%|████████████████▎                                                                  | 19/97 [09:31<39:14, 30.18s/it]

The model has been saved in modelsDataParallel_Alb_URN_NL_ep_0.pt


 39%|████████████████████████████████▌                                                  | 38/97 [18:32<27:28, 27.94s/it]


Iteration 38/97 of epoch 1 complete. Loss : 1.0777917159231085 


 59%|████████████████████████████████████████████████▊                                  | 57/97 [27:37<19:03, 28.59s/it]


Iteration 57/97 of epoch 1 complete. Loss : 1.089080810546875 


 78%|█████████████████████████████████████████████████████████████████                  | 76/97 [36:44<10:06, 28.86s/it]


Iteration 76/97 of epoch 1 complete. Loss : 1.0957629555150081 


 98%|█████████████████████████████████████████████████████████████████████████████████▎ | 95/97 [45:59<00:58, 29.25s/it]


Iteration 95/97 of epoch 1 complete. Loss : 1.0963227121453536 


100%|███████████████████████████████████████████████████████████████████████████████████| 97/97 [46:56<00:00, 29.03s/it]
100%|███████████████████████████████████████████████████████████████████████████████████| 11/11 [02:01<00:00, 11.09s/it]



Epoch 1 complete! Validation Loss : 1.0828220952640881
Best validation loss improved from inf to 1.0828220952640881



 19%|███████████████▍                                                                   | 18/97 [10:15<52:46, 40.08s/it]

Some learning is taking place, but it is very slow