##### TwinBert https://arxiv.org/pdf/2002.06275v1.pdf

In [1]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 2.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 40.2 MB/s 
Collecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 66.5 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1


In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

#### The dataset is Quora questions pairs dataset. Ideally in the paper the authors have trained the model to use it as a backend for a sponsored search engine, to delivers ads alongside the organic search results.

In [5]:
df = pd.read_csv("train.tsv", sep='\t')   # Dataset : https://www.kaggle.com/c/quora-question-pairs

## Dataset Loader for the Siamese Network 

In [3]:
class SiameseNetworkDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.question1 = dataframe.question1
        self.question2 = dataframe.question2
        self.targets = dataframe.is_duplicate
        self.max_len = max_len
        
    def __len__(self):
        return len(self.data)
    
    
    def tokenize(self,input_text):
        input_text = " ".join(input_text.split())

        inputs = self.tokenizer.encode_plus(
            input_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        return ids,mask,token_type_ids

    def __getitem__(self, index):
        ids1,mask1,token_type_ids1 = self.tokenize(str(self.question1[index]))
        ids2,mask2,token_type_ids2 = self.tokenize(str(self.question2[index]))
        


        return {
            'ids': [torch.tensor(ids1, dtype=torch.long),torch.tensor(ids2, dtype=torch.long)],
            'mask': [torch.tensor(mask1, dtype=torch.long),torch.tensor(mask2, dtype=torch.long)],
            'token_type_ids': [torch.tensor(token_type_ids1, dtype=torch.long),torch.tensor(token_type_ids2, dtype=torch.long)],
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }
   

## TwinBert architecture

In [15]:
class TwinBert(nn.Module):
    def __init__(self):
        super(TwinBert, self).__init__()
        self.model = transformers.BertModel.from_pretrained('bert-base-uncased')
    def forward_once(self, ids, mask, token_type_ids):
        _, output= self.model(ids, attention_mask = mask, token_type_ids = token_type_ids, return_dict=False)
        return output
    def forward(self, ids, mask, token_type_ids):
        output1 = self.forward_once(ids[0],mask[0], token_type_ids[0])
        output2 = self.forward_once(ids[1],mask[1], token_type_ids[1])
        return output1,output2
        

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
model = TwinBert()
model.to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


TwinBert(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  

In [16]:
MAX_LEN = 200
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 10
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [17]:
train_size = 0.8
train_dataset=df.sample(frac=train_size,random_state=200).reset_index(drop=True)
test_dataset=df.drop(train_dataset.index).reset_index(drop=True)


print("FULL Dataset: {}".format(df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = SiameseNetworkDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = SiameseNetworkDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (51456, 6)
TRAIN Dataset: (41165, 6)
TEST Dataset: (10291, 6)


In [18]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

# Loss Function
### A contrastive loss function that takes cosine similarity as a metric to measure the distance.

In [19]:
class CosineContrastiveLoss(nn.Module):
    def __init__(self, margin=0.4):
        super(CosineContrastiveLoss, self).__init__()
        self.margin = margin

    def forward(self, output1, output2, label):
        cos_sim = F.cosine_similarity(output1, output2)
        loss_cos_con = torch.mean((1-label) * torch.div(torch.pow((1.0-cos_sim), 2), 4) +
                                    (label) * torch.pow(cos_sim * torch.lt(cos_sim, self.margin), 2))
        return loss_cos_con

In [20]:
criterion = CosineContrastiveLoss()
optimizer = optim.Adam(model.parameters(),lr = 0.0005 )

# Training

In [None]:
for epoch in range(EPOCHS):
  model.train()
  for _,data in enumerate(training_loader, 0):
      ids,mask,token_type_ids = data['ids'],data['mask'],data['token_type_ids'] 
      targets = data['targets'].to(device, dtype = torch.float)
      ids = [ids[0].to(device, dtype = torch.long),ids[1].to(device, dtype = torch.long)]
      mask = [mask[0].to(device, dtype = torch.long),mask[1].to(device, dtype = torch.long)]
      token_type_ids = [token_type_ids[0].to(device, dtype = torch.long),token_type_ids[1].to(device, dtype = torch.long)]
      output1,output2 = model(ids, mask, token_type_ids)
      optimizer.zero_grad()
      loss = criterion(output1,output2,targets)
      if _%5==0:
          print(f'Step: {_}, Epoch: {epoch}, Loss:  {loss.item()}')
      
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

Step: 0, Epoch: 0, Loss:  7.24391640005706e-08


# Validation

In [None]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids,mask,token_type_ids = data['ids'],data['mask'],data['token_type_ids'] 
            targets = data['targets'].to(device, dtype = torch.float)
            ids = [ids[0].to(device, dtype = torch.long),ids[1].to(device, dtype = torch.long)]
            mask = [mask[0].to(device, dtype = torch.long),mask[1].to(device, dtype = torch.long)]
            token_type_ids = [token_type_ids[0].to(device, dtype = torch.long),token_type_ids[1].to(device, dtype = torch.long)]
            targets = data['targets'].to(device, dtype = torch.float)
            output1,output2 = model(ids, mask, token_type_ids)
            cos_sim = F.cosine_similarity(output1, output2)
            in_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(cos_sim).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation()
outputs = np.array(outputs) >= 0.5
accuracy = metrics.accuracy_score(targets, outputs)
f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
print(f"Accuracy Score = {accuracy}")
print(f"F1 Score (Micro) = {f1_score_micro}")
print(f"F1 Score (Macro) = {f1_score_macro}")