<a href="https://colab.research.google.com/github/yugpsyfer/Playing_with_PyTorch/blob/main/Paraphrasing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# This notebook uses DistilBERT to extract relevant features from the PARADE dataset (cloned below).

The task here is to tell whether two given sentences are paraphrases of each other. The `Binary labels` column is 1 when the pair is a paraphrase and 0 when it is not.

I have approached this using self-attention.

## Logic
The outputs of DistilBERT are already features of a sort, but they cannot be used directly because it is not clear which parts of them are useful for this task; hence the self-attention layer.
The self-attention head should be able to highlight the features that are important for the task. The classification itself is then handled by the classification head, i.e. a few linear layers arranged sequentially.

In [None]:
# Colab setup: install Hugging Face transformers (with its torch extra) and
# clone the PARADE dataset repository into the current working directory.
!pip install transformers[torch]
!git clone https://github.com/heyunh2015/PARADE_dataset.git

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers[torch]
  Downloading transformers-4.19.4-py3-none-any.whl (4.2 MB)
[K     |████████████████████████████████| 4.2 MB 5.0 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.7.0-py3-none-any.whl (86 kB)
[K     |████████████████████████████████| 86 kB 6.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 65.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 32.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Unin

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import os
import torchvision
from PIL import Image
import numpy as np
import torchvision.transforms as transforms
from torch.utils.data.dataloader import DataLoader
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from torch.utils.data import random_split
from torch.utils.data import Dataset
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules.activation import Sigmoid
import torch.optim as optim
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
import pandas as pd

In [None]:
# Read the tab-separated PARADE train / test / validation splits from the
# cloned dataset repository, one DataFrame per split.
df_train, df_test, df_val = (
    pd.read_csv(split_path, sep="\t")
    for split_path in (
        "/content/PARADE_dataset/PARADE_train.txt",
        "/content/PARADE_dataset/PARADE_test.txt",
        "/content/PARADE_dataset/PARADE_validation.txt",
    )
)

In [None]:
# The four-class labels and the entity column are not used by this notebook,
# so remove them from every split.
df_train, df_test, df_val = (
    frame.drop(columns=['Four-class labels', 'Entity'])
    for frame in (df_train, df_test, df_val)
)

In [None]:
# Preview the first rows of the test split after dropping the unused columns.
df_test.head()

Unnamed: 0,Binary labels,Definition1,Definition2
0,0,must be both relevant and accurate to achieve ...,a list of all external data needed for the use...
1,0,-any data that the program receives while it i...,the data values that are scanned by a program
2,1,vulnerability exists but wasn't detected by vu...,a security incident that isn't detected or rep...
3,0,vulnerability exists but wasn't detected by vu...,an error in which you are not alerted to a sit...
4,1,vulnerability exists but wasn't detected by vu...,term for when a scan fails to find real vulner...


In [None]:
def combine_columns(x):
  """Join the two definitions with two spaces.

  `x` is indexed by column label, so this works both for a single row
  (a Series) and for a whole DataFrame, in which case the result is a
  Series aligned with the frame's index.
  """
  # Label-based access: positional `x[1]` / `x[2]` on a label-indexed row is
  # deprecated in pandas 2.x and silently depends on the column order.
  return x['Definition1'] + "  " + x['Definition2']


def _merge_definition_columns(frame):
  """Return `frame` with Definition1/Definition2 replaced by one 'combined' column."""
  combined = combine_columns(frame)
  # `assign` keeps the combined text aligned with the frame's own index.
  return frame.drop(columns=['Definition1', 'Definition2']).assign(combined=combined)


df_train = _merge_definition_columns(df_train)
df_test = _merge_definition_columns(df_test)
df_val = _merge_definition_columns(df_val)

In [None]:
class textDataset(Dataset):
  """Dataset that tokenises the 'combined' text column on the fly.

  Each item is a triple `(input_ids, attention_mask, label)` where
  `input_ids` and `attention_mask` are one-element lists holding the
  tokenizer's tensors and `label` is a scalar long tensor taken from the
  'Binary labels' column.

  Args:
    sent_dataframe: DataFrame with a 'combined' and a 'Binary labels' column.
    max_length: length every sentence is padded / truncated to
      (defaults to 259, the value previously hard-coded).
  """
  def __init__(self, sent_dataframe, max_length=259):
    self.sentence_1 = sent_dataframe['combined'].to_list()
    self.labels_ = sent_dataframe['Binary labels'].to_list()
    self.max_length = max_length
    self.tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

  def __len__(self):
    """Number of labelled examples."""
    return len(self.labels_)

  def __getitem__(self, index):
    """Tokenise example `index` and return (input_ids, attention_mask, label)."""
    # Tokenise once and reuse the result for both tensors (the sentence used
    # to be tokenised twice per item).
    encoded = self._sent_embedding(self.sentence_1[index])
    input_ids = [encoded['input_ids']]
    attention_mask = [encoded['attention_mask']]

    label = torch.tensor(self.labels_[index], dtype=torch.long)
    return input_ids, attention_mask, label

  def _sent_embedding(self, inp):
    """Run the tokenizer on `inp`, padding / truncating to `self.max_length`."""
    return self.tokenizer(inp, return_tensors='pt', padding='max_length',
                          max_length=self.max_length, truncation=True)

In [None]:
  # Wrap every split in a textDataset and batch it 200 examples at a time.
  train_set = textDataset(df_train)
  test_set = textDataset(df_test)
  val_set = textDataset(df_val)
  # Shuffle the training batches only: consecutive rows of the data share
  # definitions, so iterating in file order gives highly correlated batches.
  # Evaluation loaders keep their order.
  train_dl = DataLoader(train_set, batch_size=200, shuffle=True)
  val_dl = DataLoader(val_set, batch_size=200)
  test_dl = DataLoader(test_set, batch_size=200)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [None]:
class CustomTransformer(nn.Module):
  """DistilBERT encoder + multi-head self-attention + small MLP classifier.

  The token states produced by DistilBERT are projected into query / key /
  value, passed through one multi-head self-attention layer, summed over the
  non-padding tokens, L2-normalised and classified by three linear layers.

  `forward` returns raw, unnormalised class scores (logits) of shape
  (batch, 2). No softmax is applied because `nn.CrossEntropyLoss` applies
  log-softmax itself; `argmax` over the logits gives the predicted class.

  Args:
    input_feat: kept for backward compatibility; currently unused (all layer
      widths are fixed to 768, the width used by the layers below).
    train_distil: when True the DistilBERT weights receive gradients;
      when False (default) they are frozen.
  """
  def __init__(self, input_feat, train_distil=False):
    super().__init__()

    self._distilbert = DistilBertModel.from_pretrained("distilbert-base-uncased")
    # Honour the flag instead of always freezing the encoder.
    self._distilbert.requires_grad_(train_distil)
    self._multi_head_self_atttention = nn.MultiheadAttention(embed_dim=768,
                                                             num_heads=12,
                                                             batch_first=True,
                                                             dropout=0.139)
    self._key = nn.Linear(768, 768, bias=False)
    self._query = nn.Linear(768, 768, bias=False)
    self._value = nn.Linear(768, 768, bias=False)

    self._FC_1 = nn.Linear(768, 512)
    self._LeakyReLU = nn.LeakyReLU(negative_slope=0.01, inplace=False)
    self._FC_2 = nn.Linear(512, 256)
    self._FC_3 = nn.Linear(256, 2)

  def forward(self, x):
    """Return class logits for a batch.

    Args:
      x: pair `(input_ids, attention_mask)`; each is a list whose first
        element is a tensor that becomes (batch, seq_len) after
        `squeeze(1)`.
    """
    input_ids, attention_mask = self._prepare_inputs(x)
    out = self._distilbert(input_ids=input_ids, attention_mask=attention_mask)[0]

    _key = self._key(out)
    _query = self._query(out)
    _value = self._value(out)

    # True marks padding positions, which attention must not look at.
    pad_positions = attention_mask == 0
    # Pass by keyword: MultiheadAttention.forward expects (query, key, value).
    attn_output, _ = self._multi_head_self_atttention(query=_query,
                                                      key=_key,
                                                      value=_value,
                                                      key_padding_mask=pad_positions)

    # Pool over real tokens only, so padding does not leak into the features.
    keep = attention_mask.unsqueeze(-1).to(attn_output.dtype)
    out = torch.sum(attn_output * keep, dim=1)

    out = F.normalize(out, dim=1)

    out = self._FC_1(out)
    out = self._LeakyReLU(out)
    out = self._FC_2(out)
    out = self._LeakyReLU(out)
    # Logits, not probabilities: the loss applies (log-)softmax itself.
    return self._FC_3(out)

  def _prepare_inputs(self, x):
    """Unpack `x` and move both tensors to the device the model lives on."""
    inp_id, attention_mas = x
    # Follow the model's own device rather than hard-coding 'cuda:0', and do
    # not overwrite the caller's lists.
    target = next(self.parameters()).device
    input_ids = inp_id[0].squeeze(1).to(target)
    attention_mask = attention_mas[0].squeeze(1).to(target)
    return input_ids, attention_mask

  def _find_sentence_embedding(self, x):
    """Return the DistilBERT token states (first output element) for the batch in `x`."""
    input_ids, attention_mask = self._prepare_inputs(x)
    return self._distilbert(input_ids=input_ids, attention_mask=attention_mask)[0]


In [None]:
# Use the first GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

def loss_calc(y_true,y_pred):
  """Mean cross-entropy between class scores `y_pred` and class indices `y_true`.

  Note the argument order: targets first, scores second. Cross-entropy
  applies log-softmax to `y_pred` internally.
  """
  return F.cross_entropy(y_pred, y_true)


@torch.no_grad()
def validate(vap_dl,model):
  """Evaluate `model` on every batch of `vap_dl`.

  The model is switched to eval mode for the duration of the call (so
  dropout is disabled) and its previous train/eval mode is restored
  afterwards. Metrics are computed over the whole loader at once rather
  than averaged per batch, so a smaller last batch does not skew them.

  Args:
    vap_dl: iterable of `(input_ids, attention_mask, labels)` batches.
    model: callable taking `(input_ids, attention_mask)` and returning
      per-class scores of shape (batch, n_classes).

  Returns:
    `(loss, accuracy, weighted_f1)` where `loss` is the per-example mean.

  Raises:
    ValueError: if `vap_dl` yields no examples.
  """
  was_training = model.training
  model.eval()

  total_loss = 0.0
  n_samples = 0
  all_true = []
  all_pred = []

  try:
    for input_ids, attention_mask, act in vap_dl:
      all_true.append(act.numpy().flatten())
      act = act.to(device)
      out = model((input_ids, attention_mask))  # class scores per example
      batch_size = act.shape[0]
      # loss_calc returns a batch mean; weight it by the batch size so the
      # final value is a true per-example average.
      total_loss += loss_calc(act, out).item() * batch_size
      n_samples += batch_size
      all_pred.append(torch.argmax(out, dim=1).cpu().numpy().flatten())
  finally:
    # Hand the model back in the mode it arrived in.
    model.train(was_training)

  if n_samples == 0:
    raise ValueError("validate() received a DataLoader with no samples")

  y_true = np.concatenate(all_true)
  y_pred = np.concatenate(all_pred)
  return (total_loss / n_samples,
          accuracy_score(y_true, y_pred),
          f1_score(y_true, y_pred, average="weighted"))


def _print_metrics(split, loss, acc, f1):
  """Print loss / accuracy / F1 for one split, followed by a separator line."""
  print("{split} Loss is {loss:.3f}".format(split=split, loss=loss))
  print("{split} Accuracy is {acc:3.3f}".format(split=split, acc=acc))
  print("{split} F1 score is {f1_score_:3.3f}".format(split=split, f1_score_=f1))
  print("========================================================")


def train(epochs,train_dl,val_dl,model, optimizer, eval_every=2):
  """Train `model` and periodically report validation and training metrics.

  Args:
    epochs: number of passes over `train_dl`.
    train_dl: iterable of `(input_ids, attention_mask, labels)` training batches.
    val_dl: iterable of validation batches with the same layout.
    model: the network to optimise; it is converted to float64 and moved to
      the module-level `device`.
    optimizer: optimiser already bound to `model.parameters()`.
    eval_every: evaluate after every `eval_every`-th epoch, starting with
      epoch 0 (defaults to 2, the interval previously hard-coded).

  Returns:
    dict with keys "train" and "val"; each maps to a list of
    `[loss, accuracy, f1]` entries, one per evaluated epoch.

  Raises:
    ValueError: if `eval_every` is smaller than 1.
  """
  if eval_every < 1:
    raise ValueError("eval_every must be a positive integer")

  hist = {"train": [], "val": []}
  # Kept from the original: the whole model runs in float64.
  model = model.double()
  model.to(device)

  for epch in range(epochs):
    # Make sure dropout is active for the optimisation steps regardless of
    # the mode an evaluation pass may have left the model in.
    model.train()
    for input_ids, attention_mask, act in train_dl:
      act = act.to(device)

      optimizer.zero_grad()
      out = model((input_ids, attention_mask))
      loss = loss_calc(act, out)
      loss.backward()
      optimizer.step()

    if epch % eval_every == 0:
      loss_, acc_, f1_score_ = validate(val_dl, model)
      print("Epoch number=====> {epoch}".format(epoch=epch))
      _print_metrics("Val", loss_, acc_, f1_score_)
      hist['val'].append([loss_, acc_, f1_score_])

      _loss_, _acc_, _f1_score_ = validate(train_dl, model)
      _print_metrics("Train", _loss_, _acc_, _f1_score_)
      hist['train'].append([_loss_, _acc_, _f1_score_])

  return hist

In [None]:
def plot_graph(hist, eval_every=2):
  """Plot training and validation loss against the epochs they were measured at.

  Args:
    hist: dict with keys 'train' and 'val'; each maps to a list of entries
      whose first element is the loss.
    eval_every: number of epochs between two consecutive entries; the first
      entry is taken to belong to epoch 0 (defaults to 2).
  """
  train = [i[0] for i in hist['train']]
  val = [i[0] for i in hist['val']]
  # Derive the x-axis from the data: entry k was recorded at epoch
  # k * eval_every. The former fixed linspace(0, 15, 8) produced 0, 2.14, ...
  # and only matched histories with exactly 8 entries.
  train_epochs = np.arange(len(train)) * eval_every
  val_epochs = np.arange(len(val)) * eval_every
  plt.figure(figsize=(10,6))
  plt.plot(train_epochs,train,color = 'blue',label='Train LOSS VS EPOCHS' , linewidth=2, linestyle='dashed')
  plt.plot(val_epochs,val,color = 'red',label='Validation LOSS VS EPOCHS' ,linewidth=2, linestyle='dashed')
  plt.xlabel("EPOCHS")
  plt.ylabel("LOSS")
  plt.legend()
  plt.show()

In [None]:
# Build the classifier and an AdamW optimiser over its parameters, then train
# for 15 epochs while recording the metric history.
model = CustomTransformer(input_feat=768)
optimizer = optim.AdamW(params=model.parameters(), lr=0.001)
hist_adam_W = train(
    epochs=15,
    train_dl=train_dl,
    val_dl=val_dl,
    model=model,
    optimizer=optimizer,
)

# Loss curves for the recorded history.
plot_graph(hist=hist_adam_W)

# Final evaluation on the held-out test split.
print("============================TEST=================================")
loss_, _acc_, _f1_score_ = validate(vap_dl=test_dl, model=model)
print("Test Loss is {loss_test:.3f}".format(loss_test=loss_))
print("Test Accuracy is {acc:3.3f}".format(acc=_acc_))
print("Test F1 score is {f1_score_:3.3f}".format(f1_score_=_f1_score_))