Mounting Google Drive and installing the transformers library

In [None]:
# Colab only: mount Google Drive at /content/drive so that files kept in Drive
# (the trained SimilarityModel checkpoint is loaded from there later) are readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install Hugging Face transformers, which provides the AutoTokenizer/AutoModel
# classes imported below. The recorded output of this cell shows version 3.3.1.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/19/22/aff234f4a841f8999e68a7a94bdd4b60b4cebcfeca5d67d61cd08c9179de/transformers-3.3.1-py3-none-any.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 4.7MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 23.0MB/s 
Collecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 31.7MB/s 
Collecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB

Importing dependencies

In [None]:
import torch
import random
import numpy as np
import pandas as pd
import torch.nn as nn
import torch.optim as optim
from tqdm.notebook import tqdm
import torch.nn.functional as F
from torchvision import transforms
from transformers import AutoTokenizer, AutoModel

The Network class defines the architecture and forward pass of the PyTorch model that is fine-tuned to turn the word embeddings of the pretrained model into meaningful sentence embeddings.

In [None]:
class Network(nn.Module):
  """Siamese sentence encoder: a bidirectional LSTM followed by a linear layer.

  The same weights encode both inputs of a pair, so the two outputs live in
  the same embedding space and can be compared with a distance function.

  Args:
    input_size: feature size of every time step of the input sequences.
    hidden_size: hidden size of the LSTM, per direction.
    num_layers: number of stacked LSTM layers.
  """

  def __init__(self, input_size, hidden_size, num_layers):
    super(Network, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    # The attribute names `lstm` and `fc` become the state_dict key prefixes;
    # renaming them would make previously saved checkpoints fail to load.
    self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
    # Both directions are concatenated, hence 2 * hidden_size features.
    self.fc = nn.Linear(hidden_size * 2, hidden_size * 2)

  def forward_once(self, x, length):
    """Encode one padded batch into sentence embeddings.

    Args:
      x: padded batch, batch-first (the LSTM is built with batch_first=True).
      length: true (unpadded) length of every sequence in the batch, as a
        list of ints or a tensor. The lengths may be in any order.

    Returns:
      Tensor with one row of 2 * hidden_size features per batch element.
    """
    # pack_padded_sequence requires tensor lengths to live on the CPU.
    if torch.is_tensor(length):
      length = length.cpu()
    # enforce_sorted=False lets batches arrive in any length order; PyTorch
    # sorts internally and restores the original batch order in the outputs.
    packed = nn.utils.rnn.pack_padded_sequence(
        x, length, batch_first=True, enforce_sorted=False)
    _, (hidden, _) = self.lstm(packed)
    # The last two entries of the final hidden state are concatenated
    # (per the PyTorch LSTM docs: forward and backward direction of the top layer).
    sentence = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
    return self.fc(sentence)

  def forward(self, input1, length1, input2, length2):
    """Encode both members of a pair with the shared weights.

    Returns:
      Tuple (output1, output2) of sentence embeddings for input1 and input2.
    """
    output1 = self.forward_once(input1, length1)
    output2 = self.forward_once(input2, length2)
    return output1, output2

Checking for availability of GPU

In [None]:
# Prefer the GPU when CUDA is usable; otherwise everything runs on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'

Loading the pretrained word embeddings model.

Link to pretrained model: https://huggingface.co/gsarti/biobert-nli?text=The+goal+of+life+is+%5BMASK%5D.

In [None]:
# Tokenizer and encoder are both fetched from the same Hugging Face model id.
tokenizer = AutoTokenizer.from_pretrained("gsarti/biobert-nli")
# output_hidden_states=True makes the forward pass also return the per-layer
# hidden states, which get_word_embeddings reads further down.
model = AutoModel.from_pretrained("gsarti/biobert-nli", output_hidden_states=True).to(device)
# Inference only: switch to evaluation mode (the cell displays the model repr).
model.eval()

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=454.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=136.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433288887.0, style=ProgressStyle(descri…




BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(28996, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

Setting the parameters of the Network class

In [None]:
# Constructor arguments for Network. input_size equals the 768-dimensional
# token vectors of the encoder shown in the model printout above.
input_size, hidden_size = 768, 768
num_layers = 2
# Not referenced by the other visible cells; 512 equals the number of position
# embeddings in the encoder printout above.
sequence_length = 512

Loading the trained fine tuning model to extract sentence embeddings.

In [None]:
# Rebuild the architecture first, then restore the fine-tuned weights into it.
SimilarityModel = Network(input_size, hidden_size, num_layers)
# map_location remaps tensors that were saved from a GPU, so the checkpoint also
# loads when `device` fell back to 'cpu'.
# NOTE: torch.load unpickles the file - only load checkpoints from a trusted source.
SimilarityModel.load_state_dict(torch.load("drive/My Drive/SimilarityModel.pt", map_location=device)) #Replace path with path of trained model file
SimilarityModel = SimilarityModel.to(device)
# Inference only: evaluation mode (the cell displays the model repr).
SimilarityModel.eval()

Network(
  (lstm): LSTM(768, 768, num_layers=2, batch_first=True, bidirectional=True)
  (fc): Linear(in_features=1536, out_features=1536, bias=True)
)

Loading the database of Question Answer pairs, and creating a list of questions and answers separately.

In [None]:
# Replace the path with the path of your data file: a CSV with exactly two
# columns, the question first and its answer second.
db = pd.read_csv("db.csv", encoding="cp1252")
# Normalise the header names; this assignment fails if the file does not have two columns.
db.columns = ["Question", "Answer"]
# Row i of both lists belongs to the same question/answer pair.
question_list = list(db["Question"])
answer_list = list(db["Answer"])

Utility function that returns the word embeddings, for a string passed as input, extracted from the pretrained word embeddings model.

In [None]:
def get_word_embeddings(Question):
  """Return the contextual word embeddings of one string.

  The string is tokenized with the module-level `tokenizer` and passed through
  the module-level pretrained `model` without gradient tracking.

  Args:
    Question: the text to embed.

  Returns:
    Tensor on `device` holding the last entry of the hidden states returned by
    the model, for a batch of one sequence.
  """
  # The encoder only has a fixed number of position embeddings; longer inputs
  # would fail inside the forward pass, so they are truncated to that limit.
  max_tokens = model.config.max_position_embeddings
  sentence_token_ids = tokenizer.encode(Question, truncation=True, max_length=max_tokens)
  # Wrap in a list to add the batch dimension (batch of one).
  sentence_token_ids = torch.tensor([sentence_token_ids], dtype=torch.long, device=device)
  with torch.no_grad():
    # Index 2 of the output holds the per-layer hidden states (the model is
    # loaded with output_hidden_states=True); [-1] selects the last of them.
    word_embeddings = (model(input_ids=sentence_token_ids)[2])[-1]
  return word_embeddings

Utility function that calls get_word_embeddings for both input strings (Q1 and Q2), passes the resulting word embeddings through the similarity model (defined by the Network class) to obtain one sentence embedding per string, and returns the pairwise Euclidean distance between the two sentence embeddings.

In [None]:
def get_difference(Q1, Q2):
  """Return the distance between the sentence embeddings of two strings.

  Args:
    Q1: first text.
    Q2: second text.

  Returns:
    One-element tensor with the pairwise (Euclidean by default) distance
    between the two sentence embeddings; smaller means more similar.
  """
  embedding_Q1 = (get_word_embeddings(Q1)).to(device)
  embedding_Q2 = (get_word_embeddings(Q2)).to(device)
  # Each input is a batch of one, so its length is the token dimension (dim 1).
  # The length tensors stay on the CPU, as pack_padded_sequence expects.
  length_Q1 = torch.tensor([embedding_Q1.shape[1]])
  length_Q2 = torch.tensor([embedding_Q2.shape[1]])
  # Inference only: skip autograd bookkeeping so no graph is kept per comparison.
  with torch.no_grad():
    sentence_Q1, sentence_Q2 = SimilarityModel(embedding_Q1, length_Q1, embedding_Q2, length_Q2)
    difference = F.pairwise_distance(sentence_Q1, sentence_Q2)
  return difference

Takes a new query as input and returns the confidence level (0 or 1), the database question that is semantically closest to the query according to the similarity model, and the answer to that question from the database. The confidence is 1 when the smallest distance is at most 0.8 and 0 otherwise.

In [None]:
def get_best_match(Question, threshold=0.8):
  """Find the database question closest to a query.

  Args:
    Question: the new query text.
    threshold: largest distance still reported as a confident match.

  Returns:
    Tuple (confidence, question, answer): confidence is 1 when the smallest
    distance is at most `threshold` and 0 otherwise; question and answer are
    the closest entry of `question_list` and its counterpart in `answer_list`.

  Raises:
    ValueError: if `question_list` is empty.
  """
  if not question_list:
    raise ValueError("question_list is empty; load the question/answer database first")
  # Every call compares the query with each stored question in turn.
  # float() turns the one-element distance tensors into plain numbers.
  distances = [float(get_difference(question, Question)) for question in question_list]
  # Index of the smallest distance; ties resolve to the earliest entry.
  min_index = min(range(len(distances)), key=distances.__getitem__)
  confidence = 0 if distances[min_index] > threshold else 1
  return confidence, question_list[min_index], answer_list[min_index]

Driver code to demonstrate the working of the model.

In [None]:
# Example query: look up the closest stored question and print, one per line,
# the confidence flag, the matched question and its answer.
confidence, question, answer = get_best_match("What are the symptoms of corona virus?")
for field in (confidence, question, answer):
  print(field)

1
How to check if I have corona?
a
