In [10]:
from absl import logging

import tensorflow as tf

import tensorflow_hub as hub
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import re
import seaborn as sns



In [11]:
data = pd.read_csv("mrpc_syn.csv")

In [12]:
data.head()

Unnamed: 0.1,Unnamed: 0,Sentence,Syn_Sentence-1,Syn_Sentence-2,Syn_Sentence-3
0,3645,""" The anticipated global sales improvement in ...",""" The anticipated global cut rate sale improve...",""" The anticipated orbicular sales improvement ...",""" The anticipated global sales improvement in ..."
1,1068,"In Washington on Sunday , FBI spokesman John I...","In Washington on william ashley sunday , FBI s...","In booker t washington on Sunday , FBI spokesm...","inch Washington on Sunday , FBI spokesman st j..."
2,2484,The unusual decision to declassify an intellig...,The unusual conclusion to declassify an intell...,The unusual decision to declassify an intellig...,The unusual decision to declassify an intellig...
3,8,"That compared with $ 35.18 million , or 24 cen...","That compared with $ 35.18 million , or 24 cen...","That equate with $ 35.18 million , or 24 cents...","That compared with $ 35.18 billion , or cents..."
4,3092,""" We acted because we saw the evidence in a dr...",""" We acted because we saw the evidence in a dr...",""" We acted because we run into the evidence in...",""" We acted because we control the bear witness..."


In [13]:
sent= data.Sentence
sent1 = data["Syn_Sentence-1"]
sent2 = data["Syn_Sentence-2"]
sent3 = data["Syn_Sentence-3"]


In [14]:
def cos_sim(a, b):
	"""Takes 2 vectors a, b and returns the cosine similarity according 
	to the definition of the dot product
	"""
	dot_product = np.dot(a, b)
	norm_a = np.linalg.norm(a)
	norm_b = np.linalg.norm(b)
	return dot_product / (norm_a * norm_b)

# USE

In [12]:
import tensorflow_hub as hub
module_url = "https://tfhub.dev/google/universal-sentence-encoder/4"
model = hub.load(module_url)
print ("module %s loaded" % module_url)
def embed(input):
  return model(input)

module https://tfhub.dev/google/universal-sentence-encoder/4 loaded


In [13]:
# # positive data
# embed_p1 = embed(data.sentence1.sample(1000))
# embed_p2 = embed(data.sentence2.sample(1000))

# random sample data
embed_sent = embed(sent)
embed_p1 = embed(sent1)
embed_p2 = embed(sent2)
embed_p3 = embed(sent3)

In [14]:
embed_p1.shape

TensorShape([3500, 512])

In [18]:
data["Sent-Syn_Sent-1_Sim"] = [cos_sim(embed_sent[i],embed_p1[i]) for i in range(len(embed_sent))]
data["Sent-Syn_Sent-2_Sim"] = [cos_sim(embed_sent[i],embed_p2[i]) for i in range(len(embed_sent))]
data["Sent-Syn_Sent-3_Sim"] = [cos_sim(embed_sent[i],embed_p3[i]) for i in range(len(embed_sent))]

data["Sent-Syn_Sent-1_Sim"].mean(),data["Sent-Syn_Sent-2_Sim"].mean(),data["Sent-Syn_Sent-3_Sim"].mean()

(0.9496018886566162, 0.900104820728302, 0.850010097026825)

# SentBERT

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
model_sbert = SentenceTransformer('paraphrase-MiniLM-L6-v2')


In [23]:
sbert_sent_embed = [model_sbert.encode(i) for i in sent]
sbert_sent1_embed = [model_sbert.encode(i) for i in sent1]
sbert_sent2_embed = [model_sbert.encode(i) for i in sent2]
sbert_sent3_embed = [model_sbert.encode(i) for i in sent3]    

In [25]:
sent_sent1 = [cos_sim(sbert_sent_embed[i],sbert_sent1_embed[i]) for i in range(len(sbert_sent_embed))]
sent_sent2 = [cos_sim(sbert_sent_embed[i],sbert_sent2_embed[i]) for i in range(len(sbert_sent_embed))]
sent_sent3 = [cos_sim(sbert_sent_embed[i],sbert_sent3_embed[i]) for i in range(len(sbert_sent_embed))]

np.mean(sent_sent1),np.mean(sent_sent2),np.mean(sent_sent3)

(0.9488282, 0.8971737, 0.84642446)

# LASER

In [None]:
!pip install laserembeddings

In [33]:
from laserembeddings import Laser


In [34]:
!python -m laserembeddings download-models

Downloading models into /usr/local/lib/python3.7/dist-packages/laserembeddings/data

✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fcodes    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/93langs.fvocab    
✅   Downloaded https://dl.fbaipublicfiles.com/laser/models/bilstm.93langs.2018-12-26.pt    

✨ You're all set!


In [35]:
laser = Laser()

In [36]:
laser_embed_sent = laser.embed_sentences(sent,lang='en')
laser_embed_sent1 = laser.embed_sentences(sent1,lang='en')
laser_embed_sent2 = laser.embed_sentences(sent2,lang='en')
laser_embed_sent3 = laser.embed_sentences(sent3,lang='en')


In [38]:
laser_sent_sent1 = [cos_sim(laser_embed_sent[i],laser_embed_sent1[i]) for i in range(len(laser_embed_sent))]
laser_sent_sent2 = [cos_sim(laser_embed_sent[i],laser_embed_sent2[i]) for i in range(len(laser_embed_sent))]
laser_sent_sent3 = [cos_sim(laser_embed_sent[i],laser_embed_sent3[i]) for i in range(len(laser_embed_sent))]

np.mean(laser_sent_sent1),np.mean(laser_sent_sent2),np.mean(laser_sent_sent3)

(0.9803443, 0.96107775, 0.9443284)

# DOC2VEC

In [18]:
from gensim.models.doc2vec import Doc2Vec,TaggedDocument
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [19]:
tagged_sent = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(sent)]
tagged_sent1 = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(sent1)]
tagged_sent2 = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(sent2)]
tagged_sent3 = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(sent3)]

# tagged_data_n1 = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(negative_data.sentence1)]
# tagged_data_n2 = [TaggedDocument(words=word_tokenize(_d.lower()), tags=[str(i)]) for i, _d in enumerate(negative_data.sentence2)]

In [20]:
model_sent = Doc2Vec(vector_size=20,window=5,min_count=1,workers=2,dm=1,epochs=20)
model_sent.build_vocab(tagged_sent)

model_sent1 = Doc2Vec(vector_size=20,window=5,min_count=1,workers=2,dm=1,epochs=20)
model_sent1.build_vocab(tagged_sent1)

model_sent2 = Doc2Vec(vector_size=20,window=5,min_count=1,workers=2,dm=1,epochs=20)
model_sent2.build_vocab(tagged_sent2)

model_sent3 = Doc2Vec(vector_size=20,window=5,min_count=1,workers=2,dm=1,epochs=20)
model_sent3.build_vocab(tagged_sent3)


In [21]:
for epoch in range(20):
    model_sent.train(tagged_sent,epochs=20,total_examples=model_sent.corpus_count)
    model_sent1.train(tagged_sent1,epochs=20,total_examples=model_sent1.corpus_count)
    model_sent2.train(tagged_sent2,epochs=20,total_examples=model_sent2.corpus_count)
    model_sent3.train(tagged_sent3,epochs=20,total_examples=model_sent3.corpus_count)   
    # print("Epoch #{} is complete.".format(epoch+1))
print("Done")

Done


In [28]:
doc_sent_sent1=[cos_sim(model_sent.docvecs[i],model_sent1.docvecs[i]) for i in range(len(model_sent.docvecs[0]))]
doc_sent_sent2=[cos_sim(model_sent.docvecs[i],model_sent2.docvecs[i]) for i in range(len(model_sent.docvecs[0]))]
doc_sent_sent3=[cos_sim(model_sent.docvecs[i],model_sent3.docvecs[i]) for i in range(len(model_sent.docvecs[0]))]

print(np.mean(doc_sent_sent1),np.mean(doc_sent_sent2),np.mean(doc_sent_sent3))


0.7950778 0.6710359 0.5840896


# InferSent

In [1]:
# import stuff
%load_ext autoreload
%autoreload 2
%matplotlib inline

from random import randint

import numpy as np
import torch

In [2]:
!mkdir GloVe
!curl -Lo GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip GloVe/glove.840B.300d.zip -d GloVe/
#!mkdir fastText
#!curl -Lo fastText/crawl-300d-2M.vec.zip https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
#!unzip fastText/crawl-300d-2M.vec.zip -d fastText/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0   315    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0   352    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 2075M  100 2075M    0     0  5139k      0  0:06:53  0:06:53 --:--:-- 5518k
Archive:  GloVe/glove.840B.300d.zip
  inflating: GloVe/glove.840B.300d.txt  


In [3]:
!mkdir encoder
!curl -Lo encoder/infersent1.pkl https://dl.fbaipublicfiles.com/infersent/infersent1.pkl
!curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  8165k      0  0:00:18  0:00:18 --:--:-- 8195k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  12.5M      0  0:00:11  0:00:11 --:--:-- 14.6M


In [4]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [5]:
# Load model
from models import InferSent
model_version = 1
MODEL_PATH = "encoder/infersent%s.pkl" % model_version
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': model_version}
model = InferSent(params_model)
model.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [6]:
# Keep it on CPU or put it on GPU
use_cuda = False
model = model.cuda() if use_cuda else model

In [7]:
# If infersent1 -> use GloVe embeddings. If infersent2 -> use InferSent embeddings.
W2V_PATH = 'GloVe/glove.840B.300d.txt' if model_version == 1 else 'fastText/crawl-300d-2M.vec'
model.set_w2v_path(W2V_PATH)

In [8]:
# Load embeddings of K most frequent words
model.build_vocab_k_words(K=100000)

Vocab size : 100000


In [15]:
infer_embed_sent=model.encode(sent,tokenize = True)
infer_embed_sent1=model.encode(sent1,tokenize = True)
infer_embed_sent2=model.encode(sent2,tokenize = True)
infer_embed_sent3=model.encode(sent3,tokenize = True)



  sentences = np.array(sentences)[idx_sort]


In [16]:
infer_sent_sent1=[cos_sim(infer_embed_sent[i],infer_embed_sent1[i]) for i in range(len(infer_embed_sent))]
infer_sent_sent2=[cos_sim(infer_embed_sent[i],infer_embed_sent2[i]) for i in range(len(infer_embed_sent))]
infer_sent_sent3=[cos_sim(infer_embed_sent[i],infer_embed_sent3[i]) for i in range(len(infer_embed_sent))]



In [29]:
print(np.mean(infer_sent_sent1),np.mean(infer_sent_sent2),np.mean(infer_sent_sent3))

0.9755389 0.95556974 0.9357833
