# Semantic Similarity Analysis - Zoe (Infersent)

## Import libraries

In [113]:
import numpy as np
import pandas as pd
import csv

from models import InferSent
import torch

import nltk
nltk.download('punkt')

from scipy.spatial import distance

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Download Infersent2 model and GloVe word vectors

In [1]:
# ! mkdir encoder
# ! curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl
  
# ! mkdir GloVe
# ! curl -Lo GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
# ! unzip GloVe/glove.840B.300d.zip -d GloVe/

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  19.1M      0  0:00:07  0:00:07 --:--:-- 23.1M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0   315    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0   352    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 2075M  100 2075M    0     0  2092k      0  0:16:55  0:16:55 --:--:-- 1900k
Archive:  GloVe/glove.840B.300d.zip
  inflating: GloVe/glove.840B.300d.txt  


## Set up file paths

In [57]:
cur_dir = 'drive/My Drive/Colab Notebooks/2 Fall 2020/CMPE 255/Project/'
path_train = cur_dir + 'sts-train.csv'
path_test = cur_dir + 'sts-test.csv'

## Read the training data

In [62]:
train_df = pd.read_table(
    path_train,
    error_bad_lines=False,
    skip_blank_lines=True,
    quoting=csv.QUOTE_NONE,
    usecols=[4, 5, 6],
    names=["similarity", "s1", "s2"])

train_df

Unnamed: 0,similarity,s1,s2
0,5.00,A plane is taking off.,An air plane is taking off.
1,3.80,A man is playing a large flute.,A man is playing a flute.
2,3.80,A man is spreading shreded cheese on a pizza.,A man is spreading shredded cheese on an uncoo...
3,2.60,Three men are playing chess.,Two men are playing chess.
4,4.25,A man is playing the cello.,A man seated is playing the cello.
...,...,...,...
5744,0.00,Severe Gales As Storm Clodagh Hits Britain,Merkel pledges NATO solidarity with Latvia
5745,0.00,Dozens of Egyptians hostages taken by Libyan t...,Egyptian boat crash death toll rises as more b...
5746,0.00,President heading to Bahrain,President Xi: China to continue help to fight ...
5747,0.00,"China, India vow to further bilateral ties",China Scrambles to Reassure Jittery Stock Traders


## Load Infersent model and word embeddings

In [56]:
V = 2
model_path = 'encoder/infersent%s.pkl' % V
model_params = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
model = InferSent(model_params)
model.load_state_dict(torch.load(model_path))

w2v_path = '/content/GloVe/glove.840B.300d.txt'
model.set_w2v_path(w2v_path)

## Create sentences list from the dataset for vocab

In [73]:
sentences = []
for row in train_df.iterrows():
    sentences.append(row[1][1])
    sentences.append(row[1][2])

print(len(sentences))

11498


## Build vocab for Infersent model from sentences list

In [74]:
model.build_vocab(sentences)

Found 13740(/14417) words with w2v vectors
Vocab size : 13740


## Compare score from Infersent model vs. score from the dataset

In [126]:
index = 0
s1 = train_df.loc[index][1]
s2 = train_df.loc[index][2]
e1 = model.encode([s1])[0]
e2 = model.encode([s2])[0]
print('1:', s1)
print('2:', s2)
print()

print('cosine similarity for embeddings', round(distance.cosine(e1, e2), 6))
print('similarity score from dataset', train_df.loc[index][0])
print('similarity score from Infersent model', round((1 - distance.cosine(e1,e2))*5, 2))

1: A plane is taking off.
2: An air plane is taking off.

cosine similarity for embeddings 0.091629
similarity score from dataset 5.0
similarity score from Infersent model 4.54


## Store scores from the dataset

In [93]:
actual_scores = [s for s in train_df['similarity'].tolist()]
print(len(actual_scores))
print(actual_scores[1])

5749
3.8


## Calculate and store scores from Infersent model

In [95]:
infersent_scores = []
for row in train_df.iterrows():
    s1 = row[1][1]
    s2 = row[1][2]
    e1 = model.encode([s1])[0]
    e2 = model.encode([s2])[0]
    score = round((1 - distance.cosine(e1,e2))*5, 2)
    infersent_scores.append(score)
    # if len(infersent_scores) % 100 == 0:
    #     print(len(infersent_scores))

print(len(infersent_scores))
print(infersent_scores[1])

5749
4.85


## Analysis
Use pearson correlation to compare actual and calculated scores

In [124]:
c = pd.Series(actual_scores).corr(pd.Series(infersent_scores))
print('Pearson correlation: %.2f' % (c*100))

Pearson correlation: 52.93


## Try using Infersent with different vocab words

### Load Infersent model and word embeddings

In [129]:
V = 2
model_path = 'encoder/infersent%s.pkl' % V
model_params = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
model2 = InferSent(model_params)
model2.load_state_dict(torch.load(model_path))

w2v_path = '/content/GloVe/glove.840B.300d.txt'
model2.set_w2v_path(w2v_path)

### Build vocab for Infersent model from 1 million most common English words

In [131]:
model2.build_vocab_k_words(K=1000000) # 1 million most common English words

Vocab size : 1000000


### Compare score from Infersent model vs. score from the dataset

In [132]:
index = 0
s1 = train_df.loc[index][1]
s2 = train_df.loc[index][2]
e1 = model2.encode([s1])[0]
e2 = model2.encode([s2])[0]
print('1:', s1)
print('2:', s2)
print()

print('cosine similarity for embeddings', round(distance.cosine(e1, e2), 6))
print('similarity score from dataset', train_df.loc[index][0])
print('similarity score from Infersent model', round((1 - distance.cosine(e1,e2))*5, 2))

1: A plane is taking off.
2: An air plane is taking off.

cosine similarity for embeddings 0.091629
similarity score from dataset 5.0
similarity score from Infersent model 4.54


### Store scores from the dataset

In [133]:
actual_scores = [s for s in train_df['similarity'].tolist()]
print(len(actual_scores))
print(actual_scores[1])

5749
3.8


### Calculate and store scores from Infersent model

In [135]:
infersent_scores2 = []
for row in train_df.iterrows():
    s1 = row[1][1]
    s2 = row[1][2]
    e1 = model2.encode([s1])[0]
    e2 = model2.encode([s2])[0]
    score = round((1 - distance.cosine(e1,e2))*5, 2)
    infersent_scores2.append(score)
    if len(infersent_scores2) % 100 == 0:
        print(len(infersent_scores2))

print(len(infersent_scores2))
print(infersent_scores2[1])

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5749
4.85


### Analysis
Use pearson correlation to compare actual and calculated scores

In [136]:
c = pd.Series(actual_scores).corr(pd.Series(infersent_scores2))
print('Pearson correlation: %.2f' % (c*100))

Pearson correlation: 52.99
