In [None]:
# import all necessary packages for CBOW
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import random
import os
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import collections
import itertools
import re
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from nltk.stem import WordNetLemmatizer
from scipy.stats import pearsonr, spearmanr
from torchtext.vocab import GloVe
from sklearn.model_selection import train_test_split
from gensim import matutils
from numpy import dot

In [2]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print device name: get_device_name()
print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 2080 Ti


In [3]:
# Load GloVe embeddings
glove = GloVe(name='6B')
print(glove.vectors.shape)

torch.Size([400000, 300])


In [4]:
# Sample check
x = glove.vectors[glove.stoi['king']]
y = glove.vectors[glove.stoi['queen']]
# z = king - man + woman
z = x - glove.vectors[glove.stoi['man']] + glove.vectors[glove.stoi['woman']]
print("Distance between king and queen: ", torch.norm(x - y).item())
print("Cosine similarity between king and queen: ", F.cosine_similarity(x.unsqueeze(0), y.unsqueeze(0)).item())
print("New Distance between king and queen: ", torch.norm(x - z).item())
print("New Cosine similarity between king and queen: ", F.cosine_similarity(x.unsqueeze(0), z.unsqueeze(0)).item())

Distance between king and queen:  5.966258525848389
Cosine similarity between king and queen:  0.6336469650268555
New Distance between king and queen:  4.753939628601074
New Cosine similarity between king and queen:  0.8065859079360962


In [5]:
# Check glove
# print(glove.vectors[glove.stoi['long']])
x = glove.vectors[glove.stoi['short']]
y = glove.vectors[glove.stoi['long']]
print("Distance for Short vs Long:", torch.norm(x - y))
print("Cosine similarity for Short vs Long:",torch.cosine_similarity(x.unsqueeze(0), y.unsqueeze(0)))

x = glove.vectors[glove.stoi['smart']]
y = glove.vectors[glove.stoi['intelligent']]
print("Distance for Smart vs Intelligent:", torch.norm(x - y))
print("Cosine similarity for Smart vs Intelligent:",torch.cosine_similarity(x.unsqueeze(0), y.unsqueeze(0)))

Distance for Short vs Long: tensor(4.4622)
Cosine similarity for Short vs Long: tensor([0.6962])
Distance for Smart vs Intelligent: tensor(5.0731)
Cosine similarity for Smart vs Intelligent: tensor([0.6520])


In [5]:
# Function to return glove embedding of a word
def get_word_embedding(word):
    return glove.vectors[glove.stoi[word] if word in glove.stoi else glove.stoi['unk']]

In [4]:
# Load test data

# Load into dataframe
df = pd.read_csv('./wordsim353/combined.csv')
print(df.head())

     Word 1    Word 2  Human (mean)
0      love       sex          6.77
1     tiger       cat          7.35
2     tiger     tiger         10.00
3      book     paper          7.46
4  computer  keyboard          7.62


In [9]:
# Get word embeddings
sample_embedding = get_word_embedding(df['Word 1'][1])
print(sample_embedding.shape)
sample_embedding = sample_embedding.squeeze()
print(sample_embedding.shape)
print(sample_embedding)

torch.Size([300])
torch.Size([300])
tensor([ 3.1805e-01,  3.8612e-01,  1.0725e-01,  2.8261e-01, -4.4965e-02,
         1.0612e-02,  4.3426e-01,  1.1006e+00,  1.5124e-01, -7.5199e-01,
         5.4254e-01, -2.5544e-01, -1.6400e-01,  1.6128e-01, -1.7060e-02,
        -2.2410e-01,  1.2682e-01,  8.4087e-01, -2.7631e-01,  4.4310e-02,
         2.6123e-01, -3.8948e-02, -1.4925e-01, -6.0481e-01, -1.1059e+00,
        -1.1135e-01, -5.9403e-02, -2.2909e-01,  6.7889e-01,  1.8288e-01,
         6.9610e-02, -1.3831e+00,  5.7360e-02, -3.3441e-01, -2.6577e-01,
        -3.4069e-01,  1.7086e-01,  5.9148e-01, -8.3631e-01,  4.8743e-01,
         2.4388e-01, -4.2785e-01,  3.9639e-01, -1.8224e-01, -3.1574e-01,
        -4.1929e-01,  4.3294e-01, -3.1500e-01, -2.3390e-01, -9.5833e-03,
         9.6671e-01, -1.8473e-01,  1.5179e-01,  3.5956e-01, -5.4430e-02,
         2.4032e-01, -1.7691e-02,  1.0346e+00, -2.3621e-01, -4.6284e-02,
        -6.3183e-01, -2.6131e-01,  2.2495e-01,  6.5933e-01,  9.7632e-02,
        -1.4428

In [7]:
# Check similarity between two words
word1 = df['Word 1'][1]
word2 = df['Word 1'][1]
# Use gensim matutils to calculate cosine similarity
w1 = get_word_embedding(word1)
w2 = get_word_embedding(word2)
# Convert to numpy array
w1 = w1.numpy()
w2 = w2.numpy()
print(type(w1))

sim = dot(matutils.unitvec(w1), matutils.unitvec(w2))
print(sim)

<class 'numpy.ndarray'>
1.0


In [8]:
# Function to get cosine similarity
def cos_similarity(word1_embedding, word2_embedding):
    word1_embedding = np.array(word1_embedding)
    word2_embedding = np.array(word2_embedding)
   
    ans = dot(matutils.unitvec(word1_embedding), matutils.unitvec(word2_embedding))
    return ans

# Function to get Pearson correlation
def pearson_correlation(word1_embedding, word2_embedding):
    emb1 = np.array(word1_embedding)
    emb2 = np.array(word2_embedding)

    correlation, _ = pearsonr(emb1, emb2)
    return correlation

In [9]:
def test_sim(df, lemmatizer, stemmer):
    cosine_similarity_scores = []
    pearson_correlation_scores = []
    scores = []

    for _, row in df.iterrows():
        word1 = row['Word 1']
        word2 = row['Word 2']
        
        # Get embeddings
        word1_embedding = get_word_embedding(word1).squeeze()
        word2_embedding = get_word_embedding(word2).squeeze()

        # Get cosine similarity
        cosine_similarity_scores.append(cos_similarity(word1_embedding, word2_embedding))
        
        # Get pearson correlation
        pearson_correlation_scores.append(pearson_correlation(word1_embedding, word2_embedding))

        # Get score
        scores.append(row['Human (mean)'])
        
    return cosine_similarity_scores, pearson_correlation_scores, scores

In [10]:
# Get cosine similarity and pearson correlation scores
lemmatizer = WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
cosine_similarity_scores, pearson_correlation_scores, scores = test_sim(df, lemmatizer, stemmer)

In [11]:
# Check cosine similarity and pearson correlation scores
print(type(cosine_similarity_scores))
print(type(pearson_correlation_scores))
print(type(scores))

<class 'list'>
<class 'list'>
<class 'list'>


### Initial Spearman

In [12]:
# Funtcion to get spearman correlation using cosine similarity scores
def spearman_correlation(cosine_similarity_scores, simlex_scores):
    # Scale cosine similarity scores to 0-10
    cosine_similarity_scores = np.array(cosine_similarity_scores)
    cosine_similarity_scores = (1+cosine_similarity_scores)*5
    simlex_scores = np.array(simlex_scores)

    correlation, _ = spearmanr(cosine_similarity_scores, simlex_scores)
    return correlation    

In [13]:
# Print the initial spearman correlation
spearman_value_sim = spearman_correlation(cosine_similarity_scores, scores)
print("Initial Spearman correlation: ", spearman_value_sim)

Initial Spearman correlation:  0.5433753781774955
