In [None]:
# import all necessary packages for CBOW
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import random
import os
import matplotlib.pyplot as plt
import pickle
from tqdm import tqdm
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import collections
import itertools
import re
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
from nltk.stem import WordNetLemmatizer
from scipy.stats import pearsonr, spearmanr
from torchtext.vocab import GloVe
from sklearn.model_selection import train_test_split
from gensim import matutils
from numpy import dot

In [2]:
# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print device name: get_device_name()
print(torch.cuda.get_device_name(0))

NVIDIA GeForce RTX 2080 Ti


In [2]:
# Load GloVe embeddings
glove = GloVe(name='6B')
print(glove.vectors.shape)

torch.Size([400000, 300])


In [4]:
# Sample check
x = glove.vectors[glove.stoi['king']]
y = glove.vectors[glove.stoi['queen']]
# z = king - man + woman
z = x - glove.vectors[glove.stoi['man']] + glove.vectors[glove.stoi['woman']]
print("Distance between king and queen: ", torch.norm(x - y).item())
print("Cosine similarity between king and queen: ", F.cosine_similarity(x.unsqueeze(0), y.unsqueeze(0)).item())
print("New Distance between king and queen: ", torch.norm(x - z).item())
print("New Cosine similarity between king and queen: ", F.cosine_similarity(x.unsqueeze(0), z.unsqueeze(0)).item())

Distance between king and queen:  5.966258525848389
Cosine similarity between king and queen:  0.6336469650268555
New Distance between king and queen:  4.753939628601074
New Cosine similarity between king and queen:  0.8065859079360962


In [5]:
# Check glove
# print(glove.vectors[glove.stoi['long']])
x = glove.vectors[glove.stoi['short']]
y = glove.vectors[glove.stoi['long']]
print("Distance for Short vs Long:", torch.norm(x - y))
print("Cosine similarity for Short vs Long:",torch.cosine_similarity(x.unsqueeze(0), y.unsqueeze(0)))

x = glove.vectors[glove.stoi['smart']]
y = glove.vectors[glove.stoi['intelligent']]
print("Distance for Smart vs Intelligent:", torch.norm(x - y))
print("Cosine similarity for Smart vs Intelligent:",torch.cosine_similarity(x.unsqueeze(0), y.unsqueeze(0)))

Distance for Short vs Long: tensor(4.4622)
Cosine similarity for Short vs Long: tensor([0.6962])
Distance for Smart vs Intelligent: tensor(5.0731)
Cosine similarity for Smart vs Intelligent: tensor([0.6520])


In [4]:
# Function to return glove embedding of a word
def get_word_embedding(word):
    return glove.vectors[glove.stoi[word] if word in glove.stoi else glove.stoi['unk']]

In [5]:
# Load test data

# Load into dataframe
df = pd.read_csv('./SimLex-999/SimLex-999.txt', sep='\t')
print(df.head())

   word1        word2 POS  SimLex999  conc(w1)  conc(w2)  concQ  Assoc(USF)  \
0    old          new   A       1.58      2.72      2.81      2        7.25   
1  smart  intelligent   A       9.20      1.75      2.46      1        7.11   
2   hard    difficult   A       8.77      3.76      2.21      2        5.94   
3  happy     cheerful   A       9.55      2.56      2.34      1        5.85   
4   hard         easy   A       0.95      3.76      2.07      2        5.82   

   SimAssoc333  SD(SimLex)  
0            1        0.41  
1            1        0.67  
2            1        1.19  
3            1        2.18  
4            1        0.93  


In [8]:
# Get word embeddings
sample_embedding = get_word_embedding(df['word1'][1])
print(sample_embedding.shape)
sample_embedding = sample_embedding.squeeze()
print(sample_embedding.shape)
print(sample_embedding)

torch.Size([300])
torch.Size([300])
tensor([ 0.0785, -0.1332, -0.0080, -0.6235,  0.4710,  0.4107,  0.4210,  0.0975,
        -0.0953, -1.5386,  0.7887, -0.1068,  0.4298,  0.0091,  0.2274,  0.4928,
        -0.2278, -0.2156,  0.5345, -0.0212,  0.5884,  0.6723,  0.1577,  0.2165,
         0.0198, -0.1534,  0.0908,  0.4564,  0.4809,  0.1248, -0.2552,  0.4312,
        -0.4340,  0.5759, -0.9858,  0.6472, -0.1101, -0.1227,  0.3032, -0.1306,
        -0.0108,  0.1437, -0.0154,  0.2334, -0.0767, -0.4027,  0.1049, -0.4329,
         0.0226, -0.3252, -0.0767,  0.3358, -0.0463, -0.0706, -0.1404, -0.1653,
        -0.3133,  0.2234,  0.0640, -0.2646, -0.0804, -0.0948, -0.2048, -0.0995,
         0.0983,  0.5361,  0.0348,  0.3095, -0.4390, -0.1759, -0.2365, -0.2690,
         0.4018,  0.2489,  0.0699,  0.1214, -0.0119,  0.1006,  0.4084, -0.5842,
        -0.1472, -0.2617,  0.4712,  0.0417, -0.0841,  0.0027, -0.5950,  0.1395,
         0.0989, -0.1532, -0.3652,  0.6814,  0.0942, -0.3264,  0.6242, -0.1298,
    

In [8]:
# Check similarity between two words
word1 = df['word1'][1]
word2 = df['word1'][1]
# Use gensim matutils to calculate cosine similarity
w1 = get_word_embedding(word1)
w2 = get_word_embedding(word2)
# Convert to numpy array
w1 = w1.numpy()
w2 = w2.numpy()
print(type(w1))

sim = dot(matutils.unitvec(w1), matutils.unitvec(w2))
print(sim)

<class 'numpy.ndarray'>
1.0


In [5]:
# Check similarity between two words
word1 = 'big'
word2 = 'large'
# Use gensim matutils to calculate cosine similarity
w1 = get_word_embedding(word1)
w2 = get_word_embedding(word2)
# Convert to numpy array
w1 = w1.numpy()
w2 = w2.numpy()
print(type(w1))

sim = dot(matutils.unitvec(w1), matutils.unitvec(w2))
print(sim)

<class 'numpy.ndarray'>
0.58719975


In [9]:
# Function to get cosine similarity
def cos_similarity(word1_embedding, word2_embedding):
    word1_embedding = np.array(word1_embedding)
    word2_embedding = np.array(word2_embedding)
   
    ans = dot(matutils.unitvec(word1_embedding), matutils.unitvec(word2_embedding))
    return ans

# Function to get Pearson correlation
def pearson_correlation(word1_embedding, word2_embedding):
    emb1 = np.array(word1_embedding)
    emb2 = np.array(word2_embedding)

    correlation, _ = pearsonr(emb1, emb2)
    return correlation

In [10]:
def test_sim(df, lemmatizer, stemmer):
    cosine_similarity_scores = []
    pearson_correlation_scores = []
    simlex_scores = []
    assoc_scores = []
    for _, row in df.iterrows():
        word1 = row['word1']
        word2 = row['word2']
        form = row['POS']
        form = form.lower()
        
        word1_embedding = get_word_embedding(word1).squeeze()
        word2_embedding = get_word_embedding(word2).squeeze()

        # Get cosine similarity
        cosine_similarity_scores.append(cos_similarity(word1_embedding, word2_embedding))
        
        # Get pearson correlation
        pearson_correlation_scores.append(pearson_correlation(word1_embedding, word2_embedding))

        # Get simlex score
        simlex_scores.append(row['SimLex999'])

        # Get assoc score
        assoc_scores.append(row['Assoc(USF)'])
        
    return cosine_similarity_scores, pearson_correlation_scores, simlex_scores, assoc_scores

In [11]:
# Get cosine similarity and pearson correlation scores
lemmatizer = WordNetLemmatizer()
stemmer = nltk.stem.PorterStemmer()
cosine_similarity_scores, pearson_correlation_scores, simlex_scores, assoc_scores = test_sim(df, lemmatizer, stemmer)

In [12]:
# Check cosine similarity and pearson correlation scores
print(type(cosine_similarity_scores))
print(type(pearson_correlation_scores))
print(type(simlex_scores))
print(type(assoc_scores))

<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>


### Initial Spearman

In [13]:
# Funtcion to get spearman correlation using cosine similarity scores
def spearman_correlation(cosine_similarity_scores, simlex_scores):
    # Scale cosine similarity scores to 0-10
    cosine_similarity_scores = np.array(cosine_similarity_scores)
    cosine_similarity_scores = (1+cosine_similarity_scores)*5
    simlex_scores = np.array(simlex_scores)

    correlation, _ = spearmanr(cosine_similarity_scores, simlex_scores)
    return correlation    

In [14]:
# Print the initial spearman correlation
spearman_value_sim = spearman_correlation(cosine_similarity_scores, simlex_scores)
spearman_value_assoc = spearman_correlation(cosine_similarity_scores, assoc_scores)
print("Initial Spearman correlation Sim: ", spearman_value_sim)
print("Initial Spearman correlation Assoc: ", spearman_value_assoc)

Initial Spearman correlation Sim:  0.37050035710869067
Initial Spearman correlation Assoc:  0.38807153931964644


In [15]:
# Make a dataframe of cosine similarity scores and pearson correlation scores along with Simlex-999 scores and Assoc(USF)
simlex_scores = df['SimLex999']
assoc_scores = df['Assoc(USF)']
cosine_similarity_scores = np.array(cosine_similarity_scores)
pearson_correlation_scores = np.array(pearson_correlation_scores)
simlex_scores = np.array(simlex_scores)
assoc_scores = np.array(assoc_scores)
# print(cosine_similarity_scores.shape)
# print(pearson_correlation_scores.shape)

# Make a dataframe along with word1, word2, POS, SimLex-999 scores, Assoc(USF), cosine similarity scores and pearson correlation scores
datat = {'word1': df['word1'], 'word2': df['word2'], 'POS': df['POS'], 'SimLex999': simlex_scores, 'Assoc(USF)': assoc_scores, 'Cosine Similarity': cosine_similarity_scores, 'Pearson Correlation': pearson_correlation_scores}
ndf = pd.DataFrame(data=datat)
print(ndf.head())

   word1        word2 POS  SimLex999  Assoc(USF)  Cosine Similarity  \
0    old          new   A       1.58        7.25           0.367693   
1  smart  intelligent   A       9.20        7.11           0.652035   
2   hard    difficult   A       8.77        5.94           0.635781   
3  happy     cheerful   A       9.55        5.85           0.440317   
4   hard         easy   A       0.95        5.82           0.578426   

   Pearson Correlation  
0             0.368144  
1             0.652400  
2             0.636245  
3             0.441698  
4             0.579278  


In [16]:
# Print df
print(ndf)

      word1        word2 POS  SimLex999  Assoc(USF)  Cosine Similarity  \
0       old          new   A       1.58        7.25           0.367693   
1     smart  intelligent   A       9.20        7.11           0.652035   
2      hard    difficult   A       8.77        5.94           0.635781   
3     happy     cheerful   A       9.55        5.85           0.440317   
4      hard         easy   A       0.95        5.82           0.578426   
..      ...          ...  ..        ...         ...                ...   
994    join      acquire   V       2.85        0.00           0.285346   
995    send       attend   V       1.67        0.00           0.341219   
996  gather       attend   V       4.80        0.00           0.406434   
997  absorb     withdraw   V       2.97        0.00           0.155071   
998  attend       arrive   V       6.08        0.00           0.441711   

     Pearson Correlation  
0               0.368144  
1               0.652400  
2               0.636245  
3  

### Build the model

In [17]:
# Create Dataset
def create_dataset(df):
    # Create a list of tuples
    emb1 = []
    emb2 = []
    simlex_scores = []
    assoc_scores = []

    for _, row in df.iterrows():
        word1 = row['word1']
        word2 = row['word2']
        emb1.append(get_word_embedding(word1))
        emb2.append(get_word_embedding(word2))
        simlex_scores.append(row['SimLex999'])
        assoc_scores.append(row['Assoc(USF)'])
    
    # print(emb1[0].shape)
    emb1_stack = torch.stack(emb1)
    emb2_stack = torch.stack(emb2)
    
    return emb1_stack, emb2_stack, torch.tensor(simlex_scores, dtype=torch.float), torch.tensor(assoc_scores, dtype=torch.float)

In [18]:
# Call create_dataset
train_df, test_df = train_test_split(ndf, test_size=0.1, random_state=42)
train_emb1, train_emb2, train_simlex_scores, train_assoc_scores = create_dataset(train_df)
test_emb1, test_emb2, test_simlex_scores, test_assoc_scores = create_dataset(test_df)

In [19]:
# check train_emb1
print(train_emb1.shape)
print(train_simlex_scores.shape)

torch.Size([899, 300])
torch.Size([899])


In [20]:
# Creare TensorDataset
train_dataset = torch.utils.data.TensorDataset(train_emb1, train_emb2, train_simlex_scores, train_assoc_scores)
test_dataset = torch.utils.data.TensorDataset(test_emb1, test_emb2, test_simlex_scores, test_assoc_scores)

In [21]:
# Create DataLoader
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=32, shuffle=True)

In [22]:
# Class that takes CBOW embeddings, and outputs similarity scores: loss is MSE between predicted similarity scores and actual similarity scores(Simlex-999)
class RegressionModel(nn.Module):
    def __init__(self, embedding_dim):
        super(RegressionModel, self).__init__()
        self.linear1 = nn.Linear(2*embedding_dim, 300)
        self.linear2 = nn.Linear(300, 100)
        self.linear3 = nn.Linear(100, 50)
        self.linear4 = nn.Linear(50, 1)
        self.dropout = nn.Dropout(0.5)

    def forward(self, emb1, emb2):
        # emb1 = emb1.squeeze()
        # emb2 = emb2.squeeze()
        emb = torch.cat((emb1, emb2), dim=1)

        out = self.linear1(emb)
        out = F.relu(out)
        out = self.dropout(out)
        out = self.linear2(out)
        out = F.relu(out)
        out = self.dropout(out)
        out = self.linear3(out)
        out = F.relu(out)
        out = self.dropout(out)
        out = self.linear4(out)

        # Project the output between 0 and 10
        out = torch.sigmoid(out)
        out = out*10
        return out

In [None]:
# Model Parameters
embedding_dim = 300
learning_rate = 0.001
num_epochs = 100
batch_size = 32

# Initialize model
model = RegressionModel(embedding_dim).to(device)
# Define loss function
criterion = nn.MSELoss()
# Define optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=0.01) # weight_decay is L2 regularization

In [24]:
# Function to train model
def train(model, train_loader, criterion, optimizer, num_epochs):
    model.train()
    train_loss = 0.0
    for epoch in range(num_epochs):
        for emb1, emb2, simlex_scores, assoc_scores in train_loader:
            emb1 = emb1.to(device)
            emb2 = emb2.to(device)
            simlex_scores = simlex_scores.to(device)
            assoc_scores = assoc_scores.to(device)
            # Forward pass
            outputs = model(emb1, emb2)
            
            simlex_scores = simlex_scores.unsqueeze(1)
            # print(outputs[0])
            loss = criterion(outputs, simlex_scores)
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            
        train_loss /= len(train_loader)
        print("Epoch: {}, Train_Loss: {}".format(epoch+1, train_loss))

In [25]:
# Train model
train(model, train_loader, criterion, optimizer, num_epochs)

Epoch: 1, Train_Loss: 6.82242201114523
Epoch: 2, Train_Loss: 7.082542566850551
Epoch: 3, Train_Loss: 6.688761798417252
Epoch: 4, Train_Loss: 5.538591963391047
Epoch: 5, Train_Loss: 4.565945429443646
Epoch: 6, Train_Loss: 3.805899424211709
Epoch: 7, Train_Loss: 3.7285589611001577
Epoch: 8, Train_Loss: 2.816798715877354
Epoch: 9, Train_Loss: 2.2790207222819268
Epoch: 10, Train_Loss: 2.0375593467543864
Epoch: 11, Train_Loss: 2.0096794762967316
Epoch: 12, Train_Loss: 1.7291252462300704
Epoch: 13, Train_Loss: 1.5648862323823778
Epoch: 14, Train_Loss: 1.4902625769091173
Epoch: 15, Train_Loss: 1.3206737023993214
Epoch: 16, Train_Loss: 1.134516274046637
Epoch: 17, Train_Loss: 1.0681309568166644
Epoch: 18, Train_Loss: 1.1408377942932058
Epoch: 19, Train_Loss: 1.1782147783571701
Epoch: 20, Train_Loss: 1.2133882268356668
Epoch: 21, Train_Loss: 1.0943431512517745
Epoch: 22, Train_Loss: 1.0167855413343818
Epoch: 23, Train_Loss: 0.9439069693512832
Epoch: 24, Train_Loss: 0.731580727025788
Epoch: 25, 

In [26]:
# Function to test model, Calculate test loss and Spearman correlation
def test(model, test_loader, criterion):
    model.eval()
    test_loss = 0.0
    true_simlex_scores = []
    pred_simlex_scores = []
    for emb1, emb2, simlex_scores, assoc_scores in test_loader:
        emb1 = emb1.to(device)
        emb2 = emb2.to(device)
        simlex_scores = simlex_scores.to(device)
        assoc_scores = assoc_scores.to(device)
        # Forward pass
        outputs = model(emb1, emb2)
        simlex_scores = simlex_scores.unsqueeze(1)
        loss = criterion(outputs, simlex_scores)
        test_loss += loss.item()

        # Get true labels and predicted labels
        true_simlex_scores.extend(simlex_scores.cpu().detach().numpy().tolist())
        pred_simlex_scores.extend(outputs.cpu().detach().numpy().tolist())

    test_loss /= len(test_loader)
    print("Test_Loss: {}".format(test_loss))
    # Calculate Spearman correlation
    # print("True Simlex scores: ", true_simlex_scores)
    # print("Predicted Simlex scores: ", pred_simlex_scores)

    true_simlex_scores = np.array(true_simlex_scores)
    pred_simlex_scores = np.array(pred_simlex_scores)
    spear = spearmanr(true_simlex_scores, pred_simlex_scores)
    print("Spearman correlation: {}".format(spear[0]))

In [27]:
# Test model
test(model, test_loader, criterion)

Test_Loss: 8.3381667137146
Spearman correlation: 0.17455226289730819
