In [1]:
from sklearn.decomposition import TruncatedSVD, PCA
from sklearn.metrics.pairwise import cosine_similarity

import pandas as pd
import numpy as np

from scipy.stats import spearmanr, pearsonr

import ast

In [2]:
def create_co_matrix(vocab: dict, window_size: int, tokenized_sentences: list):
    vocab_size = len(vocab)
    co_matrix = np.zeros((vocab_size, vocab_size), dtype=np.float32)

    for sentence in tokenized_sentences:
        sentence_length = len(sentence)
        for idx, word in enumerate(sentence):
            word_idx = vocab[word]
            
            # Define the context window
            start = max(0, idx - window_size)
            end = min(sentence_length, idx + window_size + 1)
            
            # Update co-occurrence counts for words in the window
            for context_idx in range(start, end):
                if idx != context_idx:  # Skip the word itself
                    context_word_idx = vocab[sentence[context_idx]]
                    co_matrix[word_idx, context_word_idx] += 1

    return co_matrix

def create_sppmi_matrix(co_matrix, co_occurrence_sum, word_occurrences, k):
    sppmi_matrix = np.zeros_like(co_matrix)
    
    # Find indices where co_matrix > 0
    rows, cols = np.nonzero(co_matrix)

    for i, j in zip(rows, cols):
        pmi = np.log((co_matrix[i, j] * co_occurrence_sum) / (word_occurrences[i] * word_occurrences[j]))
        sppmi = pmi - np.log(k)
        sppmi_matrix[i, j] = max(sppmi, 0)  # SPPMI

    return sppmi_matrix

In [3]:
# load data and get sppmi embeddings again using best SVD model found previously
df = pd.read_csv('../data/evaluation/wordsim353crowd.csv')
twitter = pd.read_csv('../data/twitter/processed.csv')

twitter['processed_text'] = twitter['processed_text'].apply(lambda x: ast.literal_eval(x))
tokenized_sentences = list(twitter['processed_text'])

vocab = {
    word: idx for idx, word in enumerate(set(word for sentence in tokenized_sentences for word in sentence))
}

co_matrix = create_co_matrix(vocab, 5, tokenized_sentences)
co_occurrence_sum = np.sum(co_matrix)
word_occurrences = np.sum(co_matrix, axis=1)

# create sppmi matrix
sppmi_matrix = create_sppmi_matrix(co_matrix, co_occurrence_sum, word_occurrences, 5)

# fit svd
print(f'Fitting SVD model with dimensions = {128}, window size = {5}, k = {5}')

svd = TruncatedSVD(n_components=128)
U_k = svd.fit_transform(sppmi_matrix)
Sigma_k = np.diag(svd.singular_values_)
Sigma_k_sqrt = np.sqrt(Sigma_k)
sppmi_embedding = U_k @ Sigma_k_sqrt

Fitting SVD model with dimensions = 128, window size = 5, k = 5


In [4]:
word_to_index = vocab
index_to_word = {idx: word for word, idx in vocab.items()}

In [5]:
preds = []

for _, row in df.iterrows():
    w1, w2 = row['Word 1'], row['Word 2']
    if w1 in word_to_index and w2 in word_to_index:
        vec1 = sppmi_embedding[word_to_index[w1]].reshape(1, -1)
        vec2 = sppmi_embedding[word_to_index[w2]].reshape(1, -1)
        sim = cosine_similarity(vec1, vec2)[0][0]
        preds.append(sim)
    else:
        preds.append(np.nan) # result not available

In [6]:
preds

[nan,
 0.18578702,
 nan,
 0.14085941,
 0.15019304,
 0.14716095,
 0.17585915,
 nan,
 nan,
 nan,
 0.1374014,
 nan,
 nan,
 nan,
 0.4480145,
 0.27687407,
 0.19963533,
 0.22161505,
 0.34761745,
 nan,
 nan,
 0.014682222,
 nan,
 nan,
 0.2210814,
 0.28948128,
 0.33966818,
 0.09318803,
 0.07934096,
 0.27155405,
 nan,
 nan,
 0.098600656,
 nan,
 0.1083464,
 0.55952,
 nan,
 0.08233634,
 0.052690953,
 0.12460677,
 0.11859941,
 0.20324986,
 0.10720265,
 nan,
 0.26724157,
 0.22840117,
 0.3352613,
 0.20089898,
 0.086082816,
 0.35240343,
 0.067614794,
 nan,
 0.047848225,
 0.1989129,
 nan,
 nan,
 nan,
 nan,
 nan,
 0.48950684,
 0.31847793,
 nan,
 0.2604121,
 nan,
 0.5176225,
 0.11978251,
 nan,
 0.30502585,
 nan,
 0.08127186,
 nan,
 nan,
 nan,
 0.17026797,
 0.40860453,
 nan,
 0.26700854,
 nan,
 0.22248946,
 nan,
 nan,
 nan,
 0.097981855,
 nan,
 nan,
 nan,
 0.08082784,
 nan,
 nan,
 nan,
 0.07917957,
 nan,
 0.3726133,
 0.21490696,
 nan,
 nan,
 0.3611132,
 0.0849912,
 0.38504252,
 0.10786206,
 0.29705402,
 0

In [7]:
df['preds'] = preds

In [8]:
df = df.dropna() # remove the words that are not part of our word2vec's vocabulary

In [None]:
print("Spearman correlation:", spearmanr(df['preds'], df['Human (Mean)'])[0])
print("Pearson correlation:", pearsonr(df['preds'], df['Human (Mean)'])[0])

Spearman correlation: 0.07769054899203413
Pearson correlation: 0.15669632206080575


We can see that there is low correlation between the similarity scores from the SPPMI-SVD vectors and the human scores, indicating poor performance of the model.

This is expected as the 10k tweets we used probably did not contain enough data