# Testing Sentence Embeddings
In this notebook we illustrate that running PCA on a collection of sentence embeddings before feeding them into a downstream model produces better results than either using no PCA or applying PCA while generating each sentence-level embedding. We use data from the
Semantic Text Similarity Dataset Hub https://github.com/brmson/dataset-sts and its Semantic Similarity Task.
We will use the GoogleNews embeddings, but these results should be generalizable.

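Summary Results, using Mean Squared Error (lower is better; the value printed as `acc` is the same mean squared error metric, not an accuracy):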
* PCA used to generate a sentence embedding, Test results: loss 1.189 acc: 1.189
* No PCA, Test results: loss 0.703 acc: 0.703
* PCA only on the collection sentence embeddings, Test results: loss 0.678 acc: 0.678

Note: The gold-standard label for each pair of sentences is a score between 0 and 5,
with the following interpretation:
* (5) The two sentences are completely equivalent, as they mean the same
    thing.  
* (4) The two sentences are mostly equivalent, but some unimportant
    details differ.
* (3) The two sentences are roughly equivalent, but some important
    information differs/missing.
* (2) The two sentences are not equivalent, but share some details.
* (1) The two sentences are not equivalent, but are on the same topic.
* (0) The two sentences are on different topics.

In [1]:
import random
import os
import sys
from glob import glob
from random import sample
from typing import Optional, Dict
from pathlib import Path

import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from nltk import word_tokenize
import tensorflow as tf

currentdir = Path.cwd()
parentdir = os.path.dirname(currentdir)
sys.path.insert(0,parentdir) 
from mlyoucanuse.embeddings import get_embeddings_index

In [2]:
def seed_everything(seed_value):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, PyTorch (CPU and, when available, CUDA)
    and TensorFlow, and exports ``PYTHONHASHSEED``.

    :param seed_value: integer seed applied to every generator
    """
    random.seed(seed_value)
    np.random.seed(seed_value)
    torch.manual_seed(seed_value)
    # NOTE: exported for processes started after this point; hash randomization
    # of the already-running interpreter is fixed at start-up.
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed_value)
        torch.cuda.manual_seed_all(seed_value)
        torch.backends.cudnn.deterministic = True
        # Benchmark mode auto-tunes the cuDNN algorithm choice and may select a
        # different one on each run, so it must be off for reproducible results.
        torch.backends.cudnn.benchmark = False
    tf.random.set_seed(seed_value)

seed = 42
seed_everything(seed)

In [3]:
# Load the GoogleNews word embeddings (300 dimensions requested); the result is
# used below as a token -> vector mapping (via .keys() and .get()).
token_embedding = get_embeddings_index('GoogleNews', parent_dir=parentdir, embedding_dimensions=300)
# Sanity check: show five randomly chosen vocabulary entries
sample(list(token_embedding.keys()), 5)

['espresso_martinis', 'wherefore', 'HARD', 'courtly_manners', "Hawai'ian"]

In [4]:
# Collect the SemEval STS 2012 train / test TSV files.
# NOTE(review): glob() does not promise a sorted result, so the order of these
# lists (and of the rows built from them later) may vary between machines —
# consider sorted(...) if exact reproducibility across environments matters.
train_files = glob("../data/dataset-sts/data/sts/semeval-sts/2012/*.train.tsv")
test_files = glob("../data/dataset-sts/data/sts/semeval-sts/2012/*.test.tsv")

# Add the 2015 files; the validation file is treated as additional training data
train_files.append( '../data/dataset-sts/data/sts/semeval-sts/all/2015.val.tsv')
train_files.append( '../data/dataset-sts/data/sts/semeval-sts/all/2015.train.tsv')
test_files.append( '../data/dataset-sts/data/sts/semeval-sts/all/2015.test.tsv')

# Display both lists as the cell output
train_files, test_files

(['../data/dataset-sts/data/sts/semeval-sts/2012/MSRpar.train.tsv',
  '../data/dataset-sts/data/sts/semeval-sts/2012/SMTeuroparl.train.tsv',
  '../data/dataset-sts/data/sts/semeval-sts/all/2015.val.tsv',
  '../data/dataset-sts/data/sts/semeval-sts/all/2015.train.tsv'],
 ['../data/dataset-sts/data/sts/semeval-sts/2012/MSRpar.test.tsv',
  '../data/dataset-sts/data/sts/semeval-sts/2012/SMTeuroparl.test.tsv',
  '../data/dataset-sts/data/sts/semeval-sts/2012/OnWN.test.tsv',
  '../data/dataset-sts/data/sts/semeval-sts/2012/SMTnews.test.tsv',
  '../data/dataset-sts/data/sts/semeval-sts/all/2015.test.tsv'])

In [5]:
# Gather every sentence (both columns, from the train and the test files) as the
# corpus used to estimate IDF weights.
all_docs = []
for file in tqdm(train_files + test_files):
    # The TSV files have no header row: columns are score, first sentence, second sentence
    df = pd.read_csv(file, sep='\t', names=['score', 'sent1', 'sent2'], header=None)
    # Drop rows with any missing field
    df.dropna(inplace=True)
    for idx, row in df.iterrows():
        all_docs.append(row['sent1'])
        all_docs.append(row['sent2'])

# The vectorizer is fitted only to obtain per-token IDF values; NLTK's
# word_tokenize replaces the default tokenizer.
vectorizer = TfidfVectorizer(tokenizer=word_tokenize) 
vectorizer.fit(all_docs)
print(f"size of vocab: {len(vectorizer.vocabulary_):,}")
# vocabulary_ maps token -> column index, which is used to index into idf_
word_idf = {key: vectorizer.idf_[idx] 
            for key,idx in tqdm(vectorizer.vocabulary_.items(), total=len(vectorizer.idf_))}
# Only the token -> IDF map is needed from here on
del vectorizer

100%|██████████| 9/9 [00:01<00:00,  7.46it/s]
  1%|▏         | 254/18090 [00:00<00:07, 2535.77it/s]

size of vocab: 18,090


100%|██████████| 18090/18090 [00:07<00:00, 2365.30it/s]


In [6]:
# Vocabulary-wide IDF statistics; required while generating sentence embeddings
min_idf = np.array(list(word_idf.values())).min()
max_idf = np.array(list(word_idf.values())).max()
mean_idf = np.array(list(word_idf.values())).mean()

def rescale_idf(val):
    """
    Min-max rescale an IDF value using the vocabulary-wide ``min_idf`` and ``max_idf``.

    :param val: raw IDF value
    :return: (val - min_idf) / (max_idf - min_idf), i.e. 0 at ``min_idf`` and 1 at ``max_idf``
    """
    idf_range = max_idf - min_idf
    offset = val - min_idf
    return offset / idf_range

def compute_pc(X, npc=1):
    """
    Fit a truncated SVD on X and return its leading components.
    The data is deliberately NOT made zero mean beforehand.

    Adapted from the SIF paper code: https://openreview.net/pdf?id=SyK00v5xx

    :param X: data matrix, X[i,:] is a data point
    :param npc: number of principal components to compute
    :return: array whose row i is the i-th principal component
    """
    decomposition = TruncatedSVD(n_components=npc, n_iter=7, random_state=0)
    # fit() returns the fitted estimator, so the components can be read off directly
    return decomposition.fit(X).components_

def remove_pc(X, npc=1):
    """
    Subtract from every data point its projection onto the leading principal components.

    Adapted from the SIF paper code: https://openreview.net/pdf?id=SyK00v5xx

    :param X: data matrix, X[i,:] is a data point
    :param npc: number of principal components to remove
    :return: matrix of the same shape as X with the projections removed
    """
    components = compute_pc(X, npc)
    # Coordinates of each data point along each component
    projections = X.dot(components.transpose())
    if npc != 1:
        return X - projections.dot(components)
    # Single component: broadcasting the coordinate column against the component row
    return X - projections * components

def get_sent_embeddings(text:str,
                        word_idf_map:Optional[Dict[str,float]]=None,
                       use_pca=True):
    """
    Provides the IDF-weighted average of a sentence's word vectors, optionally with the
    first principal component of those word vectors removed.

    Expectations:
    A word can only appear once in a sentence; multiple occurrences (compared after
    lower-casing) are collapsed and the last occurrence wins.
    Must have 2 or more embeddings, otherwise the principal component cannot be found and removed.

    :param text: the sentence to embed
    :param word_idf_map: token -> IDF value; when omitted or empty the notebook-level
                         ``word_idf`` map is used
    :param use_pca: when True, remove the first principal component from the word
                    vectors before averaging
    :return: 1-D numpy array; a 300-dimensional zero vector when fewer than two distinct
             tokens have an embedding or when all weights are zero
    """
    # Use a separate local name: assigning to `word_idf` here would make it local to the
    # function and raise UnboundLocalError whenever no map is passed in.
    idf_lookup = word_idf_map if word_idf_map else word_idf
    tokens = word_tokenize(text)
    embed_map = {}
    for tok in tokens:
        vector = token_embedding.get(tok, 0)
        if np.all(vector == 0): # skip missing or all-zero embeddings
            continue
        key = tok.lower()
        # Tokens absent from the IDF map fall back to min_idf, which rescales to a weight of 0
        embed_map[key] = (rescale_idf(idf_lookup.get(key, min_idf)), vector)
    if len(embed_map) < 2: # we can't create a sentence embedding for just one word
        return np.zeros(300)
    weights, embedds = zip(*embed_map.values())
    total_weight = sum(weights)
    if total_weight == 0:
        return np.zeros(300)
    embedds = np.array(embedds)
    if use_pca:
        embedds = remove_pc(embedds)
    # Normalise the weights so that they sum to one
    scale_factor = 1 / total_weight
    scaled_vals = np.array([weight * scale_factor for weight in weights])
    # apply our weighted terms to the adjusted embeddings
    weighted_embeds = embedds * scaled_vals[:,None]
    return np.sum(weighted_embeds, axis=0)


In [7]:
def get_xy(pca_flag):
    """
    Build the feature matrix and score vector from every train and test file.

    Each row of X is the sentence embedding of ``sent1`` concatenated with that
    of ``sent2``; y holds the matching gold score.

    :param pca_flag: forwarded to get_sent_embeddings as ``use_pca``
    :return: tuple (X, y) of numpy arrays
    """
    features, scores = [], []
    for path in train_files + test_files:
        pairs = pd.read_csv(path, sep='\t', names=['score', 'sent1', 'sent2'], header=None).dropna()
        for _, pair in pairs.iterrows():
            first = get_sent_embeddings(pair['sent1'], word_idf, use_pca=pca_flag)
            second = get_sent_embeddings(pair['sent2'], word_idf, use_pca=pca_flag)
            features.append(np.concatenate([first, second]))
            scores.append(pair['score'])
    return np.array(features), np.array(scores)

In [8]:
def train_and_report(features, targets, title):
    """
    Train the dense regressor on an 80/10/10 split and print its MSE for each split.

    Both experiments in this cell go through this routine so that they differ
    only in the features they are given.

    :param features: 2-D array, one row per sentence pair
    :param targets: gold similarity scores aligned with ``features``
    :param title: heading printed before the results
    :return: (model, history, splits) where splits is
             (X_train, X_validation, X_test, y_train, y_validation, y_test)
    """
    # Stratify on the rounded score so the integer score buckets keep their proportions
    X_train, X_holdout, y_train, y_holdout = train_test_split(
        features, targets,
        train_size=.8,
        random_state=12,
        stratify=np.round(targets))
    # Halve the 20% hold-out into a test set and a validation set
    X_test, X_validation, y_test, y_validation = train_test_split(
        X_holdout, y_holdout,
        train_size=.5,
        random_state=12,
        stratify=np.round(y_holdout))

    ## Build a simple model
    model = models.Sequential([
        layers.Dense(1024, input_shape=(features.shape[1],), activation='relu'),
        layers.Dense(512, activation='relu'),
        layers.Dense(1, activation="linear")
    ])
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['mean_squared_error'])
    history = model.fit(X_train,
                        y_train,
                        epochs=20,
                        batch_size=128,
                        validation_data=(X_validation, y_validation),
                        verbose=0)

    ## Evaluate and adjust as necessary
    # The value labelled "acc" is the mean_squared_error metric requested in compile(),
    # so it always equals the loss; the label is kept to match the recorded outputs.
    res = model.evaluate(X_train, y_train)
    print(title)
    print(f"Train results: loss {res[0]:.3f} acc: {res[1]:.3f}")
    for split_name, split_X, split_y in (('Validation', X_validation, y_validation),
                                         ('Test', X_test, y_test)):
        res = model.evaluate(split_X, split_y)
        print(f"{split_name} results: loss {res[0]:.3f} acc: {res[1]:.3f}")
    return model, history, (X_train, X_validation, X_test, y_train, y_validation, y_test)


X, y = get_xy(pca_flag=True)
model, history, splits = train_and_report(X, y, 'Using PCA')

#######################

X, y = get_xy(pca_flag=False)
model, history, splits = train_and_report(X, y, 'Without PCA')
# Later cells read X, y and the split arrays, so re-export the no-PCA run at notebook level
X_train, X_validation, X_test, y_train, y_validation, y_test = splits

Using PCA
Train results: loss 0.071 acc: 0.071
Validation results: loss 1.125 acc: 1.125
Test results: loss 1.189 acc: 1.189
Without PCA
Train results: loss 0.049 acc: 0.049
Validation results: loss 0.772 acc: 0.772
Test results: loss 0.703 acc: 0.703


In [9]:
# Sizes of the most recent split (in top-to-bottom execution: the no-PCA features)
print(f"X_train size: {X_train.shape}, X_test size: {X_test.shape}, X_validation size: {X_validation.shape}")

X_train size: (11396, 600), X_test size: (1425, 600), X_validation size: (1425, 600)


# Now let's only run PCA on the collection of sentences
Note: this isn't very practical when sentence embeddings are created and used in isolation, because the principal component has to be estimated from the whole collection of sentence embeddings.

In [10]:
# In top-to-bottom execution X is the no-PCA feature matrix from the last get_xy call.
# Each row holds two concatenated sentence embeddings; split them into a left and a right half.
xone, xtwo = np.hsplit(X, 2)
# Stack the halves so that every sentence is one row of a single collection
xcopy = np.vstack([xone, xtwo])
# Remove the first principal component computed over the whole collection.
# NOTE(review): the component is estimated from all rows, including those that the
# split in the next cell assigns to validation/test.
xcopy = remove_pc(xcopy)
# Undo the stacking: first half of the rows are the left sentences, second half the right
x_one, x_two = np.vsplit(xcopy, 2)
# Put each pair back side by side, restoring the original row layout
Xpca =  np.hstack([x_one, x_two])

In [11]:
# 80/10/10 split of the collection-level-PCA features, stratified on the rounded
# score and using the same random_state as the earlier experiments.
X_train, X_test, y_train, y_test = train_test_split(Xpca, y,
                                                    train_size=.8, 
                                                    random_state=12,
                                                   stratify=np.round(y))
# Halve the 20% hold-out into a test set and a validation set
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test,
                                                              train_size=.5,
                                                              random_state=12,
                                                              stratify=np.round(y_test))

## Build a simple model
# Two hidden ReLU layers and a single linear output unit, i.e. a regressor for the score
model = models.Sequential([
    layers.Dense(1024, input_shape=(600,), activation='relu'),
    layers.Dense(512, activation='relu'), 
    layers.Dense(1, activation="linear")
])
# The tracked metric is the same quantity as the loss (mean squared error)
model.compile(loss='mean_squared_error', 
              optimizer='adam',
              metrics=['mean_squared_error'])
history = model.fit(X_train,
                    y_train,
                    epochs=20,
                    batch_size=128,
                    validation_data=(X_validation, y_validation),
                    verbose=0)
## Evaluate and adjust as necessary
# evaluate() returns [loss, metric]; the value printed as "acc" is the
# mean_squared_error metric, not an accuracy.
res = model.evaluate(X_train, y_train)
print('PCA only on the collection')
print(f"Train results: loss {res[0]:.3f} acc: {res[1]:.3f}")
res =  model.evaluate(X_validation, y_validation)
print(f"Validation results: loss {res[0]:.3f} acc: {res[1]:.3f}")
res = model.evaluate(X_test, y_test)
print(f"Test results: loss {res[0]:.3f} acc: {res[1]:.3f}")

PCA only on the collection
Train results: loss 0.046 acc: 0.046
Validation results: loss 0.671 acc: 0.671
Test results: loss 0.678 acc: 0.678


# Summary results

In [13]:
# Using PCA
# Train results: loss 0.071 acc: 0.071
# 45/45 [==============================] - 0s 3ms/step - loss: 1.1252 - mean_squared_error: 1.1252
# Validation results: loss 1.125 acc: 1.125
# 45/45 [==============================] - 0s 2ms/step - loss: 1.1891 - mean_squared_error: 1.1891
# Test results: loss 1.189 acc: 1.189

# Without PCA
# Train results: loss 0.049 acc: 0.049
# 45/45 [==============================] - 0s 3ms/step - loss: 0.7718 - mean_squared_error: 0.7718
# Validation results: loss 0.772 acc: 0.772
# 45/45 [==============================] - 0s 2ms/step - loss: 0.7027 - mean_squared_error: 0.7027
# Test results: loss 0.703 acc: 0.703

# PCA only on the collection
# Train results: loss 0.046 acc: 0.046
# 45/45 [==============================] - 0s 3ms/step - loss: 0.6707 - mean_squared_error: 0.6707
# Validation results: loss 0.671 acc: 0.671
# 45/45 [==============================] - 0s 3ms/step - loss: 0.6778 - mean_squared_error: 0.6778
# Test results: loss 0.678 acc: 0.678