## Hello vectors

In [None]:
## From the NLP Specialization; coded by Trishit Nath Thakur

In [None]:
## importing libraries

import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from utils import get_vectors

In [None]:
# Read the space-separated analogy file and label its four fields.
# NOTE(review): read_csv treats the file's first line as the header row here, and the
# labels assigned below then replace it — if capitals.txt has no header line, its first
# record is dropped. Confirm against the file.
analogy_columns = ['city1', 'country1', 'city2', 'country2']
data = pd.read_csv('capitals.txt', sep=' ')
data.columns = analogy_columns

In [None]:
## dataset imported from https://code.google.com/archive/p/word2vec/

In [None]:
import nltk
from gensim.models import KeyedVectors

# Load the full pretrained word2vec model from the GoogleNews binary file.
embeddings = KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary = True)

# Read the analogy file; the context manager closes the handle as soon as the
# text is in memory instead of leaving it to the garbage collector.
with open('capitals.txt', 'r') as capitals_file:
    f = capitals_file.read()

# Every token that appears in capitals.txt is a word whose vector we want to keep.
set_words = set(nltk.word_tokenize(f))

# Extra words that are not part of the capitals file; `words` is kept as an
# alias of the same list, as before.
select_words = words = ['king', 'queen', 'oil', 'gas', 'happy', 'sad', 'city', 'town', 'village', 'country', 'continent', 'petroleum', 'joyful']
set_words.update(select_words)

def get_word_embeddings(embeddings, words=None):
    """
    Extract the vectors of the words of interest from a loaded model.

    Input:
        embeddings: gensim KeyedVectors-like object; it exposes its vocabulary as
            `key_to_index` (gensim >= 4.0) or `vocab` (older releases) and
            returns a vector for `embeddings[word]`
        words: optional collection of words to keep; when omitted, the
            notebook-level `set_words` is used
    Output:
        word_embeddings: dictionary mapping each kept word to its vector
    """
    wanted = set_words if words is None else words

    # gensim 4.0 replaced KeyedVectors.vocab with key_to_index; prefer the new
    # attribute and fall back to the old one so both releases work.
    vocabulary = getattr(embeddings, 'key_to_index', None)
    if vocabulary is None:
        vocabulary = embeddings.vocab

    return {word: embeddings[word] for word in vocabulary if word in wanted}


In [None]:
# NOTE: pickle.load can execute arbitrary code while deserializing — only load
# this file when it comes from a trusted source.
# The context manager closes the file handle once the dictionary is loaded.
with open("word_embeddings_subset.p", "rb") as subset_file:
    word_embeddings = pickle.load(subset_file)

# Displayed as the cell output: number of entries in the loaded object.
len(word_embeddings)

In [None]:
## implementing cosine similarity

def cosine_similarity(A, B):
    '''
    Cosine of the angle between two word vectors.

    Input:
        A: a numpy array holding a word vector
        B: a numpy array holding a word vector
    Output:
        cos: the dot product of A and B divided by the product of their
             Euclidean lengths
    '''
    # Length of each vector, computed as the square root of its self dot product.
    length_a, length_b = (np.sqrt(np.dot(v, v)) for v in (A, B))

    return np.dot(A, B) / (length_a * length_b)

In [None]:
## implementing euclidean distance

def euclidean(A, B):
    """
    Straight-line distance between two word vectors.

    Input:
        A: a numpy array holding a word vector
        B: a numpy array holding a word vector
    Output:
        d: the norm of the element-wise difference A - B
    """
    offset = A - B
    return np.linalg.norm(offset)

In [None]:
## finding country of each capital

def get_country(city1, country1, city2, embeddings):
    """
    Solve the analogy "city1 is to country1 as city2 is to ?".

    Input:
        city1: the capital city of country1
        country1: the country of capital1
        city2: the capital city of country2
        embeddings: a dictionary where the keys are words and values are their embeddings
    Output:
        country: a tuple (word, similarity) with the most likely country and its
            cosine similarity to the analogy vector; (None, -1) when no candidate
            scores above -1 (for example when the dictionary only holds the
            three query words)
    Raises:
        KeyError: if city1, country1 or city2 is not a key of embeddings
    """

    # The three query words are never returned as the answer.
    group = {city1, country1, city2}

    # All vectors come from the `embeddings` argument, not from a notebook global,
    # so the function works with whichever dictionary the caller passes in.
    city1_emb = embeddings[city1]
    country1_emb = embeddings[country1]
    city2_emb = embeddings[city2]

    # Analogy vector: country1 - city1 + city2 should land near country2.
    vec = country1_emb - city1_emb + city2_emb

    similarity = -1      # best similarity seen so far
    country = (None, similarity)      # defined up front so it is always bound at return

    for word, word_emb in embeddings.items():

        if word in group:
            continue

        cur_similarity = cosine_similarity(vec, word_emb)

        if cur_similarity > similarity:
            similarity = cur_similarity
            country = (word, similarity)

    return country

In [None]:
# Example query: Athens is to Greece as Cairo is to ...?
get_country('Athens', 'Greece', 'Cairo', word_embeddings)

In [None]:
## checking model accuracy

def get_accuracy(word_embeddings, data):
    '''
    Fraction of analogy rows whose predicted country matches the expected one.

    Input:
        word_embeddings: a dictionary where the key is a word and the value is its embedding
        data: a pandas dataframe with the columns city1, country1, city2 and country2

    Output:
        accuracy: number of correct predictions divided by the number of rows
    '''
    hits = 0

    # Walk the four columns in lockstep, one analogy per row.
    analogies = zip(data['city1'], data['country1'], data['city2'], data['country2'])

    for capital_a, nation_a, capital_b, expected in analogies:
        guess, _ = get_country(capital_a, nation_a, capital_b, word_embeddings)
        if guess == expected:
            hits += 1

    return hits / len(data)

In [None]:
# Score the analogy task over `data` and print the result rounded to two decimals.
accuracy = get_accuracy(word_embeddings, data)
print(f"Accuracy is {accuracy:.2f}")

In [None]:
## plotting vectors using PCA

In [None]:
def compute_pca(X, n_components = 2):
    """
    Project the rows of X onto their leading principal components.

    Input:
        X: of dimension (m,n) where each row corresponds to a word vector
        n_components: Number of components you want to keep.
    Output:
        X_reduced: array of dimension (m, n_components); each row is the
            mean-centered input row expressed in the kept components
    """
    # Center every column on zero.
    centered = X - np.mean(X, axis = 0)

    # Covariance between columns (rowvar=False: columns are the variables).
    cov = np.cov(centered, rowvar=False)

    # eigh is the symmetric-matrix solver; it reads the lower triangle here.
    eigen_vals, eigen_vecs = np.linalg.eigh(cov, UPLO = 'L')

    # Column order that puts the largest eigenvalue first.
    descending = np.argsort(eigen_vals)[::-1]

    # Keep the eigenvectors (columns) belonging to the largest eigenvalues.
    top_vecs = eigen_vecs[:, descending][:, 0:n_components]

    # (n_components, n) x (n, m) -> (n_components, m), transposed back to (m, n_components).
    return np.dot(top_vecs.transpose(), centered.transpose()).transpose()

In [None]:
## testing on words

## testing on words

words = [
    'oil', 'gas', 'happy', 'sad', 'city', 'town',
    'village', 'country', 'continent', 'petroleum', 'joyful',
]

# One embedding row per word, then reduce to two PCA components for plotting.
X = get_vectors(word_embeddings, words)
result = compute_pca(X, 2)

# plotting

xs, ys = result[:, 0], result[:, 1]
plt.scatter(xs, ys)

# Label each point, nudged slightly left and up so the text does not sit on the marker.
for label, x, y in zip(words, xs, ys):
    plt.annotate(label, xy =(x - 0.05, y + 0.1))

plt.show()