In [1]:
import os
import re
import nltk
import pickle
import scipy
import sklearn as sk
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer

EMBEDDING_DIM = 300

In [2]:
def get_dict(file_name):
    """
    This function returns the english to french dictionary given a file where the each column corresponds to a word.
    Check out the files this function takes in your workspace.
    """
    my_file = pd.read_csv(file_name, delimiter=' ')
    etof = {}  # the english to french dictionary to be returned
    for i in range(len(my_file)):
        # indexing into the rows.
        en = my_file.loc[i][0]
        fr = my_file.loc[i][1]
        etof[en] = fr

    return etof

en_embeddings_subset = pickle.load(open("en_embeddings.p", "rb"))
fr_embeddings_subset = pickle.load(open("fr_embeddings.p", "rb"))

# loading the english to french dictionaries
en_fr_train = get_dict('en-fr.train.txt')
print('The length of the English to French training dictionary is', len(en_fr_train))
en_fr_test = get_dict('en-fr.test.txt')
print('The length of the English to French test dictionary is', len(en_fr_train))

The length of the English to French training dictionary is 5000
The length of the English to French test dictionary is 5000


In [20]:
def prepare_data(en_fr_train, en_embeddings_subset, fr_embeddings_subset):
    X = []
    Y = []
    en_words = []
    fr_words = []
    for en_word, fr_word in en_fr_train.items():
        if en_word in en_embeddings_subset and fr_word in fr_embeddings_subset:
            X.append(en_embeddings_subset[en_word])
            en_words.append(en_word)
            Y.append(fr_embeddings_subset[fr_word])
            fr_words.append(fr_word)
            
    return np.vstack(X), np.vstack(Y), en_words, fr_words

train_x, train_y, en_train, fr_train = prepare_data(en_fr_train, en_embeddings_subset, fr_embeddings_subset)
test_x, test_y, en_test, fr_test = prepare_data(en_fr_test, en_embeddings_subset, fr_embeddings_subset)

In [21]:
train_x.shape, train_y.shape, test_x.shape, test_y.shape

((4932, 300), (4932, 300), (1438, 300), (1438, 300))

In [14]:
NUM_ITER = 500
BATCH_SIZE = 32
def train(train_x, train_y):
    R = np.random.rand(EMBEDDING_DIM, EMBEDDING_DIM)
    print(R.shape)
    num_batches = np.ceil(train_x.shape[0]/BATCH_SIZE)
    
    for iter_idx in range(NUM_ITER):
        start_idx = 0
        end_idx = start_idx + BATCH_SIZE
        loss = 0
        for batch_idx in range(int(num_batches)):
            x_batch = train_x[start_idx:end_idx, :]
            y_batch = train_y[start_idx:end_idx, :]
            
            loss += np.sum((np.dot(x_batch, R)-y_batch)**2)/train_x.shape[0]
            start_idx += BATCH_SIZE
            end_idx += BATCH_SIZE
            
            R -= np.dot(x_batch.transpose(),np.dot(x_batch,R)-y_batch)*(2/train_x.shape[0])
        
        if (iter_idx + 1)%25 == 0:
            print(f"The cost after iter: {iter_idx} is {loss}")
    
    return R

R = train(train_x, train_y)

(300, 300)
The cost after iter: 24 is 73.26030145149589
The cost after iter: 49 is 16.789929834251637
The cost after iter: 74 is 5.571954207315096
The cost after iter: 99 is 2.4418295980738693
The cost after iter: 124 is 1.3639677279886382
The cost after iter: 149 is 0.9362463934753547
The cost after iter: 174 is 0.749032207248805
The cost after iter: 199 is 0.6610885484583374
The cost after iter: 224 is 0.6174871317802428
The cost after iter: 249 is 0.5949090135050761
The cost after iter: 274 is 0.582783302467253
The cost after iter: 299 is 0.5760648079309624
The cost after iter: 324 is 0.5722409902895709
The cost after iter: 349 is 0.570013943662149
The cost after iter: 374 is 0.568691179930333
The cost after iter: 399 is 0.5678924068630982
The cost after iter: 424 is 0.5674033310590154
The cost after iter: 449 is 0.5671004146861212
The cost after iter: 474 is 0.5669110058455283
The cost after iter: 499 is 0.5667916372153659


1. Each plane divides the space to $2$ parts.
2. So $n$ planes divide the space into $2^{n}$ hash buckets.
3. We want to organize 10,000 document vectors into buckets so that every bucket has about $~16$ vectors.
4. For that we need $\frac{10000}{16}=625$ buckets.
5. We're interested in $n$, number of planes, so that $2^{n}= 625$. Now, we can calculate $n=\log_{2}625 = 9.29 \approx 10$

In [22]:
def cosine_similarity(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) *np.linalg.norm(v))

In [38]:
def k_nearest_neighbours(v, candidates, k=1):
    
    neighbours = []
    for candidate in candidates:
        sim = cosine_similarity(v, candidate)
        neighbours.append(sim)
        
    sorted_idxs = np.argsort(neighbours)
    
    return sorted_idxs[-k:]

In [53]:
# predict for a word
def predict(word, word_vec, R, candidates, fr_word_list):
    print(word_vec.shape, R.shape)
    word_idx = k_nearest_neighbours(np.dot(word_vec, R), candidates, k=10)

    print(word)
    print([fr_test[int(idx)] for idx in word_idx])
    
predict(en_test[30], test_x[30], R, test_y, fr_test)
print(fr_test[30])

(300,) (300, 300)
worn
['laiton', 'épaule', 'portait', 'capuchon', 'sculptés', 'porté', 'rembourrage', 'nettoyés', 'vêtement', 'habillés']
porté


In [54]:
def test_accuracy(test_x, test_y):
    preds = np.dot(test_x, R)
    correct = 0
    for i in range(len(test_x)):
        pred_idx = k_nearest_neighbours(preds[i], test_y)
        
        if int(pred_idx) == i:
            correct += 1
            
    return correct*100/len(test_x)
            
    
test_accuracy(test_x, test_y)

56.11961057023644