In [1]:
import pickle
import string
import time
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords, twitter_samples
from nltk.tokenize import TweetTokenizer
from functions import get_dict,cosine_similarity

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Vidit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
en_embeddings = pickle.load(open("en_embeddings.p", "rb"))
fr_embeddings = pickle.load(open("fr_embeddings.p", "rb"))
# both of them are dictionaries with keys as words are values as 300 dimension vectors

In [3]:
en_fr_train = get_dict('en-fr.train.txt')
print('The length of the English to French training dictionary is', len(en_fr_train))
en_fr_test = get_dict('en-fr.test.txt')
print('The length of the English to French test dictionary is', len(en_fr_train))

The length of the English to French training dictionary is 5000
The length of the English to French test dictionary is 5000


In [4]:
en_fr_train # English Words with their corresponding French Words

{'the': 'la',
 'and': 'et',
 'was': 'était',
 'for': 'pour',
 'that': 'cela',
 'with': 'avec',
 'from': 'depuis',
 'this': 'ce',
 'utc': 'tuc',
 'his': 'son',
 'not': 'pas',
 'are': 'sont',
 'talk': 'parlez',
 'which': 'lequel',
 'also': 'egalement',
 'were': 'étaient',
 'but': 'mais',
 'have': 'ont',
 'one': 'one',
 'new': 'nouveautés',
 'first': 'premiers',
 'page': 'page',
 'you': 'you',
 'they': 'eux',
 'had': 'avais',
 'article': 'article',
 'who': 'who',
 'all': 'all',
 'their': 'leurs',
 'there': 'là',
 'made': 'fabriqué',
 'its': 'son',
 'people': 'personnes',
 'may': 'peut',
 'after': 'aprés',
 'other': 'autres',
 'should': 'devrais',
 'two': 'deux',
 'score': 'partition',
 'her': 'her',
 'can': 'peut',
 'would': 'ferait',
 'more': 'plus',
 'she': 'elle',
 'when': 'quand',
 'time': 'heure',
 'team': 'equipe',
 'american': 'américains',
 'such': 'telles',
 'discussion': 'débat',
 'links': 'liens',
 'only': 'seule',
 'some': 'quelques',
 'see': 'vois',
 'united': 'unies',
 'year

In [5]:
def get_matrices(en_fr,en_embeddings,fr_embeddings):
    X = []
    Y = []
    
    for e_words,f_words in en_fr.items():
        if e_words in en_embeddings.keys() and f_words in fr_embeddings:
            X.append(en_embeddings[e_words])
            Y.append(fr_embeddings[f_words])
            
    E = np.vstack(X)
    F = np.vstack(Y)
    return E,F

In [6]:
X_train, Y_train = get_matrices(en_fr_train, fr_embeddings, en_embeddings)

In [7]:
def compute_loss(X,Y,R):
    m = X.shape[0]
    Y_pred = np.dot(X,R)
    diff = Y_pred - Y
    loss = (np.sum(diff*diff))/m
    return loss

In [8]:
def compute_gradient(X,Y,R):
    m = X.shape[0]
    grad = np.dot(X.transpose(),(np.dot(X,R)-Y))*(2/m) # this is the formula of derivative of loss with respect to R
    return grad

In [9]:
def grad_descent(X,Y,steps,lrate):
    R = np.random.rand(X.shape[1], X.shape[1])
    for i in range(steps):
        loss = compute_loss(X,Y,R)
        if(i%25==0):
            print(f"Loss at {i}th iteration is {loss}")
        gradient = compute_gradient(X,Y,R)
        R -= (lrate*gradient)
    return R

In [10]:
R_train = grad_descent(X_train, Y_train, 500, 1)

Loss at 0th iteration is 113.30008893898864
Loss at 25th iteration is 56.632300102304384
Loss at 50th iteration is 42.004268584682166
Loss at 75th iteration is 32.73379445505898
Loss at 100th iteration is 26.404823698836655
Loss at 125th iteration is 21.894063796837983
Loss at 150th iteration is 18.572910006726687
Loss at 175th iteration is 16.062116009385157
Loss at 200th iteration is 14.121378132493131
Loss at 225th iteration is 12.592611738076654
Loss at 250th iteration is 11.36856655664937
Loss at 275th iteration is 10.374533836630802
Loss at 300th iteration is 9.55725119860565
Loss at 325th iteration is 8.877953086310708
Loss at 350th iteration is 8.307897443499304
Loss at 375th iteration is 7.82541618993656
Loss at 400th iteration is 7.413927617014762
Loss at 425th iteration is 7.0605699496885
Loss at 450th iteration is 6.755244449544945
Loss at 475th iteration is 6.4899337953196445


In [11]:
def nearest_neighbor(v,sv,k):
    i = []
    for row in sv:
        i.append(cosine_similarity(v,row))
    i_sorted = np.argsort(i)
    k_i = i_sorted[-k:]
    return k_i

In [12]:
def test(X,Y,R):
    pred = np.dot(X,R)
    num_correct = 0

    for i in range(len(pred)):
        pred_idx = nearest_neighbor(pred[i],Y,1)
        if pred_idx == i:
            num_correct += 1
            
    accuracy = num_correct / len(pred)

    return accuracy

In [13]:
X_val, Y_val = get_matrices(en_fr_test, fr_embeddings, en_embeddings)

In [14]:
acc = test(X_val, Y_val, R_train)
print(f"Accuracy on test set is {acc*100}%")
# Later on we will see how to increase this

Accuracy on test set is 31.712962962962965%
