In [46]:
# Fetch the NLTK resources for this notebook.
import nltk
nltk.download('stopwords')  # stop-word lists read by process_tweet via stopwords.words('english')
nltk.download('twitter_samples')  # NOTE(review): not referenced by the cells visible here - confirm it is still needed
nltk.download('punkt')  # last expression of the cell, so its return value is what the cell displays

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package twitter_samples to /root/nltk_data...
[nltk_data]   Package twitter_samples is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
import numpy as np
import pandas as pd
import string
import re
import scipy
import sklearn
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize, TweetTokenizer
import gensim
from gensim.models import KeyedVectors
import pdb
import pickle

In [12]:
def process_tweet(tweet):
  """Clean a tweet and return its stemmed tokens.

  Steps, in order: drop ``$``-prefixed tokens, drop hyperlinks, drop ``#``
  characters, drop a leading ``RT`` marker, tokenize (lower-casing,
  shortening repeated characters and stripping @handles), then discard
  English stop words and punctuation and stem whatever is left.

  Args:
    tweet: The raw tweet text.

  Returns:
    A list of stemmed token strings, in their original order.
  """
  tweet = re.sub(r'\$\w*', '', tweet)
  # Match only the URL itself (\S+), not '.*', so the text following a link
  # on the same line is kept instead of being deleted with it.
  tweet = re.sub(r'https?://\S+', '', tweet)
  tweet = re.sub(r'#', '', tweet)
  tweet = re.sub(r'^RT[\s]+', '', tweet)

  stemmer = PorterStemmer()
  tokenizer = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
  # Build the stop-word lookup once; calling stopwords.words() per token
  # re-reads the list for every word of the tweet.
  english_stopwords = set(stopwords.words('english'))

  return [
      stemmer.stem(word)
      for word in tokenizer.tokenize(tweet)
      # `in string.punctuation` is a substring test against the punctuation string.
      if word not in english_stopwords and word not in string.punctuation
  ]

In [13]:
# Quick check of process_tweet on a plain English sentence.
a='What the weather like today I think it is sunny'
print(process_tweet(a))

['weather', 'like', 'today', 'think', 'sunni']


In [5]:

def get_dict(path, header='infer'):
  """Read a space-separated two-column word list into a dictionary.

  Args:
    path: Location of the file; column 0 holds the keys (English words in
      this notebook) and column 1 the values (French words).
    header: Passed straight to ``pandas.read_csv``. With the default
      ``'infer'`` the first line of the file is consumed as column names
      and does NOT become a dictionary entry; pass ``header=None`` when the
      file has no header row and its first pair must be kept.

  Returns:
    A dict mapping each first-column word to its second-column word. When a
    key occurs on several rows, the last row wins.
  """
  data = pd.read_csv(path, delimiter=' ', header=header)
  # Explicit positional column access; avoids the per-row `.loc[i][0]` lookup
  # that relies on integer keys being treated as positions.
  return dict(zip(data.iloc[:, 0], data.iloc[:, 1]))


In [7]:
# Mount Google Drive at /content/drive; the dictionary files are read from there.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Load the word-pair dictionaries: one for fitting the mapping, one for evaluating it.
# NOTE: despite its name, folder_path is the path of the training file itself, not of a folder.
folder_path='/content/drive/My Drive/en-fr.train.txt'
en_fra_train=get_dict(folder_path)
en_fra_test=get_dict('/content/drive/My Drive/en-fr.test.txt')

In [26]:
def cos_of_two_vector(v1,v2):
  """Return the cosine similarity of two vectors.

  Computed as the dot product of ``v1`` and ``v2`` divided by the product of
  their Euclidean norms. No guard is applied for zero-length vectors.
  """
  norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
  return np.dot(v1, v2) / norm_product


In [9]:
import os

import requests
from gensim.models import KeyedVectors

# Download the French word vectors (skipped when a previous run already
# fetched them) and load them as gensim KeyedVectors.
url = 'https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.fr.vec'
fr_vec_path = 'wiki.multi.fr.vec'
if not os.path.exists(fr_vec_path):
    # Stream the body to disk in chunks instead of holding the whole file in
    # memory, and fail loudly on an HTTP error rather than saving an error page.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(fr_vec_path + '.part', 'wb') as vec_file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                vec_file.write(chunk)
    # Rename only after a complete download so an interrupted run never
    # leaves a truncated file under the final name.
    os.replace(fr_vec_path + '.part', fr_vec_path)
fr_embeddings = KeyedVectors.load_word2vec_format(fr_vec_path)

In [11]:
import os

# Download the English word vectors (skipped when a previous run already
# fetched them) and load them as gensim KeyedVectors.
url = 'https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.en.vec'
en_vec_path = 'wiki.multi.en.vec'
if not os.path.exists(en_vec_path):
    # Stream the body to disk in chunks instead of holding the whole file in
    # memory, and fail loudly on an HTTP error rather than saving an error page.
    with requests.get(url, stream=True) as response:
        response.raise_for_status()
        with open(en_vec_path + '.part', 'wb') as vec_file:
            for chunk in response.iter_content(chunk_size=1 << 20):
                vec_file.write(chunk)
    # Rename only after a complete download so an interrupted run never
    # leaves a truncated file under the final name.
    os.replace(en_vec_path + '.part', en_vec_path)
en_embeddings = KeyedVectors.load_word2vec_format(en_vec_path)

In [19]:
# Restrict the full embedding tables to the words that occur in the train and
# test dictionaries, then cache the two small lookups on disk.
english_set = set(en_embeddings.key_to_index.keys())
french_set = set(fr_embeddings.key_to_index.keys())
en_embeddings_subset = {}
fr_embeddings_subset = {}
french_words = set(en_fra_train.values())  # not used in this cell; kept in case another cell reads it

# A pair is kept only when BOTH words have a vector, so every stored English
# vector has a stored French counterpart.
for en_fr_pairs in (en_fra_train, en_fra_test):
    for en_word, fr_word in en_fr_pairs.items():
        if fr_word in french_set and en_word in english_set:
            en_embeddings_subset[en_word] = en_embeddings[en_word]
            fr_embeddings_subset[fr_word] = fr_embeddings[fr_word]

# `with` closes (and therefore flushes) each file before any later cell reads it back.
with open("en_embeddings.p", "wb") as en_file:
    pickle.dump(en_embeddings_subset, en_file)
with open("fr_embeddings.p", "wb") as fr_file:
    pickle.dump(fr_embeddings_subset, fr_file)

In [20]:
def get_matrix(en_fr,en_vec,fr_vec):
  """Stack the embeddings of dictionary pairs into two row-aligned matrices.

  Args:
    en_fr: Mapping from English word to French word.
    en_vec: Mapping from English word to its embedding vector.
    fr_vec: Mapping from French word to its embedding vector.

  Returns:
    A tuple ``(X, Y)`` of arrays built with ``np.vstack``. Row ``i`` of ``X``
    is the English embedding and row ``i`` of ``Y`` the French embedding of
    the same pair. Pairs missing from either embedding mapping are skipped.

  Raises:
    ValueError: If no pair has an embedding on both sides.
  """
  x_list = []
  y_list = []
  for en_word, fr_word in en_fr.items():
    if fr_word in fr_vec and en_word in en_vec:
      x_list.append(en_vec[en_word])
      y_list.append(fr_vec[fr_word])

  if not x_list:
    # np.vstack([]) would raise a ValueError anyway; say what actually went wrong.
    raise ValueError("no (English, French) pair has embeddings on both sides")

  return np.vstack(x_list), np.vstack(y_list)


In [21]:
# Reload the cached embedding subsets and build the training matrices.
# NOTE: pickle.load can execute arbitrary code; only load files this notebook wrote itself.
with open("en_embeddings.p", "rb") as en_file:
    en_embeddings_subset = pickle.load(en_file)
with open("fr_embeddings.p", "rb") as fr_file:
    fr_embeddings_subset = pickle.load(fr_file)
x_train,y_train=get_matrix(en_fra_train,en_embeddings_subset,fr_embeddings_subset)

In [22]:
# Re-wrap both matrices as ndarrays (get_matrix already builds them with np.vstack)
# and display their shapes; the two row counts are expected to match.
x_train=np.array(x_train)
y_train=np.array(y_train)
x_train.shape, y_train.shape

((5000, 300), (5000, 300))

In [23]:
# Fit the English->French mapping: orthogonal_procrustes returns the orthogonal
# matrix R for which x_train @ R is closest to y_train. The second return value
# is discarded.
from scipy.linalg import orthogonal_procrustes
R,_=orthogonal_procrustes(x_train,y_train)
R

array([[ 0.80879164, -0.00400219, -0.02734789, ..., -0.02883246,
        -0.01982565,  0.02612439],
       [ 0.03065553,  0.8275611 ,  0.09199339, ...,  0.02687369,
         0.00631984,  0.04344907],
       [ 0.08258763, -0.02589141,  0.8044921 , ...,  0.04021224,
         0.00922842, -0.06730425],
       ...,
       [-0.03478248,  0.03102311, -0.03719375, ...,  0.83638465,
        -0.0234507 ,  0.04584297],
       [ 0.03411265,  0.07941602, -0.05997404, ...,  0.04701069,
         0.84207547, -0.03437557],
       [-0.00637255,  0.00189635,  0.03001869, ..., -0.03473854,
        -0.04046868,  0.8287463 ]], dtype=float32)

In [28]:
def nearest_neighbor(v, candidates, k=1):
    """Return the indices of the ``k`` candidates most similar to ``v``.

    Similarity is measured with ``cos_of_two_vector``. The returned indices
    are the tail of an ascending argsort, so the most similar candidate comes
    last.
    """
    scores = [cos_of_two_vector(v, candidate) for candidate in candidates]
    ranking = np.argsort(scores)
    return ranking[-k:]
def test_vocabulary(X, Y, R):
    """Return the fraction of rows of ``X`` that map onto the matching row of ``Y``.

    Each row of ``X`` is projected with ``R`` (``np.dot(X, R)``); row ``i``
    counts as correct when the nearest neighbour of its projection among the
    rows of ``Y`` is row ``i`` itself.
    """
    projected = np.dot(X, R)
    hits = sum(
        1
        for row_idx, row in enumerate(projected)
        if nearest_neighbor(row, Y) == row_idx
    )
    return hits / len(projected)

In [29]:
# Build the test matrices from the test dictionary and report how often the
# projected English vector's nearest French vector is the expected translation.
x_test,y_test=get_matrix(en_fra_test,en_embeddings_subset,fr_embeddings_subset)
acc = test_vocabulary(x_test, y_test, R)
print(f"accuracy on test set is {acc:.3f}")

accuracy on test set is 0.807


Translate

In [34]:
def translate_word(word, en_embeddings_subset,R):
    """Project an English word's embedding into the French embedding space.

    Args:
        word: English word to translate.
        en_embeddings_subset: Mapping from English word to embedding vector.
        R: Transformation matrix fitted so that ``X @ R`` approximates ``Y``
            for row-vector embeddings (the convention test_vocabulary uses
            with ``np.dot(X, R)``).

    Returns:
        The projected vector, or ``None`` when ``word`` has no embedding.
    """
    if word not in en_embeddings_subset:
        return None
    en_vector = en_embeddings_subset[word]
    # Embeddings are row vectors, so the vector goes on the LEFT of R.
    # np.dot(R, en_vector) would apply R's transpose, i.e. the reverse mapping.
    return np.dot(en_vector, R)

In [35]:
from scipy.spatial.distance import cosine
def find_closest_word(vector, embeddings_subset):
    """Return the key of ``embeddings_subset`` whose vector is closest to ``vector``.

    Closeness is the cosine distance from ``scipy.spatial.distance.cosine``;
    the first entry with the smallest distance wins. Returns ``None`` when no
    entry yields a distance below infinity (for example an empty mapping).
    """
    best_distance, best_word = float('inf'), None
    for candidate, candidate_vector in embeddings_subset.items():
        gap = cosine(vector, candidate_vector)
        if gap < best_distance:
            best_distance, best_word = gap, candidate
    return best_word

In [49]:
def translate_and_find(word, en_embeddings_subset, fr_embeddings_subset, R):
    """Translate ``word`` by projecting it with ``R`` and taking the closest French word.

    Returns ``None`` when ``translate_word`` cannot produce a vector for
    ``word``; otherwise returns whatever ``find_closest_word`` selects from
    ``fr_embeddings_subset``.
    """
    projected = translate_word(word, en_embeddings_subset, R)
    return None if projected is None else find_closest_word(projected, fr_embeddings_subset)

In [61]:
# Translate a handful of sample words and display the results.
# The list is deliberately NOT called `set`: rebinding that name at notebook
# level hides the built-in set() and breaks every cell that calls it on re-run.
words_to_translate=['free','house','music','know','field']
translated=[]
for word in words_to_translate:
    translated_word = translate_and_find(word, en_embeddings_subset, fr_embeddings_subset, R)
    translated.append(translated_word)
translated


['libre', 'maison', 'musique', 'crois', 'champs']