In [1]:
import numpy as np

from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
from gensim.models import fasttext
from get_glove_embeddings import glove

In [2]:
# This cell loads word vectors from a fastText .bin file and writes them to a text file.
# For English, each word is lowercased before being written, and only the first occurrence of a lowercased word is kept.
# Only the subset of words listed in the file named by the words_file variable is exported.
# The resulting .txt files are already included in this repo, so this cell does not need to be run.
# However, if you wish to add new languages from fastText, move the .bin files here and run this cell first.
# hi = True
# fasttext_file = "cc.hi.300.bin" if hi else "cc.en.300.bin"
# words_file = "hi_words_reduced" if hi else "eng_words_reduced"
# op_wv_file = "ft.hi.300.txt" if hi else "ft.en.300.txt"
# wv = fasttext.load_facebook_vectors(fasttext_file)

# file_length = sum(1 for i in open(words_file,'rb'))
# done_words = set()
# with open(words_file) as file:
#     with open(op_wv_file,"w") as wv_file:
#         for line in tqdm(file,total=file_length):
#             word = line.strip()
#             if not hi:
#                 if word.lower() in done_words:
#                     continue
#                 done_words.add(word.lower())
#             text = np.array2string(wv[word],max_line_width=9999999,formatter={'float':lambda x: "%.5f" % x})[1:-1]
#             #text = re.sub("\s\s+"," ",text)
#             text = word.lower()+" "+text
#             wv_file.write(text+"\n")

In [3]:
# from https://github.com/babylonhealth/fastText_multilingual/blob/master/align_your_own.ipynb
# from https://stackoverflow.com/questions/21030391/how-to-normalize-array-numpy
def normalized(a, axis=-1, order=2):
    """
    Scale a numpy array so that its vectors along `axis` have unit norm.

    a     -- array to normalize.
    axis  -- axis along which each norm is taken (default: last axis, i.e. rows).
    order -- norm order handed to np.linalg.norm (default: 2, Euclidean).

    Vectors whose norm is exactly zero are returned unchanged instead of
    being divided by zero. A 1-D input comes back with shape (1, n), because
    the norm is promoted to at least one dimension before broadcasting.
    """
    norms = np.linalg.norm(a, ord=order, axis=axis)
    norms = np.atleast_1d(norms)

    # Dividing by 1 leaves all-zero vectors untouched.
    zero_mask = norms == 0
    norms[zero_mask] = 1

    # Re-insert the reduced axis so the division broadcasts over `a`.
    divisor = np.expand_dims(norms, axis)
    return a / divisor

def make_training_matrices(source_dictionary, target_dictionary, bilingual_dictionary):
    """
    Build paired source/target training matrices from a bilingual dictionary.

    Source and target dictionaries are the embedding objects of the
    source/target languages; they must support `word in dictionary` and
    `dictionary[word]`. bilingual_dictionary is a list of
    translation pair tuples [(source_word, target_word), ...].

    A pair is used only when both of its words are present. Pairs with a
    missing word are reported with a warning and skipped, so row i of the
    source matrix and row i of the target matrix always come from the same
    translation pair.

    Returns (source_matrix, target_matrix) as numpy arrays with one row per
    retained pair.
    """
    source_matrix = []
    target_matrix = []

    for (source, target) in bilingual_dictionary:
        # Explicit membership tests (not `assert`, which is removed under -O).
        source_found = source in source_dictionary
        target_found = target in target_dictionary

        if not (source_found and target_found):
            if (not source_found) and (not target_found):
                print ("Warning : Couplet not found - ",source,target)
            elif not target_found:
                print ("Warning : Target not found - ",target)
            else:
                print ("Warning : Source not found - ",source)
            # Skip the pair: looking up a missing word would either raise or
            # add a row that does not correspond to a real translation.
            continue

        source_matrix.append(source_dictionary[source])
        target_matrix.append(target_dictionary[target])

    # return training matrices
    return np.array(source_matrix), np.array(target_matrix)

def learn_transformation(source_matrix, target_matrix, normalize_vectors=True):
    """
    Learn the orthogonal matrix that aligns source vectors to target vectors.

    source_matrix and target_matrix are numpy arrays of shape
    (dictionary_length, embedding_dimension); row i of each holds the word
    vectors of the i-th bilingual dictionary pair.

    When normalize_vectors is true, both matrices are passed through
    normalized() before the transformation is computed.

    Returns the product of the two orthogonal factors from the SVD of
    source_matrix^T . target_matrix.
    """
    if normalize_vectors:
        source_matrix, target_matrix = normalized(source_matrix), normalized(target_matrix)

    # SVD of the (embedding_dimension x embedding_dimension) product matrix.
    cross_product = source_matrix.T @ target_matrix
    left_vectors, _singular_values, right_vectors_t = np.linalg.svd(cross_product)

    # Dropping the singular values leaves an orthogonal transformation.
    return left_vectors @ right_vectors_t

In [4]:
# Load the Hindi and English embedding text files into two separate embedding objects.
# The progress bars in this cell's output report 1,164,060 and 233,556 lines respectively.
# NOTE(review): `glove` comes from the local get_glove_embeddings module; what
# validate=False skips is not visible from this notebook — confirm against that module.
hin_glove = glove("ft.hi.300.txt",validate=False)
eng_glove = glove("ft.en.300.txt",validate=False)

100%|██████████████████████████████████████████████████████████████████████| 1164060/1164060 [03:01<00:00, 6425.06it/s]


Successfully Loaded embedding file.


100%|████████████████████████████████████████████████████████████████████████| 233556/233556 [00:33<00:00, 6903.07it/s]

Successfully Loaded embedding file.





In [5]:
# Baseline: cosine similarity of English/Hindi word pairs before any transformation
# has been applied to eng_glove.
# The first group is printed without a label (presumably pairs present in the bilingual
# dictionary); the second group is labelled as not being in it.
unlabelled_pairs = [("hyderabad","हैदराबाद"), ("pink","गुलाबी"), ("thief","चोर")]
not_in_dict_pairs = [("clothes","वस्त्र"), ("wolf","भेड़िया"), ("cricket","क्रिकेट"), ("israel","इज़राइल")]

for eng_word, hin_word in unlabelled_pairs:
    print(cosine_similarity(eng_glove[eng_word].reshape(1,-1),hin_glove[hin_word].reshape(1,-1)))

for eng_word, hin_word in not_in_dict_pairs:
    print("Not in bilingual dict - ",cosine_similarity(eng_glove[eng_word].reshape(1,-1),hin_glove[hin_word].reshape(1,-1)))

[[0.0841448]]
[[0.00244874]]
[[-0.06798166]]
Not in bilingual dict -  [[-0.11828031]]
Not in bilingual dict -  [[-0.03326376]]
Not in bilingual dict -  [[0.09681229]]
Not in bilingual dict -  [[-0.02088189]]


In [6]:
# Read the bilingual dictionary from a comma-separated file into a list of
# (column 1, column 0) tuples. The swap puts each pair into the
# (source_word, target_word) order that make_training_matrices expects.
bilingual_dictionary = []

with open("hin_eng_map.csv",encoding="utf-8") as file:
    for line in file:
        line = line.strip()
        if not line:
            # A blank line has no columns; indexing it would raise IndexError.
            continue
        fields = line.split(",")
        # A non-blank row without a comma still fails loudly here, as before.
        bilingual_dictionary.append((fields[1],fields[0]))

In [7]:
# Build the paired training matrices: eng_glove is passed as the source dictionary and
# hin_glove as the target dictionary, with the word pairs taken from bilingual_dictionary.
source_matrix, target_matrix = make_training_matrices(eng_glove, hin_glove, bilingual_dictionary)

In [8]:
# Learn the transformation that aligns the source (English) vectors to the target (Hindi)
# vectors. normalize_vectors is left at its default.
transform = learn_transformation(source_matrix, target_matrix)

In [9]:
# Apply the learned transformation to the English embeddings.
# NOTE: this appears to modify eng_glove in place — the same object gives different
# similarities in the cells before and after this one — so re-running this cell would
# apply the transformation a second time. Reload eng_glove before re-running.
eng_glove.apply_transform(transform)

In [10]:
# Same English/Hindi word pairs as the baseline check, evaluated again now that the
# transformation has been applied to eng_glove.
# The first group is printed without a label (presumably pairs present in the bilingual
# dictionary); the second group is labelled as not being in it.
unlabelled_pairs = [("hyderabad","हैदराबाद"), ("pink","गुलाबी"), ("thief","चोर")]
not_in_dict_pairs = [("clothes","वस्त्र"), ("wolf","भेड़िया"), ("cricket","क्रिकेट"), ("israel","इज़राइल")]

for eng_word, hin_word in unlabelled_pairs:
    print(cosine_similarity(eng_glove[eng_word].reshape(1,-1),hin_glove[hin_word].reshape(1,-1)))

for eng_word, hin_word in not_in_dict_pairs:
    print("Not in bilingual dict - ",cosine_similarity(eng_glove[eng_word].reshape(1,-1),hin_glove[hin_word].reshape(1,-1)))

[[0.5746449]]
[[0.59595558]]
[[0.66035739]]
Not in bilingual dict -  [[0.49474255]]
Not in bilingual dict -  [[0.35297204]]
Not in bilingual dict -  [[0.54519973]]
Not in bilingual dict -  [[0.44262905]]


In [13]:
# Write the English embeddings, as they currently stand in eng_glove, to the combined file,
# then append every line of the Hindi embedding file after them.
# NOTE(review): assumes save_to_file ends its output with a newline; otherwise the first
# appended line would be joined to the last English entry — confirm.
eng_glove.save_to_file("ft.hi_aligned_en.300.txt")
with open("ft.hi_aligned_en.300.txt","a",encoding="utf-8") as combined_file, \
        open(hin_glove.glove_path,"r",encoding='utf-8') as hindi_file:
    # Iterating the input file yields one line at a time; each is written unchanged.
    combined_file.writelines(hindi_file)

100%|████████████████████████████████████████████████████████████████████████| 233556/233556 [03:01<00:00, 1287.32it/s]


In [14]:
# Load the combined file (English vectors followed by the appended Hindi vectors) as a
# single embedding object, so words of both languages can be queried together.
eng_hin_glove = glove("ft.hi_aligned_en.300.txt",validate=False)

100%|██████████████████████████████████████████████████████████████████████| 1397616/1397616 [03:44<00:00, 6238.34it/s]


Successfully Loaded embedding file.


In [19]:
# Query the combined English+Hindi embeddings for the words most similar to "simple"
# (the output lists 10 results, mixing both languages).
eng_hin_glove.most_similar("simple",10) # The results include आसान, which was not in the bilingual dictionary!

[('straightforward', 0.7797719371890728),
 ('सरल', 0.6859045112119004),
 ('simplistic', 0.6471223587865472),
 ('uncomplicated', 0.6373788345529645),
 ('elegant', 0.5848393424989479),
 ('plain', 0.5418345928929682),
 ('आसान', 0.5410338990547959),
 ('simplicity', 0.5400507554476914),
 ('clever', 0.5178662900313972),
 ('ingenious', 0.5139553814262425)]