In [8]:
import numpy as np
import fasttext
import fasttext.util
from collections import defaultdict
import pandas as pd


In [9]:
# Download and load pre-trained embeddings
# fasttext.util.download_model('en', if_exists='ignore')  # English
# fasttext.util.download_model('hi', if_exists='ignore')  # Hindi

# Load models
en_model = fasttext.load_model('cc.en.300.bin')
hi_model = fasttext.load_model('cc.hi.300.bin')

In [10]:
def get_top_words(model, num_words=100000):
    return {word: model[word] for word in model.get_words()[:num_words]}

en_vocab = get_top_words(en_model)
hi_vocab = get_top_words(hi_model)

In [19]:
# Load MUSE bilingual dictionary
def load_muse_dict(filepath):
    word_pairs = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            parts = line.strip().split('\t')  # Split on tab since that's the delimiter in the file
            if len(parts) != 2:
                print(f"Skipping line {i+1} - incorrect format")
                continue
            en_word, hi_word = parts
            if en_word in en_vocab and hi_word in hi_vocab:
                word_pairs.append((en_word, hi_word))
    return word_pairs

# Example: Assume we have the MUSE dataset for English-Hindi
muse_dict_path = 'en-hi.txt'
bilingual_dict = load_muse_dict(muse_dict_path)

# Convert to NumPy arrays for alignment
X = np.array([en_vocab[pair[0]] for pair in bilingual_dict])
Y = np.array([hi_vocab[pair[1]] for pair in bilingual_dict])


In [22]:
def procrustes_alignment(X, Y):
    # Compute the cross-covariance matrix
    M = Y.T @ X
    
    # Compute SVD
    U, _, Vt = np.linalg.svd(M)
    
    # Compute optimal transformation
    W = U @ Vt
    return W

W = procrustes_alignment(X, Y)

# Apply transformation to English embeddings
X_aligned = X @ W


In [25]:
from sklearn.metrics.pairwise import cosine_similarity

# Convert Hindi embeddings to a search space
hi_matrix = np.array([hi_vocab[word] for word in hi_vocab.keys()])
hi_words = list(hi_vocab.keys())

# Compute cosine similarity
def get_nearest_neighbors(embedded_word, k=5):
    similarities = cosine_similarity([embedded_word], hi_matrix)[0]
    nearest_indices = np.argsort(similarities)[::-1][:k]
    return [hi_words[idx] for idx in nearest_indices]

# Evaluate Precision@1 and Precision@5
def evaluate_translation(X_aligned, bilingual_dict):
    correct_at_1, correct_at_5 = 0, 0
    total = len(bilingual_dict)
    
    for i, (en_word, hi_word) in enumerate(bilingual_dict):
        nearest_neighbors = get_nearest_neighbors(X_aligned[i], k=5)
        
        if hi_word == nearest_neighbors[0]:  # Precision@1
            correct_at_1 += 1
        
        if hi_word in nearest_neighbors:  # Precision@5
            correct_at_5 += 1
    
    precision_at_1 = correct_at_1 / total
    precision_at_5 = correct_at_5 / total
    return precision_at_1, precision_at_5

precision_1, precision_5 = evaluate_translation(X_aligned, bilingual_dict)

print(f'Precision@1: {precision_1:.4f}')
print(f'Precision@5: {precision_5:.4f}')


Precision@1: 0.0000
Precision@5: 0.0000


In [26]:
def cosine_sim(word1, word2):
    return cosine_similarity([en_vocab[word1]], [hi_vocab[word2]])[0, 0]

# Example word pairs
example_pairs = [('king', 'राजा'), ('queen', 'रानी'), ('apple', 'सेब')]

for en_word, hi_word in example_pairs:
    if en_word in en_vocab and hi_word in hi_vocab:
        sim = cosine_sim(en_word, hi_word)
        print(f"Cosine Similarity({en_word}, {hi_word}) = {sim:.4f}")


Cosine Similarity(king, राजा) = -0.0519
Cosine Similarity(queen, रानी) = -0.0623
Cosine Similarity(apple, सेब) = 0.1291


In [27]:
for size in [5000, 10000, 20000]:
    subset_dict = bilingual_dict[:size]
    X_sub = np.array([en_vocab[pair[0]] for pair in subset_dict])
    Y_sub = np.array([hi_vocab[pair[1]] for pair in subset_dict])
    
    W_sub = procrustes_alignment(X_sub, Y_sub)
    X_aligned_sub = X_sub @ W_sub
    p1, p5 = evaluate_translation(X_aligned_sub, subset_dict)
    
    print(f"Bilingual Lexicon Size: {size}, Precision@1: {p1:.4f}, Precision@5: {p5:.4f}")


Bilingual Lexicon Size: 5000, Precision@1: 0.0000, Precision@5: 0.0000
Bilingual Lexicon Size: 10000, Precision@1: 0.0000, Precision@5: 0.0000
Bilingual Lexicon Size: 20000, Precision@1: 0.0000, Precision@5: 0.0000
