In [1]:
import numpy as np
import io

In [2]:
def load_embeddings(file_path, max_vocab=100000):
    """
    Loads FastText word embeddings from a file.
    
    Args:
        file_path (str): Path to the FastText embeddings file
        max_vocab (int): Maximum number of word embeddings to load
        
    Returns:
        dict: Dictionary mapping words to their vector embeddings
    """
    embeddings = {}
    with io.open(file_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)  # Skip header
        for idx, line in enumerate(f):
            if idx >= max_vocab:
                break
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            vector = np.array(tokens[1:], dtype=np.float32)
            embeddings[word] = vector
    return embeddings

In [3]:
def load_bilingual_lexicon(file_path, en_embeddings, hi_embeddings):
    """
    Loads a bilingual lexicon and filters word pairs based on available embeddings.
    
    Args:
        file_path (str): Path to the bilingual lexicon file
        en_embeddings (dict): English word embeddings dictionary
        hi_embeddings (dict): Hindi word embeddings dictionary
        
    Returns:
        tuple: Lists of aligned English and Hindi words that exist in both embedding spaces
    """
    en_words = []
    hi_words = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split()
            if len(parts) > 2:
                continue
            en_word, hi_word = line.strip().split()
            if en_word in en_embeddings and hi_word in hi_embeddings:
                en_words.append(en_word)
                hi_words.append(hi_word)
    return en_words, hi_words

In [4]:
def procrustes_alignment(en_embeddings, hi_embeddings, en_lexicon, hi_lexicon):
    """
    Performs Procrustes alignment to find the optimal linear transformation between two embedding spaces.
    
    Args:
        en_embeddings (dict): English word embeddings
        hi_embeddings (dict): Hindi word embeddings  
        en_lexicon (list): English words from bilingual lexicon
        hi_lexicon (list): Hindi words from bilingual lexicon
        
    Returns:
        ndarray: The learned transformation matrix W
    """
    en_matrix = np.array([en_embeddings[word] for word in en_lexicon])
    hi_matrix = np.array([hi_embeddings[word] for word in hi_lexicon])

    # Normalize the embeddings
    en_matrix = en_matrix / np.linalg.norm(en_matrix, axis=1)[:, np.newaxis]
    hi_matrix = hi_matrix / np.linalg.norm(hi_matrix, axis=1)[:, np.newaxis]

    # Calculate transformation matrix using SVD
    U, _, Vt = np.linalg.svd(np.dot(en_matrix.T, hi_matrix))
    W = np.dot(U, Vt)
    return W

In [5]:
def apply_mapping(en_embeddings, W):
    """
    Applies the learned transformation matrix to English embeddings.
    
    Args:
        en_embeddings (dict): English word embeddings
        W (ndarray): Transformation matrix from Procrustes alignment
        
    Returns:
        dict: Transformed English embeddings aligned to Hindi embedding space
    """
    aligned_en_embeddings = {}
    for word, vector in en_embeddings.items():
        aligned_en_embeddings[word] = np.dot(vector, W)
    return aligned_en_embeddings

In [6]:
def translate_word(en_word, aligned_en_embeddings, hi_embeddings, k=5):
    """
    Translates an English word to Hindi by finding nearest neighbors in the aligned embedding space.
    
    Args:
        en_word (str): English word to translate
        aligned_en_embeddings (dict): Aligned English embeddings
        hi_embeddings (dict): Hindi embeddings
        k (int): Number of translation candidates to return
        
    Returns:
        list: Top k Hindi translation candidates
    """
    if en_word not in aligned_en_embeddings:
        return []
    en_vector = aligned_en_embeddings[en_word]
    similarities = {}
    for hi_word, hi_vector in hi_embeddings.items():
        similarities[hi_word] = np.dot(en_vector, hi_vector) / (np.linalg.norm(en_vector) * np.linalg.norm(hi_vector))
    sorted_translations = sorted(similarities.items(), key=lambda x: x[1], reverse=True)
    return [word for word, _ in sorted_translations[:k]]

In [7]:
def evaluate_translation(aligned_en_embeddings, hi_embeddings, test_lexicon_path, k=5):
    """
    Evaluates translation accuracy using Precision@k metrics.
    
    Args:
        aligned_en_embeddings (dict): Aligned English embeddings
        hi_embeddings (dict): Hindi embeddings
        test_lexicon_path (str): Path to test bilingual lexicon
        k (int): Maximum number of translation candidates to consider
        
    Returns:
        tuple: Precision@1 and Precision@5 scores
    """
    test_en_words, test_hi_words = load_bilingual_lexicon(test_lexicon_path, en_embeddings, hi_embeddings)
    precision_at_1 = 0
    precision_at_5 = 0
    for en_word, hi_word in zip(test_en_words, test_hi_words):
        translations = translate_word(en_word, aligned_en_embeddings, hi_embeddings, k)
        if hi_word in translations[:1]:
            precision_at_1 += 1
        if hi_word in translations:
            precision_at_5 += 1
    precision_at_1 /= len(test_en_words)
    precision_at_5 /= len(test_en_words)
    return precision_at_1, precision_at_5

In [8]:
def compute_cosine_similarity(word1, word2, embeddings1, embeddings2, W=None):
    """
    Computes cosine similarity between two words in their respective embedding spaces.
    
    Args:
        word1 (str): First word
        word2 (str): Second word
        embeddings1 (dict): First embedding space
        embeddings2 (dict): Second embedding space
        W (ndarray, optional): Transformation matrix to align embeddings1 to embeddings2
        
    Returns:
        float: Cosine similarity between the word vectors
    """
    if W is not None:
      embeddings1[word1] = np.dot(embeddings1[word1], W)
    vec1 = embeddings1[word1]
    vec2 = embeddings2[word2]
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

In [9]:
def ablation_study(en_embeddings, hi_embeddings, lexicon_path, test_lexicon_path, sizes=[5000, 10000, 20000]):
    """
    Conducts an ablation study to analyze performance with different lexicon sizes.
    
    Args:
        en_embeddings (dict): English embeddings
        hi_embeddings (dict): Hindi embeddings
        lexicon_path (str): Path to training bilingual lexicon
        test_lexicon_path (str): Path to test bilingual lexicon
        sizes (list): Different lexicon sizes to test
        
    Returns:
        dict: Dictionary mapping lexicon sizes to (Precision@1, Precision@5) scores
    """
    results = {}
    for size in sizes:
        en_lexicon, hi_lexicon = load_bilingual_lexicon(lexicon_path, en_embeddings, hi_embeddings)
        en_lexicon = en_lexicon[:size]
        hi_lexicon = hi_lexicon[:size]
        W = procrustes_alignment(en_embeddings, hi_embeddings, en_lexicon, hi_lexicon)
        aligned_en_embeddings = apply_mapping(en_embeddings, W)
        precision_1, precision_5 = evaluate_translation(aligned_en_embeddings, hi_embeddings, test_lexicon_path, 5)
        results[size] = (precision_1, precision_5)
    return results

Load word embeddings for English and Hindi

In [10]:
en_embeddings = load_embeddings("wiki.en.vec", max_vocab=100000) #Replace with your path
hi_embeddings = load_embeddings("wiki.hi.vec", max_vocab=100000) #Replace with your path

In [11]:
print(len(en_embeddings))
print(len(hi_embeddings))

100000
100000


Load bilingual lexicon and perform alignment

In [12]:
en_lexicon, hi_lexicon = load_bilingual_lexicon("en-hi.txt", en_embeddings, hi_embeddings)

In [13]:
W = procrustes_alignment(en_embeddings, hi_embeddings, en_lexicon, hi_lexicon)
aligned_en_embeddings = apply_mapping(en_embeddings, W)

Evaluate translation performance

In [14]:
precision_1, precision_5 = evaluate_translation(aligned_en_embeddings, hi_embeddings, "en-hi.txt", 5)
print(f"Precision@1: {precision_1}")
print(f"Precision@5: {precision_5}")

Precision@1: 0.5934244791666666
Precision@5: 0.7805989583333334


Test with example word pair

In [15]:
sim = compute_cosine_similarity("hello","नमस्ते", en_embeddings, hi_embeddings, W)
print(f"Cosine similarity: {sim}")

Cosine similarity: 0.3127928376197815


Perform ablation study

In [16]:
ablation_results = ablation_study(en_embeddings, hi_embeddings, "en-hi.txt", "en-hi.txt")
print("Ablation Study Results:")
for size, (p1, p5) in ablation_results.items():
    print(f"Size: {size}, Precision@1: {p1}, Precision@5: {p5}")

Ablation Study Results:
Size: 5000, Precision@1: 0.5940755208333334, Precision@5: 0.7835286458333334
Size: 10000, Precision@1: 0.5940755208333334, Precision@5: 0.7835286458333334
Size: 20000, Precision@1: 0.5940755208333334, Precision@5: 0.7835286458333334
