# Assignment Submission - Sarvam Research Fellowship 

I plan to obtain results for these two combinations

1. Pre-trained Fast-text vectors - Supervised Procrustes method
2. Pre-trained Fast-text vectors - Unsupervised method

In [1]:
import fasttext
import numpy as np
from tqdm import tqdm
import faiss
import torch
import torch.nn as nn
import torch.optim as optim
import random

In [2]:
np.random.seed(42)
random.seed(42)

In [3]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

## 1. Data Preparation



### 1.1 Pre-trained Fast-text vectors and limit vocab to top 100K most frequent words


In [4]:
def load_embeddings(file_path, max_words=100000):
    model = fasttext.load_model(file_path)
    words = model.get_words()[:max_words] 
    embeddings = {}
    
    for word in words:
        vector = model.get_word_vector(word)
        vector = vector / np.linalg.norm(vector) 
        embeddings[word] = vector
    
    return embeddings

In [5]:
en_embeddings = load_embeddings('./Fasttext_Embeddings/cc.en.300.bin')
hi_embeddings = load_embeddings('./Fasttext_Embeddings/cc.hi.300.bin')
print(f"Loaded {len(en_embeddings)} English and {len(hi_embeddings)} Hindi embeddings.")

Loaded 100000 English and 100000 Hindi embeddings.


###  1.3 Extract a list of word translation pairs from the MUSE dataset 

In [6]:
def load_MUSE_dictionary(file_path, src_emb, tgt_emb):
    pairs = []
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            src_word, tgt_word = line.strip().split()
            if src_word in src_emb and tgt_word in tgt_emb:
                pairs.append((src_word, tgt_word))
    return pairs

In [7]:
train_pairs = load_MUSE_dictionary('./MUSE/en-hi.train.txt', en_embeddings, hi_embeddings)
test_pairs = load_MUSE_dictionary('./MUSE/en-hi.test.txt', en_embeddings, hi_embeddings)
print(f"Training pairs: {len(train_pairs)}, Test pairs: {len(test_pairs)}")

Training pairs: 8130, Test pairs: 1600



## 2. Embedding Alignment


### 2.1 Implementing the Procrustes Alignment Method and ensure that the mapping is orthogonal

In [8]:
def procrustes_alignment(train_pairs, src_emb, tgt_emb):
    # Build matrices X (source) and Y (target)
    X = np.array([src_emb[src_word] for src_word, _ in train_pairs]).T  # Shape: (300, n_pairs)
    Y = np.array([tgt_emb[tgt_word] for _, tgt_word in train_pairs]).T  # Shape: (300, n_pairs)
    
    # Compute the mapping
    M = Y @ X.T  # Shape: (300, 300)
    U, _, Vh = np.linalg.svd(M)
    W = U @ Vh  # Orthogonal matrix: (300, 300)
    
    # Verify orthogonality (W @ W.T should be close to identity)
    assert np.allclose(W @ W.T, np.eye(300), atol=1e-6), "W is not orthogonal"
    return W

In [9]:
W = procrustes_alignment(train_pairs, en_embeddings, hi_embeddings)


## 3. Evaluation


### 3.1 Perform word translation from English to Hindi using the aligned embeddings

In [None]:
# This is slow - so I used the faiss library to improve the speed
# def translate_word(src_word, W, src_emb, tgt_emb):
#     if src_word not in src_emb:
#         return []
#     src_vec = W @ src_emb[src_word]  # Map to target space
#     similarities = [(tgt_word, np.dot(src_vec, tgt_vec)) 
#                     for tgt_word, tgt_vec in tgt_emb.items()]
#     similarities.sort(key=lambda x: x[1], reverse=True)
#     return [tgt_word for tgt_word, _ in similarities[:5]]
# translations = {src: translate_word(src, W, en_embeddings, hi_embeddings) 
#                 for src, _ in test_pairs}
# print("Sample translations:", translations[:5])

In [None]:
def build_index(embeddings):
    dim = 300
    index = faiss.IndexFlatIP(dim) 
    vectors = np.array(list(embeddings.values())).astype('float32')
    index.add(vectors)
    return index, list(embeddings.keys())

In [None]:
# Build index for Hindi embeddings
hi_index, hi_words = build_index(hi_embeddings)

def translate_word_faiss(src_word, W, src_emb, index, tgt_words):
    if src_word not in src_emb:
        return []
    src_vec = (W @ src_emb[src_word]).reshape(1, -1).astype('float32')
    distances, indices = index.search(src_vec, 5)
    return [tgt_words[i] for i in indices[0]]

In [None]:
translations = {src: translate_word_faiss(src, W, en_embeddings, hi_index, hi_words) for src, _ in test_pairs}

### 3.2 Evaluate translation accuracy using the MUSE test dictionary  and report Precision@1 and Precision5 metrics for the word transaltion task

In [12]:
def evaluate_translation(test_pairs, translations):
    p1, p5 = 0, 0
    for src, correct_tgts in test_pairs:
        if src not in translations:
            continue
        predicted = translations[src]
        correct_tgts = set(correct_tgts.split()) if isinstance(correct_tgts, str) else {correct_tgts}
        p1 += 1 if predicted[0] in correct_tgts else 0
        p5 += 1 if any(tgt in correct_tgts for tgt in predicted) else 0
    total = len([src for src, _ in test_pairs if src in translations])
    return p1 / total * 100, p5 / total * 100

In [None]:
# Compute metrics
p1, p5 = evaluate_translation(test_pairs, translations)
print(f"Precision@1: {p1:.2f}%, Precision@5: {p5:.2f}%")

Precision@1: 33.12%, Precision@5: 53.12%


### 3.3 Compare and Analyse cosine similarites between word pairs to assess cross-lingual semantic similarity 

In [15]:
def compute_similarity(src_word, tgt_word, W, src_emb, tgt_emb):
    if src_word not in src_emb or tgt_word not in tgt_emb:
        return None
    mapped_vec = W @ src_emb[src_word]
    return np.dot(mapped_vec, tgt_emb[tgt_word])

In [16]:
# True pairs
true_sims = [compute_similarity(src, tgt, W, en_embeddings, hi_embeddings) for src, tgt in test_pairs if compute_similarity(src, tgt, W, en_embeddings, hi_embeddings) is not None]
true_avg = np.mean(true_sims)

# Random pairs
np.random.shuffle(test_pairs)
random_pairs = [(test_pairs[i][0], test_pairs[i+1][1]) for i in range(len(test_pairs)-1)]
random_sims = [compute_similarity(src, tgt, W, en_embeddings, hi_embeddings) for src, tgt in random_pairs if compute_similarity(src, tgt, W, en_embeddings, hi_embeddings) is not None]
random_avg = np.mean(random_sims)

print(f"Avg similarity (true pairs): {true_avg:.4f}, Avg similarity (random pairs): {random_avg:.4f}")

Avg similarity (true pairs): 0.4135, Avg similarity (random pairs): 0.0883


### 3.4 Conduct an ablation study to assess the impact of bilingual lexicon size on alignment quality. Experiment with different training dictionary sizes (e.g., 5k, 10k, 20k word pairs).

In [None]:
sizes = [1000, 2000, 5000]
results = {}

for size in tqdm(sizes):
    sampled_pairs = np.random.choice(len(train_pairs), size, replace=False)
    sampled_train = [train_pairs[i] for i in sampled_pairs]
    W_sampled = procrustes_alignment(sampled_train, en_embeddings, hi_embeddings)
    sampled_translations = {src: translate_word_faiss(src, W_sampled, en_embeddings, hi_index, hi_words) for src, _ in test_pairs}
    p1, p5 = evaluate_translation(test_pairs, sampled_translations)
    results[size] = (p1, p5)

In [38]:
# Report results
for size, (p1, p5) in results.items():
    print(f"Training size {size}: P@1 = {p1:.2f}%, P@5 = {p5:.2f}%")

Training size 1000: P@1 = 18.06%, P@5 = 37.44%
Training size 2000: P@1 = 22.12%, P@5 = 47.44%
Training size 5000: P@1 = 26.88%, P@5 = 52.25%


## 4. Unsupervised alignment method: Cross-Domain Similarity Local Scaling (CSLS)

In [36]:
def compute_csls_scores(src_vec, tgt_vecs, k=10):
    # Convert to numpy for FAISS
    src_vec = np.array(src_vec, dtype=np.float32).reshape(1, -1)
    tgt_vecs = np.array(tgt_vecs, dtype=np.float32)
    
    # Build FAISS index for target vectors
    index = faiss.IndexFlatL2(tgt_vecs.shape[1])
    index.add(tgt_vecs)
    
    # Compute mean similarity to k nearest neighbors in target space
    distances, _ = index.search(tgt_vecs, k)
    r_T = np.mean(distances, axis=1)
    
    # Compute distances from source vector to all target vectors
    distances, indices = index.search(src_vec, len(tgt_vecs))
    distances = distances.flatten()
    indices = indices.flatten()
    
    # CSLS score: 2 * cosine_similarity - r_S - r_T
    src_norm = src_vec / np.linalg.norm(src_vec)
    tgt_norm = tgt_vecs / np.linalg.norm(tgt_vecs, axis=1)[:, None]
    cos_sim = np.dot(src_norm, tgt_norm.T).flatten()
    r_S = np.mean(distances[:k])  # Mean distance to k nearest neighbors of src_vec
    csls_scores = 2 * cos_sim - r_S - r_T
    
    return csls_scores, indices

def translate_word_csls(src_word, W, src_emb, tgt_emb, tgt_words, k=5):
    mapped_vec = W @ src_emb[src_word]
    tgt_vecs = np.array([tgt_emb[word] for word in tgt_words])
    csls_scores, indices = compute_csls_scores(mapped_vec, tgt_vecs)
    top_k_indices = np.argsort(-csls_scores)[:k]
    return [tgt_words[idx] for idx in top_k_indices]

In [37]:
class Discriminator(nn.Module):
    def __init__(self, input_dim=300, hidden_dim=2048):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LeakyReLU(0.2),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        return self.model(x)

def adversarial_training(src_emb, tgt_emb, epochs=50, batch_size=32, lr=0.001):
    # Prepare data
    src_words = list(src_emb.keys())[:50000]  # Use top 50,000 frequent words
    tgt_words = list(tgt_emb.keys())[:50000]
    src_vecs = torch.tensor([src_emb[w] for w in src_words], dtype=torch.float32).to(device)
    tgt_vecs = torch.tensor([tgt_emb[w] for w in tgt_words], dtype=torch.float32).to(device)
    
    # Initialize W (orthogonal matrix)
    W = torch.eye(300, dtype=torch.float32).to(device)
    W.requires_grad = True
    
    # Models and optimizers
    discriminator = Discriminator().to(device)
    optimizer_W = optim.Adam([W], lr=lr)
    optimizer_D = optim.Adam(discriminator.parameters(), lr=lr)
    
    for epoch in tqdm(range(epochs)):
        # Train discriminator (5 steps)
        for _ in range(5):
            idx_src = np.random.choice(len(src_words), batch_size)
            idx_tgt = np.random.choice(len(tgt_words), batch_size)
            src_batch = src_vecs[idx_src]
            tgt_batch = tgt_vecs[idx_tgt]
            mapped_src = src_batch @ W.T
            
            optimizer_D.zero_grad()
            loss_D = nn.BCELoss()(discriminator(tgt_batch), torch.ones(batch_size, 1).to(device)) + \
                     nn.BCELoss()(discriminator(mapped_src.detach()), torch.zeros(batch_size, 1).to(device))
            loss_D.backward()
            optimizer_D.step()
        
        # Train generator (W)
        idx_src = np.random.choice(len(src_words), batch_size)
        src_batch = src_vecs[idx_src]
        mapped_src = src_batch @ W.T
        optimizer_W.zero_grad()
        loss_W = nn.BCELoss()(discriminator(mapped_src), torch.ones(batch_size, 1).to(device))
        loss_W.backward()
        optimizer_W.step()
        
        # Orthogonalization
        with torch.no_grad():
            U, _, V = torch.svd(W)
            W.copy_(U @ V.T)
    
    return W.detach().cpu().numpy()

In [38]:
def extract_synthetic_dictionary(src_emb, tgt_emb, W, top_k=10000):
    src_words = list(src_emb.keys())[:top_k]
    tgt_words = list(tgt_emb.keys())
    translations = {}
    for src_word in src_words:
        translations[src_word] = translate_word_csls(src_word, W, src_emb, tgt_emb, tgt_words, k=1)[0]
    
    # Filter mutual nearest neighbors
    mutual_pairs = []
    W_inv = np.linalg.inv(W)
    for src, tgt in translations.items():
        back_trans = translate_word_csls(tgt, W_inv, tgt_emb, src_emb, src_words, k=1)[0]
        if back_trans == src:
            mutual_pairs.append((src, tgt))
    return mutual_pairs

def refine_with_procrustes(pairs, src_emb, tgt_emb):
    X = np.array([src_emb[src] for src, _ in pairs]).T
    Y = np.array([tgt_emb[tgt] for _, tgt in pairs]).T
    M = Y @ X.T
    U, _, Vh = np.linalg.svd(M)
    return U @ Vh

def refine_mapping(src_emb, tgt_emb, W_init, iterations=5):
    W = W_init
    for _ in range(iterations):
        pairs = extract_synthetic_dictionary(src_emb, tgt_emb, W)
        if not pairs:
            break
        W = refine_with_procrustes(pairs, src_emb, tgt_emb)
    return W

In [None]:
def evaluate_mapping(W, src_emb, tgt_emb, test_pairs, tgt_words, k_values=[1, 5]):
    correct = {k: 0 for k in k_values}
    for src, true_tgt in test_pairs:
        pred_tgts = translate_word_csls(src, W, src_emb, tgt_emb, tgt_words, k=max(k_values))
        for k in k_values:
            if true_tgt in pred_tgts[:k]:
                correct[k] += 1
    precision = {k: correct[k] / len(test_pairs) for k in k_values}
    return precision


tgt_words = list(hi_embeddings.keys())

# Unsupervised method
W_adv = adversarial_training(en_embeddings, hi_embeddings)
W_unsupervised = refine_mapping(en_embeddings, hi_embeddings, W_adv)
precision_unsupervised = evaluate_mapping(W_unsupervised, en_embeddings, hi_embeddings, test_pairs, tgt_words)



print("Unsupervised Precision:", precision_unsupervised)


  src_vecs = torch.tensor([src_emb[w] for w in src_words], dtype=torch.float32).to(device)
100%|██████████| 50/50 [00:09<00:00,  5.21it/s]



## Remarks