# Generate the pairs for Undercover

In [None]:
import fasttext.util
import pandas as pd
import numpy as np
import fasttext
import random
import faiss
import spacy
import json
import re

# Load the spaCy pipeline named "fr_core_news_lg".
# NOTE(review): `nlp` is not referenced again in the visible cells — confirm it is still needed.
nlp = spacy.load("fr_core_news_lg")
# Fetch the fastText model for 'fr'; if_exists='ignore' presumably skips the
# download when a local copy is already present — TODO confirm.
fasttext.util.download_model('fr', if_exists='ignore')
# Load the fastText binary model; `ft` is used below to embed every word.
ft = fasttext.load_model('cc.fr.300.bin')

In [None]:
def family_root(lemme, cgram):
    """Return a crude "word family" key for a lemma.

    The key is the lower-cased lemma with a grammatical-category-specific
    suffix removed, so that closely related forms collapse to the same key.

    Args:
        lemme: The lemma (base form) of the word.
        cgram: The grammatical category code of the lemma.
            'VER' strips a trailing ``er``/``ir``/``re``;
            'ADJ' strips a trailing ``é``/``ée``/``i``/``ant``;
            any other value (including 'NOM') leaves the lemma untouched.

    Returns:
        The lower-cased lemma, minus the suffix when one applies.
    """
    base = lemme.lower()

    if cgram == 'VER':
        return re.sub(r'(er|ir|re)$', '', base)

    if cgram == 'ADJ':
        # The accented endings must be real "é" characters: the previous
        # pattern held their mis-decoded (mojibake) form and never matched
        # properly decoded lemmas.
        return re.sub(r'(ée|é|i|ant)$', '', base)

    # Nouns and every other category are only lower-cased.
    return base

def are_similar(word1, word2, threshold=0.8):
    """Tell whether two words are more similar than ``threshold``.

    Both arguments must expose a ``has_vector`` attribute and a
    ``similarity(other)`` method. When either word has no vector the
    comparison is skipped and the words are reported as not similar.

    Args:
        word1: First word object.
        word2: Second word object.
        threshold: Similarity must be strictly greater than this value.

    Returns:
        False when a vector is missing, otherwise the result of
        ``word1.similarity(word2) > threshold``.
    """
    if word1.has_vector and word2.has_vector:
        score = word1.similarity(word2)
        return score > threshold
    return False

## Word dataset

In [None]:
# Load the lexicon (tab-separated) and restrict it to the columns used below.
columns_to_keep = ['lemme', 'cgram', 'freqlemfilms2', 'freqlemlivres']
df = pd.read_csv("data/lexique.tsv", sep='\t')[columns_to_keep]

# Collapse the two frequency columns into a single 'freq' column.
df = df.assign(freq=df['freqlemfilms2'] + df['freqlemlivres'])
df = df.drop(columns=['freqlemfilms2', 'freqlemlivres'])

# For each lemma, keep only the row with the highest combined frequency.
most_frequent_rows = df.groupby('lemme')['freq'].idxmax()
df = df.loc[most_frequent_rows]

# Keep frequent-enough nouns and adjectives of at least three characters.
is_frequent = df['freq'] >= 10
is_long_enough = df['lemme'].str.len() >= 3
is_noun_or_adjective = df['cgram'].isin(['NOM', 'ADJ'])
df = df[is_frequent & is_long_enough & is_noun_or_adjective]

# Drop words of the same family: one survivor per root, preferring nouns
# over adjectives and, within a category, the most frequent lemma.
priority = {'NOM': 0, 'ADJ': 1}
df['root'] = df.apply(
    lambda entry: family_root(entry['lemme'], entry['cgram']),
    axis=1,
)
df['priority'] = df['cgram'].map(priority)
df = (
    df.sort_values(by=['root', 'priority', 'freq'], ascending=[True, True, False])
    .drop_duplicates(subset=['root'], keep='first')
    .reset_index(drop=True)
)

In [None]:
words = df['lemme'].tolist()
pairs_file = 'app/src/main/assets/pairs.txt'

# Get one fastText embedding per word, stacked as a float32 matrix for FAISS.
embeddings = np.array([ft.get_word_vector(w) for w in words], dtype='float32')

# Normalize embeddings (important for cosine similarity with FAISS).
# An all-zero vector has norm 0; dividing by it would fill the row with NaN
# and poison the index, so such rows are divided by 1 and stay zero.
norms = np.linalg.norm(embeddings, axis=1, keepdims=True)
norms[norms == 0] = 1.0
embeddings /= norms

# Build FAISS index
dim = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)  # inner product ~ cosine similarity after normalization
index.add(embeddings)

# Search for top k neighbors for each word.
# k is capped at the vocabulary size: asking for more neighbors than there
# are vectors only pads the results with -1 placeholder indices.
k = min(300, len(words))  # adjust based on how many "similar" words you want per word
D, I = index.search(embeddings, k)  # D=similarities, I=indices of neighbors

In [None]:
# Save filtered pairs
threshold_min = 0.43  # not too unrelated

# Read the grammatical categories out of the DataFrame once. `df` has a
# 0..n-1 index aligned with `words`, so position i in this list is the same
# value df.loc[i, 'cgram'] returned, without a slow pandas scalar lookup
# for every candidate pair.
cgrams = df['cgram'].tolist()

# Output format: one JSON array ["Word1", "Word2"] per line.
with open("app/src/main/assets/pairs.json", "w", encoding="utf-8") as f:
    for i, word in enumerate(words):
        for j, sim in zip(I[i], D[i]):
            if i >= j:  # avoid duplicate pairs and self-pair (also skips -1 placeholders)
                continue
            if not threshold_min < sim:  # too unrelated
                continue
            # Only write if cgram is the same (type of word)
            if cgrams[i] != cgrams[j]:
                continue
            pair = [word.capitalize(), words[j].capitalize()]
            f.write(json.dumps(pair, ensure_ascii=False) + "\n")

## Test

In [None]:
x = 10  # number of random pairs to print

# The file holds one JSON array per line; blank lines are skipped so a stray
# empty line does not abort the whole check.
with open("app/src/main/assets/pairs.json", "r", encoding="utf-8") as f:
    pairs = [json.loads(line) for line in f if line.strip()]

# Sample at most x pairs (fewer when the file holds fewer than x).
random_pairs = random.sample(pairs, min(x, len(pairs)))
# Report the number of pairs actually sampled, which may be smaller than x.
print(f"{len(random_pairs)} random pairs out of {len(pairs):,}:")
# default=0 keeps this from raising ValueError when no pair was sampled.
max_len = max((len(pair[0]) for pair in random_pairs), default=0) + 5

# Left-align the first word in a fixed-width column so the second words line up.
for pair in random_pairs:
    print(f"{pair[0]:<{max_len}}{pair[1]}")