In [None]:
from tqdm import tqdm
import pandas as pd
import random
import spacy
import re

# Register tqdm's `progress_apply` on pandas objects; it is used further down
# when the root of every lemma is computed row by row.
tqdm.pandas()
# Load the spaCy pipeline once. It is reused below to parse words before
# comparing them with `.similarity`.
nlp = spacy.load("fr_core_news_lg")

In [None]:
def family_root(lemme, cgram):
    """Return a crude "family root" for a lemma, used to group related words.

    The lemma is lower-cased, then a trailing suffix is stripped depending on
    its grammatical category:

    * ``'VER'``: a final ``er``, ``ir`` or ``re`` is removed.
    * ``'ADJ'``: a final ``é``, ``ée``, ``i`` or ``ant`` is removed.
    * ``'NOM'`` and every other category: nothing is removed.

    Args:
        lemme: The lemma as a string.
        cgram: Grammatical category code of the lemma.

    Returns:
        The lower-cased lemma, minus the category-specific suffix if one
        matched at the end of the word.
    """
    # One end-anchored pattern per category that gets stemmed; categories that
    # are absent from this table (nouns included) are returned untouched.
    suffix_by_cgram = {
        'VER': r'(er|ir|re)$',        # infinitive endings
        'ADJ': r'(é|ée|i|ant)$',      # participle/adjective endings
    }

    lowered = lemme.lower()
    suffix_pattern = suffix_by_cgram.get(cgram)
    if suffix_pattern is None:
        return lowered
    return re.sub(suffix_pattern, '', lowered)

## Word dataset

In [None]:
# Columns read from the lexicon; every other column is discarded.
columns_to_keep = ['lemme', 'cgram', 'freqlemfilms2', 'freqlemlivres']

# When several words share a root, nouns win over verbs, verbs over adjectives.
priority = {'NOM': 0, 'VER': 1, 'ADJ': 2}

df = (
    pd.read_csv("lexique.tsv", sep='\t')
    .loc[:, columns_to_keep]
    # Merge the two frequency columns into a single one (their sum).
    .assign(freq=lambda d: d['freqlemfilms2'] + d['freqlemlivres'])
    .drop(columns=['freqlemfilms2', 'freqlemlivres'])
    # Keep a single row per lemma: the one with the highest summed frequency.
    # Note: this happens before the category filter below, so a lemma whose
    # most frequent row belongs to another category is dropped entirely.
    .pipe(lambda d: d.loc[d.groupby('lemme')['freq'].idxmax()])
    # Drop rare lemmas.
    .loc[lambda d: d['freq'] >= 10]
    # Drop very short lemmas.
    .loc[lambda d: d['lemme'].str.len() >= 3]
    # Keep only verbs, nouns and adjectives.
    .loc[lambda d: d['cgram'].isin(['VER', 'NOM', 'ADJ'])]
    # Temporary helper columns used to keep one word per family.
    .assign(
        root=lambda d: d.progress_apply(lambda row: family_root(row['lemme'], row['cgram']), axis=1),
        priority=lambda d: d['cgram'].map(priority),
    )
    # Within each root: best category first, then highest frequency first.
    .sort_values(by=['root', 'priority', 'freq'], ascending=[True, True, False])
    # The first row of each root is the representative of its family.
    .drop_duplicates(subset=['root'], keep='first')
    # Remove the helper columns and renumber the rows from 0.
    .drop(columns=['root', 'priority'])
    .reset_index(drop=True)
)

In [None]:
# Accumulates the "word - word" strings produced by the pairing loop below.
pairs = list()

def are_similar(word1, word2, threshold=0.5):
    """Tell whether two parsed words are strictly more similar than `threshold`.

    Args:
        word1: Object exposing ``similarity(other)``; the caller passes the
            result of ``nlp(...)``.
        word2: The object ``word1`` is compared against.
        threshold: Similarity score that must be strictly exceeded.

    Returns:
        The result of ``word1.similarity(word2) > threshold``.
    """
    score = word1.similarity(word2)
    return score > threshold

# The cleaned dataset stores its words in the 'lemme' column; there is no
# 'word' column, so reading df['word'] raised a KeyError.
words = df['lemme'].tolist()

# Parse every word exactly once. Calling nlp() inside the nested loop re-ran
# the whole spaCy pipeline for each of the O(n^2) comparisons.
docs = list(nlp.pipe(words))

# Compare each unordered pair (i < j) once and record the similar ones.
for i, doc_i in enumerate(tqdm(docs, desc="Pairing words")):
    for j in range(i + 1, len(docs)):
        if are_similar(doc_i, docs[j]):
            pairs.append(f"{words[i]} - {words[j]}")

# Note: no seed is set, so the order of the pairs differs from run to run.
random.shuffle(pairs)

# One pair per line, UTF-8 encoded so accented characters are preserved.
with open("pairs.txt", "w", encoding="utf-8") as f:
    f.writelines(f"{pair}\n" for pair in pairs)

print(f"Generated {len(pairs)} word pairs.")