# Similar words generation

This notebook generates semantically similar German words with the help of [spaCy](https://spacy.io/).

**Prerequisites:** Run the Generate data notebook.

**Input:** Provide a `data-import/in-words.tsv` file holding the target words.

**Output:** A list of similar words stored in `data-generated/similar-words.tsv`. Optionally, the list should also be quality controlled afterwards to improve the data set.

In [None]:
%pip install pandas textdistance tqdm spacy

In [None]:
import pandas as pd
from tqdm.notebook import tqdm
import spacy

In [None]:
%%sh
python -m spacy download de_core_news_lg

In [None]:
# Languages for which similar words are generated.
wanted_languages = ["DE"]
# load the large model which also includes word embeddings
nlp = spacy.load("de_core_news_lg")
# keep_default_na=False: by default pandas parses tokens such as "null", "NA" or "nan"
# as missing values, which would silently replace real vocabulary entries
# (e.g. the German word "null") with NaN.
df_words = pd.read_csv("data-generated/vocabulary.tsv", sep="\t", keep_default_na=False)
# Number of vocabulary entries per language.
counts = df_words.language.value_counts()

In [None]:
# Lookup table: (language, character length of a word) -> set of parsed spaCy docs.
words_by_lang_length = {}

print("Loading all word embeddings for desired languages.")

# Parsing every word once up front is slow, but it pays off later: the pairwise
# comparisons can reuse the parsed docs instead of re-parsing the same words again and again.
for lang, grp in df_words.groupby("language"):
    if lang in wanted_languages:
        print(f"Loading {lang} words")

        for word in tqdm(grp.word.astype(str)):
            word_len = len(word)
            # words shorter than three characters are ignored
            if word_len >= 3:
                bucket = words_by_lang_length.setdefault((lang, word_len), set())
                bucket.add(nlp(word))

In [None]:
def find_similar_words(
        language,
        source_word,
        length_difference=10,
        semantical_similarity_threshold=0.7,
        stop_after=10,
        debug_print=False
):
    similar_words = set()
    source_len = len(source_word)
    source_word = nlp(source_word)

    for target_len in range(max(source_len - length_difference, 3), source_len + length_difference + 1):
        candidate_words = words_by_lang_length.get((language, target_len), set())

        if debug_print:
            print(f"checking {target_len}. Current state: {similar_words}. Threshold: {semantical_similarity_threshold}")

        for candidate_word in candidate_words:

            s_raw = source_word.text
            c_raw = candidate_word.text

            # make sure we don't select the same word just with different capitalization
            if s_raw.lower() == c_raw.lower():
                continue

            # Skip similar words which include each other
            if s_raw.lower() in c_raw.lower() or c_raw.lower() in s_raw.lower():
                continue

            # make sure both words have the same capitalization
            if (s_raw[0].isupper() and not c_raw[0].isupper()) or (s_raw[0].islower() and not c_raw[0].islower()):
                continue

            sw = source_word
            cw = candidate_word

            # make sure we use the same part of speech or tag
            if sw[0].pos_ != cw[0].pos_ or sw[0].tag_ != cw[0].tag_:
                continue

            # skip same lemmas. For "gehe" we should not get "gehst" as similar word.
            if sw[0].lemma_ == cw[0].lemma_:
                continue

            # semantical similarity check
            if sw.similarity(cw) < semantical_similarity_threshold:
                continue

            similar_words.add(c_raw)

            if 0 < stop_after <= len(similar_words):
                return similar_words

    if  len(similar_words) < stop_after and semantical_similarity_threshold > 0:
        if debug_print:
            print("Too few results, retrying and loweing threshold by -0.05")

        return find_similar_words(language,
                                  source_word,
                                  length_difference,
                                  semantical_similarity_threshold=semantical_similarity_threshold-0.05,
                                  stop_after=stop_after,
                                  debug_print=debug_print
                                  )
    else:
        return similar_words

In [None]:
# keep_default_na=False: by default pandas parses tokens such as "null", "NA" or "nan"
# as missing values; such an input word would arrive as a float NaN and break the search.
# astype(str) guarantees that every entry is a string, as for the vocabulary.
words = pd.read_csv("data-import/in-words.tsv", sep="\t", keep_default_na=False).word.astype(str)

print(f"Finding similar words for {wanted_languages}")

# One dict per input word: {"word": ..., "similar_0": ..., "similar_1": ..., ...}
results = []

for source_word in tqdm(words):
    similar_words = find_similar_words(
        "DE",
        source_word,
        length_difference=10,
        semantical_similarity_threshold=0.7,
        stop_after=5,
        debug_print=False
    )
    print(f"For '{source_word}' found {similar_words}")
    out = {
        "word": source_word,
    }

    # Sets have no stable iteration order; sort so that the column assignment
    # in the output file is reproducible between runs.
    for i, w in enumerate(sorted(similar_words)):
        out[f"similar_{i}"] = w

    results.append(out)

df = pd.DataFrame(results)
df.to_csv("data-generated/similar-words.tsv", sep="\t", index=False)
# Display the resulting table as the cell output.
df