# Generate data: Tatoeba
This notebook generates clean, filtered data from its data sources. Currently, the only data source is Tatoeba.

Outputs are TSV files in the `data-generated/` folder.

In [None]:
%pip install pandas tqdm spacy

In [None]:
import csv
import gc
import re

import pandas as pd
from tqdm.notebook import tqdm

In [None]:
%%sh
# -p: succeed even if the folder already exists, so this cell can be re-run safely.
mkdir -p data-generated

## Inputs

In [None]:
# Languages whose sentences are kept.
# Keys are the 3-letter ISO language codes used by Tatoeba; values are the 2-letter codes used by Taskbase.
# When no sentence pair files are given (below), every sentence in one of these languages is kept.
# That does not guarantee that each kept sentence has a translation within the kept languages.
WANTED_LANGS = {
    "deu": "DE",
    "ukr": "UK",
}

# Optional whitelists of sentence pairs to keep.
# Useful when generating translation exercises, where only sentences that are guaranteed
# to have a translation should be kept.
# Every language appearing in these pair files must be present in WANTED_LANGS.
# If this list is empty, all sentences in the WANTED_LANGS languages are included.
# Each file is tab-separated with the columns: sentence 1 ID, sentence 1 text, sentence 2 ID, sentence 2 text.
# Tatoeba sentence pair files can be downloaded from https://tatoeba.org/en/downloads
SENTENCE_PAIR_FILES = ["./data-tatoeba/sentences_uk_de.tsv"]

In [None]:
def build_tatoeba_sentences():
    """Build the filtered sentence table and write it to ``data-generated/sentences.tsv``.

    Steps:
      1. Read the Tatoeba master list (``./data-tatoeba/sentences_detailed.csv``).
      2. Keep only the wanted sentences: those listed in ``SENTENCE_PAIR_FILES``
         if that list is non-empty, otherwise those whose language is a key of
         ``WANTED_LANGS``.
      3. Replace "smart" double quotes with a plain ``"``.
      4. Map Tatoeba language codes to the target codes via ``WANTED_LANGS``.
      5. Add the ``translated_from`` and ``word_count`` columns.

    Reads the module-level ``WANTED_LANGS`` and ``SENTENCE_PAIR_FILES``.

    Returns:
        The resulting DataFrame, indexed by sentence ID, with the columns
        ``language``, ``text``, ``author``, ``translated_from`` and ``word_count``.
    """
    print("Tatoeba: building sentences...")

    # Get the master list of sentences.
    df_sentences = pd.read_csv("./data-tatoeba/sentences_detailed.csv", sep="\t",
                               usecols=range(4), names=["id", "language", "text", "author"],
                               index_col="id", quoting=csv.QUOTE_NONE)
    df_sentences = df_sentences.dropna()

    # Filter only wanted sentences
    if SENTENCE_PAIR_FILES:
        # If there are SENTENCE_PAIR_FILES, intersect those sentences with the master list.
        # We need the intersection because the master list includes sentence metadata.
        # Sentence pair files and the master list are updated at different times, so the pair
        # files may contain sentences that are missing from the master list. If those
        # sentences matter, re-download up-to-date datasets.
        print("Filtering sentences by wanted pairs...")
        ids_to_keep = set()
        for pair_file in SENTENCE_PAIR_FILES:
            # QUOTE_NONE, like the master list: sentence texts contain literal quote characters,
            # and an unbalanced one must not swallow the following tabs and newlines.
            df_pairs = pd.read_csv(pair_file, sep="\t", names=["id1", "text1", "id2", "text2"],
                                   quoting=csv.QUOTE_NONE)
            # Keep a pair only if both of its sentences are in the master list.
            both_known = df_pairs.id1.isin(df_sentences.index) & df_pairs.id2.isin(df_sentences.index)
            ids_to_keep.update(df_pairs.id1[both_known])
            ids_to_keep.update(df_pairs.id2[both_known])
        # .copy() so that the column assignments below act on an independent frame.
        df_sentences = df_sentences[df_sentences.index.isin(ids_to_keep)].copy()
    else:
        # If no pair files are given, retain only the wanted languages.
        print(" Filtering sentences by language...")
        df_sentences = df_sentences[df_sentences.language.isin(WANTED_LANGS.keys())].copy()

    print("Replacing 'smart' quotes")
    re_quote = re.compile("[\u201c\u201d\u201e\u201f]")
    df_sentences["text"] = df_sentences.text.map(lambda text: re_quote.sub("\"", text))

    # Map language codes. A language that is not a key of WANTED_LANGS maps to NaN; this can
    # only happen for pair-file sentences in a language that WANTED_LANGS does not list.
    df_sentences["language"] = df_sentences.language.map(WANTED_LANGS)

    print(f"There are {len(df_sentences)} sentences")

    # Populate the "translated_from" field, for completeness.
    # -1 means the base sentence was marked as null in the source dataset.
    # -2 means the base sentence was missing in the source dataset.
    # 0 means this is the root text.
    print("Determining base sentences...")
    csv_base = pd.read_csv("./data-tatoeba/sentences_base.csv", sep="\t",
                           names=["id", "translated_from"], index_col="id",
                           dtype={"translated_from": "object"})
    s_translated_from = csv_base.translated_from.replace("\\N", -1).astype("int64")
    df_sentences = df_sentences.join(s_translated_from, how="left")
    # The left join yields NaN (and therefore a float column) for sentences without a base
    # entry. Cast back to integers after filling so IDs are not written out as e.g. "123.0".
    df_sentences["translated_from"] = df_sentences.translated_from.fillna(-2).astype("int64")

    # Count words
    print("Counting words...")

    def get_word_count(string):
        # TODO: How best to split words in different languages? Should non-word tokens
        # (tokens without any letter) be ignored?
        # str.split() without arguments splits on runs of whitespace and drops the empty
        # strings that leading/trailing whitespace would otherwise contribute.
        return len(string.split())

    df_sentences["word_count"] = df_sentences.text.map(get_word_count)

    # Write out
    df_sentences.to_csv("data-generated/sentences.tsv", sep="\t")
    return df_sentences

# Build the sentence table; later cells read the module-level `df_sentences`.
df_sentences = build_tatoeba_sentences()
# Force a garbage-collection pass (presumably to release the temporary tables read during the build).
gc.collect()

## Create translation relations

In [None]:
def build_translations(sentence_ids, dedup=True):
    """Extract the translation links between kept sentences and write them to disk.

    Reads ``./data-tatoeba/links.csv`` (one tab-separated pair of sentence IDs per
    line), keeps the links whose two ends are both in ``sentence_ids``, and writes
    them to ``data-generated/translations.tsv`` with the columns ``s1`` and ``s2``.

    Args:
        sentence_ids: Container of sentence IDs to keep. It is probed with ``in``
            for every line of the links file, so pass a ``set``.
        dedup: If true, keep only links with ``s1 < s2``. This keeps one direction
            when the links file lists a link in both directions (presumably the
            case for Tatoeba -- confirm against the dataset).

    Returns:
        The DataFrame of kept links (the same table that is written to disk).

    Raises:
        ValueError: If a non-blank line is not exactly two tab-separated integers.
    """
    translations = []

    # Reading the links file manually is much faster than with Pandas
    with open("./data-tatoeba/links.csv", "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing empty line) instead of crashing.
                continue
            s1, s2 = (int(x) for x in line.split("\t"))
            if dedup and s1 >= s2:
                continue
            if s1 in sentence_ids and s2 in sentence_ids:
                translations.append((s1, s2))

    df_translations = pd.DataFrame(translations, columns=["s1", "s2"])
    df_translations.to_csv("data-generated/translations.tsv", sep="\t", index=False)
    return df_translations

# Pass the kept sentence IDs as a set: build_translations tests membership once per link line.
build_translations(set(df_sentences.index))
# Force a garbage-collection pass after the link list has been written.
gc.collect()

## Create vocabulary

In [None]:
def build_vocabulary(sentences=None):
    """Build the vocabulary and sentence-vocabulary tables and write them to disk.

    Each sentence text is split on whitespace; from every token, all characters
    other than word characters, ``-`` and ``'`` are removed. Tokens that are empty
    after this cleanup (pure punctuation) are skipped. Every distinct
    ``(language, word)`` pair gets an integer ID in order of first appearance.

    Writes:
        ``data-generated/sentence-vocabulary.tsv``: columns ``sentence_id``,
            ``vocabulary_id``; one row per distinct (sentence, word) link, sorted.
        ``data-generated/vocabulary.tsv``: columns ``id``, ``language``, ``word``,
            ``length``.

    Args:
        sentences: DataFrame with ``language`` and ``text`` columns whose index
            holds the sentence IDs. Defaults to the module-level ``df_sentences``.
    """
    import re

    if sentences is None:
        sentences = df_sentences

    r_nonalpha = re.compile(r"[^\w\-']")

    word_links = set()
    v2id = {}

    for sentence_id, language, text in zip(sentences.index, sentences.language, sentences.text):
        # str.split() without arguments splits on whitespace runs and yields no empty tokens.
        for token in text.split():
            word = r_nonalpha.sub("", token)
            if not word:
                # The token was pure punctuation: do not record an empty vocabulary word.
                continue
            # IDs are consecutive, so the next free ID is the current vocabulary size.
            vocabulary_id = v2id.setdefault((language, word), len(v2id))
            word_links.add((sentence_id, vocabulary_id))

    # Save sentence-vocabulary table (sorted, so the output does not depend on set order)
    pd.DataFrame(sorted(word_links), columns=["sentence_id", "vocabulary_id"])\
            .to_csv("data-generated/sentence-vocabulary.tsv", sep="\t", index=False)

    # Save vocabulary table
    pd.DataFrame([{"language": language, "word": word, "length": len(word)} for language, word in v2id],
                 index=list(v2id.values()), columns=["language", "word", "length"])\
            .rename_axis("id")\
            .to_csv("data-generated/vocabulary.tsv", sep="\t", index=True)

# Build the vocabulary tables from the module-level `df_sentences` (sentence IDs come from its index).
build_vocabulary()
# Force a garbage-collection pass after the tables have been written.
gc.collect()

## Lemmatize

**NOTE:** This part is not yet used for exercise generation and can therefore be skipped.

Make sure to download the spaCy models for all languages that are loaded into `nlp_models` below.

In [None]:
%%sh
# Transformer pipelines: only needed if the commented-out transformer `nlp_models` block is enabled.
python -m spacy download de_dep_news_trf
python -m spacy download en_core_web_trf
python -m spacy download fr_dep_news_trf
python -m spacy download uk_core_news_trf

# Small pipelines: these are the ones loaded into `nlp_models` by default.
python -m spacy download de_core_news_sm
python -m spacy download en_core_web_sm
python -m spacy download fr_core_news_sm
python -m spacy download uk_core_news_sm

In [None]:
# Reload the sentence table written by build_tatoeba_sentences().
# Columns: id, language, text, author, translated_from, word_count
# No index_col is given, so `id` is a regular column here; build_lemmata() reads it as `sentence.id`.
df_sentences = pd.read_csv("data-generated/sentences.tsv", sep="\t")

In [None]:
import spacy

# spaCy pipelines keyed by the 2-letter language code used in the sentence table.
# The lightweight ("sm") pipelines are loaded here.
#
# Transformer pipelines are available as well, but they are quite slow. To use them,
# swap in these names instead:
#     "DE": "de_dep_news_trf"
#     "EN": "en_core_web_trf"
#     "FR": "fr_dep_news_trf"
#     "UK": "uk_core_news_trf"
nlp_models = {
    language: spacy.load(model_name)
    for language, model_name in (
        ("DE", "de_core_news_sm"),
        ("EN", "en_core_web_sm"),
        ("FR", "fr_core_news_sm"),
        ("UK", "uk_core_news_sm"),
    )
}

In [None]:
def build_lemmata():
    """Lemmatize every sentence in ``df_sentences`` and write the lemma tables.

    Each sentence is run through the spaCy pipeline registered for its language
    in ``nlp_models``. Only alphabetic tokens are considered; their lemmas are
    lower-cased. Every distinct ``(language, lemma)`` pair receives an integer ID
    in order of first appearance.

    Writes ``data-generated/sentence-lemma.tsv`` (columns ``sentence_id``,
    ``lemma_id``) and ``data-generated/lemmata.tsv`` (columns ``id``,
    ``language``, ``word``).

    Returns:
        A tuple ``(lemmata_df, links_df)`` holding the lemma table and the
        sentence-lemma link table.
    """
    lemma_ids = {}                # (language, lemma) -> lemma id
    sentence_lemma_pairs = set()  # {(sentence_id, lemma_id)}

    # For a quick trial run, restrict this to a slice,
    # e.g. df_sentences[df_sentences.language == "EN"].iloc[0:100]
    working_df = df_sentences
    progress = tqdm(working_df.iterrows(), total=len(working_df))
    for _, row in progress:
        doc = nlp_models[row.language](row.text)

        for token in doc:
            if token.is_alpha:
                key = (row.language, token.lemma_.lower())
                if key not in lemma_ids:
                    # IDs are consecutive, so the next free ID equals the current size.
                    lemma_ids[key] = len(lemma_ids)
                sentence_lemma_pairs.add((row.id, lemma_ids[key]))

    # TODO: Filter short or unwanted lemmata?

    # Save sentence-lemma table
    links_df = pd.DataFrame(sentence_lemma_pairs, columns=["sentence_id", "lemma_id"])
    links_df.to_csv("data-generated/sentence-lemma.tsv", sep="\t", index=False)

    # Save lemma table
    records = [{"language": language, "word": word} for language, word in lemma_ids.keys()]
    lemmata_df = pd.DataFrame(records, index=lemma_ids.values())
    lemmata_df = lemmata_df.rename_axis("id")
    lemmata_df.to_csv("data-generated/lemmata.tsv", sep="\t", index=True)

    return lemmata_df, links_df

# Run the lemmatization; keep both returned tables for inspection.
lemmata_df, links_df = build_lemmata()
# Last expression of the cell: displays the lemma table in the notebook.
lemmata_df

In [None]:
# Drop the reference to the loaded spaCy pipelines, then force a garbage-collection pass.
del nlp_models
gc.collect()

In [None]:
# Drop the reference to the sentence table, then force a garbage-collection pass.
del df_sentences
gc.collect()
