In [11]:
import os
import re
import joblib
import zipfile
import pandas as pd
from tqdm.notebook import tqdm

In [90]:
# Disable pandas' chained-assignment check (SettingWithCopyWarning) for the whole
# session. NOTE(review): this is a global switch, so it also hides the warning for
# any genuinely unintended chained assignment elsewhere in the notebook.
pd.options.mode.chained_assignment = None

Clone idiomata_cognitor

In [None]:
# Make sure idiomata_cognitor/model.pkl is available: clone the repository if its
# directory is missing, then extract model.zip into that directory.
if not os.path.exists("idiomata_cognitor/model.pkl"):
    if not os.path.exists("idiomata_cognitor"):
        # os.system returns the command's exit status; stop here on a failed clone
        # rather than failing later on a missing model.zip.
        clone_status = os.system('git clone https://github.com/transducens/idiomata_cognitor.git')
        if clone_status != 0:
            raise RuntimeError(f"git clone of idiomata_cognitor failed (exit status {clone_status})")
    with zipfile.ZipFile("idiomata_cognitor/model.zip", 'r') as zip_ref:
        zip_ref.extractall("idiomata_cognitor")
else:
    print("Idiomata_cognitor already exists")

Load model for language classification

In [3]:
# Language name for each class id. Keys are floats (1.0 .. 10.0) assigned in the
# order listed; presumably these are the ids predicted by the model loaded below.
labels = {
    float(class_id): language
    for class_id, language in enumerate(
        (
            'Spanish',
            'Catalan',
            'Aragonese',
            'Aranese',
            'Occitan',
            'Asturian',
            'Galician',
            'Italian',
            'French',
            'Portuguese',
        ),
        start=1,
    )
}
# Load the classifier from the extracted model file.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load model.pkl from a source you trust.
clf2 = joblib.load("idiomata_cognitor/model.pkl")

Define utility functions

In [92]:
def read_data(src_path, trg_path):
    """Read two UTF-8 text files and return their lines with surrounding whitespace stripped.

    Args:
        src_path: Path of the first (source) file.
        trg_path: Path of the second (target) file.

    Returns:
        A tuple ``(input_texts, references)``: one list of stripped lines per file,
        in file order. No check is made that both lists have the same length.
    """
    def _stripped_lines(path):
        with open(path, 'r', encoding='utf-8') as handle:
            return [raw_line.strip() for raw_line in handle]

    return _stripped_lines(src_path), _stripped_lines(trg_path)

def remove_punctuations_and_spaces(text: str) -> str:
    """Normalise text by deleting ASCII punctuation and all whitespace, then lower-casing.

    Args:
        text: The string to normalise.

    Returns:
        ``text`` without ASCII punctuation characters and without whitespace,
        converted to lower case. Non-ASCII characters are left untouched.
    """
    # One character class listing every ASCII punctuation character. The characters
    # `-`, `[`, `\` and `]` are escaped so that none of them forms a range or closes
    # the class early; an unescaped `]` would end the class and turn the rest of the
    # pattern into ordinary regex syntax.
    punctuation_pattern = r'[!"#$%&\'()*+,\-./:;<=>?@\[\\\]^_`{|}~]'

    # Remove punctuation from the text
    text_without_punctuations = re.sub(punctuation_pattern, '', text)

    # Runs of any whitespace (spaces, tabs, newlines)
    spaces_pattern = r'\s+'

    # Remove whitespace and lower-case the remainder
    cleaned_text = re.sub(spaces_pattern, '', text_without_punctuations).lower()

    return cleaned_text

def deduplicate_df(df: pd.DataFrame, col: str) -> pd.DataFrame:
    """Drop rows whose ``col`` text repeats an earlier row after normalisation.

    Two values count as duplicates when ``remove_punctuations_and_spaces`` maps them
    to the same string; the first occurrence is kept.

    Args:
        df: Frame to deduplicate. It is modified in place.
        col: Name of the text column to compare on.

    Returns:
        The same ``df`` object, with duplicate rows removed. Index labels of the
        remaining rows are not reset.
    """
    processed_col = f"processed_{col}"
    df[processed_col] = df[col].apply(remove_punctuations_and_spaces)
    # Compare on the normalised helper column; comparing on the raw column would
    # ignore the normalisation entirely.
    df.drop_duplicates(subset=[processed_col], keep="first", inplace=True)
    # The helper column is only needed for the comparison.
    df.drop(labels=[processed_col], inplace=True, axis=1)

    return df

def language_filteration(df: pd.DataFrame, cols: list, lang_ids: list, step: int = 10_000) -> pd.DataFrame:
    """Keep only the rows whose text columns are all predicted as the expected language ids.

    Args:
        df: Frame holding the text columns.
        cols: Names of the columns to classify.
        lang_ids: Expected class id for each entry of ``cols``, in the same order.
        step: Number of rows handed to ``clf2.predict`` per call.

    Returns:
        The rows of ``df`` for which every column in ``cols`` received its expected
        prediction. ``df`` itself is not modified.
    """
    preds = {col: [] for col in cols}
    for start_idx in tqdm(range(0, len(df), step)):
        for col in cols:
            # .iloc makes the batching explicitly positional, whatever df's index is.
            batch = df[col].iloc[start_idx: start_idx + step]
            preds[col].extend(clf2.predict(batch.values.astype('U')))

    # Predictions come back in row order, so attach df's own index to them. A default
    # 0..n-1 index would not line up with df's labels when df carries a non-default
    # index (e.g. after rows were dropped), and boolean indexing aligns on labels.
    mask = pd.Series(True, index=df.index)
    for i, (col, pred_ids) in enumerate(preds.items()):
        lang_id = lang_ids[i]
        mask &= (pd.Series(pred_ids, index=df.index) == lang_id)
    filtered_df = df[mask]

    return filtered_df

def dev_stat_filteration(
    df: pd.DataFrame,
    df_dev: pd.DataFrame,
    src_col: str,
    trg_col: str,
    min_len_ratio: float = 0.7,
    max_len_ratio: float = 1.9,
    dev_src_col: str = "src",
) -> pd.DataFrame:
    """Filter sentence pairs by word-count statistics.

    A row is kept when its source text has strictly fewer whitespace-separated words
    than the longest dev-set source text, and its source/target word-count ratio lies
    within ``[min_len_ratio, max_len_ratio]``.

    Args:
        df: Frame with the sentence pairs to filter.
        df_dev: Dev-set frame; only ``dev_src_col`` is read.
        src_col: Name of the source-text column in ``df``.
        trg_col: Name of the target-text column in ``df``.
        min_len_ratio: Smallest accepted source/target word-count ratio.
        max_len_ratio: Largest accepted source/target word-count ratio.
        dev_src_col: Name of the source-text column in ``df_dev``.

    Returns:
        A new frame with the rows of ``df`` that pass the filter, with ``df``'s
        columns and index labels. Neither ``df`` nor ``df_dev`` is modified.
    """
    def _word_count(text: str) -> int:
        return len(text.split())

    # Work on standalone Series so no helper columns are written to the inputs.
    src_len = df[src_col].apply(_word_count)
    trg_len = df[trg_col].apply(_word_count)
    # An empty target gives a ratio of inf (or NaN for 0/0); both fail the range
    # check below, so such rows are dropped.
    len_ratio = src_len / trg_len
    max_dev_src_len = df_dev[dev_src_col].apply(_word_count).max()

    # NOTE: The default len_ratio bounds were predetermined from the dev set and
    # adjusted to give room for error.
    keep = (
        (src_len < max_dev_src_len)
        & (len_ratio >= min_len_ratio)
        & (len_ratio <= max_len_ratio)
    )
    # Index with a plain boolean array so the selection is positional.
    filtered_df = df[keep.to_numpy()]

    return filtered_df


def primary_filteration(df: pd.DataFrame, df_dev: pd.DataFrame, cols: list, lang_ids: list) -> pd.DataFrame:
    """Run the three filtering stages in order and return the surviving rows.

    Stages: deduplication on ``cols[0]``, language filtering on all ``cols`` against
    ``lang_ids``, then the dev-set length filter with ``cols[0]`` as source column
    and ``cols[1]`` as target column. Progress is printed before each stage.
    """
    print("Deduplicating...")
    deduplicated = deduplicate_df(df, cols[0])

    print("Performing language filteration...")
    language_matched = language_filteration(deduplicated, cols, lang_ids)

    print("Performing statistical filteration...")
    source_col, target_col = cols[0], cols[1]
    length_filtered = dev_stat_filteration(language_matched, df_dev, source_col, target_col)

    print("Done!")
    return length_filtered

Load dev set

In [None]:
# Read the two dev files and pair them line by line in a two-column frame.
es_dev_path = 'data/dev/dev.spa_Latn'
# NOTE(review): despite the `oc_` prefix, this path points at the `ast_Latn` file.
oc_dev_path = 'data/dev/dev.ast_Latn'
src_text, trg_text = read_data(es_dev_path, oc_dev_path)
# DataFrame construction requires both lists to have the same length.
df_dev = pd.DataFrame(data={"src": src_text, "trg": trg_text})

Filter datasets

In [None]:
# Filter the wikimedia es-ast corpus and save the result.
# Expected language ids: 1 for the "es" column and 6 for the "ast" column
# (Spanish and Asturian in `labels`).
df_wikimedia = pd.read_csv("es-ast/wikimedia/wikimedia.csv")
print("Start size:", len(df_wikimedia))
filtered_df_wikimedia = primary_filteration(df_wikimedia, df_dev, ["es", "ast"], [1, 6])
print("Final size:", len(filtered_df_wikimedia))
filtered_df_wikimedia.to_csv("filtered_wikimedia.csv", index=False)

In [None]:
# Filter the nllb es-ast corpus and save the result (same steps as for wikimedia).
df_nllb = pd.read_csv("es-ast/nllb/nllb.csv")
# Force both text columns to str. NOTE(review): presumably this guards against
# non-string cells (e.g. missing values) that would break the string helpers;
# such cells are then kept as their string form rather than dropped. Confirm intent.
df_nllb = df_nllb.astype({'es': 'str', "ast": "str"})
print("Start size:", len(df_nllb))
# Expected language ids: 1 for "es" and 6 for "ast" (Spanish and Asturian in `labels`).
filtered_df_nllb = primary_filteration(df_nllb, df_dev, ["es", "ast"], [1, 6])
print("Final size:", len(filtered_df_nllb))
filtered_df_nllb.to_csv("filtered_nllb.csv", index=False)