In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Input files, given relative to the notebook's working directory.
path_words_gf = Path("../data/wikimini/Words.gf")  # GF file used as the reference
path_ar_tsv = Path("../data/raw/arabic.tsv")  # TSV file whose records are checked

In [3]:
# POS tags (the suffix after the last "_" of a token) to leave out of the comparison.
pos_neglect = {"PN", "LN", "SN", "GN"}

In [4]:
def tsv_vs_gf_ar(
    tsv_path: Path, gf_path: Path, pos_neglect: "set | frozenset" = frozenset()
) -> pd.DataFrame:
    """Compares a TSV file with a GF file and extracts the records present in both.

    Every non-blank line of the TSV file must hold exactly four tab-separated fields: the first is taken as the English
    token and the third as the Arabic text (the other two are ignored).  A record is kept when its token occurs in the GF
    file and its POS tag (the part of the token after the last underscore) is not in ``pos_neglect``.  Blank lines are
    skipped, and a token without any underscore is treated as having no POS tag.

    GF identifiers are read by skipping the first two lines and the last line of the GF file and dropping the final
    character of each remaining stripped line.  NOTE(review): this presumably matches a two-line header, a closing line
    and a one-character terminator per entry -- confirm against the actual Words.gf layout.

    Both files are read as UTF-8 so that the result does not depend on the platform's default encoding.

    Args:
        tsv_path (Path): The path to the TSV file that contains the records to be checked.
        gf_path (Path): The path to the GF file which serves as a reference for comparison.
        pos_neglect (set, optional): Set of POS (parts-of-speech) tags to be neglected when comparing records. Defaults to
        an empty (immutable) set.

    Returns:
        pd.DataFrame: A dataframe with columns 'ar' and 'en' holding the records present in both files.  It is indexed by
        'li', the zero-based line index of the record in the TSV file, and sorted by that index.

    Raises:
        ValueError: If a non-blank TSV line does not contain exactly four tab-separated fields.
    """
    dict_tsv = {"ar": [], "en": [], "li": []}

    # Read files, gf and tsv
    with open(tsv_path, encoding="utf-8") as tsv_obj:
        list_tsv = tsv_obj.readlines()
    with open(gf_path, encoding="utf-8") as gf_obj:
        set_gf = {line.strip()[:-1] for line in gf_obj.readlines()[2:-1]}

    # save records that are found in both gf and tsv
    for i, line_tsv in enumerate(list_tsv):
        # Blank lines (e.g. a trailing empty line) carry no record; skipping them keeps `i` aligned with the file.
        if not line_tsv.strip():
            continue

        fields = line_tsv.split("\t")
        if len(fields) != 4:
            raise ValueError(
                f"{tsv_path}: line index {i} has {len(fields)} tab-separated fields, expected 4"
            )
        en_token, _, ar_words, _ = fields

        # rpartition never raises: `sep` is empty when the token has no underscore, i.e. no POS tag to test.
        _, sep, pos = en_token.rpartition("_")
        if (sep and pos in pos_neglect) or en_token not in set_gf:
            continue

        dict_tsv["en"].append(en_token)
        dict_tsv["ar"].append(ar_words)
        dict_tsv["li"].append(i)

    return pd.DataFrame(dict_tsv).set_index("li").sort_index()

In [5]:
# Build the table of TSV records whose token also occurs in the GF file.
df_ar_en_words = tsv_vs_gf_ar(
    tsv_path=path_ar_tsv,
    gf_path=path_words_gf,
    pos_neglect=pos_neglect,
)

In [6]:
# Preview the first 10 rows of the result.
df_ar_en_words.head(10)

Unnamed: 0_level_0,ar,en
li,Unnamed: 1_level_1,Unnamed: 2_level_1
292,مطلق,absolute_3_A
1168,إِدارِيّ,administrative_A
1558,أفْغانِيّ,afghani_1_N
1596,لغة أفريكانية,afrikaans_N
1643,سِنّ,age_1_N
1973,الأكانية,akan_N
1987,ألاباما,alabama_4_N
2019,الألبانية,albanian_2_N
2085,الأليوتية,aleut_N
2657,الأمهرية,amharic_N


In [7]:
# Report how many rows the result holds (shape[0] is the row count).
print(f"Number of Arabic words found in Words.gf: {df_ar_en_words.shape[0]}")

Number of Arabic words found in Words.gf: 479


In [23]:
# Save the matched records; the file is tab-separated (sep="\t") despite its .csv extension.
# NOTE(review): assumes the ../results directory already exists -- confirm before a fresh run.
df_ar_en_words.to_csv("../results/ar2en_words_gf.csv", sep="\t")

In [12]:
def sp(text: str) -> str:
    """Turns a token such as ``absolute_3_A`` into its plain, space-separated word form.

    The part after the last underscore (the POS tag) is dropped.  If the remaining stem ends with an underscore-separated
    segment made only of decimal digits (a sense number), that segment is dropped too.  The remaining underscores are
    replaced by spaces.

    Only a whole all-digit segment counts as a sense number, so ``mp3_N`` gives ``mp3`` and ``vitamin_b12_N`` gives
    ``vitamin b12``; a stem that consists of a single segment is never removed.

    Args:
        text (str): The token to convert.

    Returns:
        str: The word form of the token.  A token without any underscore is returned unchanged.
    """
    stem, sep, _pos = text.rpartition("_")
    if not sep:
        # No underscore at all: there is no POS tag to strip.
        return text

    parts = stem.split("_")
    # Test the whole trailing segment, not just its last character, and keep a lone segment intact.
    if len(parts) > 1 and parts[-1].isdecimal():
        parts.pop()
    return " ".join(parts)

In [14]:
# Derive the plain word form of every token and store it in the "en_" column.
df_ar_en_words["en_"] = df_ar_en_words["en"].map(sp)

In [15]:
# Save the frame under a second file name, again tab-separated despite the .csv extension.
# If the cell that adds the "en_" column has run, that column is written as well.
df_ar_en_words.to_csv("../results/ar2en_words_gf_.csv", sep="\t")