# Language Data Collection

Here I collect data on the languages in the FLORES+ dataset.

In [2]:
import pandas as pd

## Initializing Language Data

The FLORES+ dataset can be downloaded [here](https://github.com/openlanguagedata/flores).

In [None]:
from flores import Flores

In [None]:
# Directory holding the downloaded FLORES+ release
FLORES_DIR = "../floresp-v2.0-rc.2/"
flores = Flores(FLORES_DIR)

In [None]:
# Each "dev" language tag is split on "_": the first piece is stored as the
# ISO code, the second as the script.
langs = sorted(flores.langs["dev"])
tag_parts = [tag.split("_") for tag in langs]
codes = [parts[0] for parts in tag_parts]
scripts = [parts[1] for parts in tag_parts]

# One row per language tag
lang_data = pd.DataFrame(dict(lang=langs, iso_code=codes, script=scripts))

In [None]:
# Copy the variety and name columns over from flores_langs.csv.
# NOTE(review): the values are aligned by row index, so this presumes the CSV
# lists the languages in the same order as lang_data - confirm.
df_sheet = pd.read_csv("flores_langs.csv")
for column in ("variety", "name"):
    lang_data[column] = df_sheet[column]

In [None]:
# Save the initial language table (without the index column)
lang_data.to_csv(path_or_buf="lang_data.csv", index=False)

## Estimated Portion in Training Data

In [None]:
# Reload the language table saved so far
lang_data = pd.read_csv(filepath_or_buffer="lang_data.csv")

In [None]:
# Only the language tag and its fraction of the total are needed
train_estimates = pd.read_csv("lang_train_size.csv").loc[:, ["lang", "fraction_total"]]

In [None]:
# Left-join the estimates onto the language table, then give the new column
# its final name.
with_estimates = pd.merge(lang_data, train_estimates, how="left", on="lang")
lang_data = with_estimates.rename(columns={"fraction_total": "train_frac_estimate"})

In [None]:
# Languages without an estimate get a training fraction of 0
lang_data = lang_data.fillna({"train_frac_estimate": 0})

In [None]:
# Save the table with the training-fraction estimates added
lang_data.to_csv(path_or_buf="lang_data.csv", index=False)

## WALS Codes

WALS data can be found [here](https://github.com/cldf-datasets/wals).

In [None]:
# Use only the first 2662 rows of the WALS language table.
# NOTE(review): the reason for the 2662-row cutoff is not visible here - confirm.
wals_langs = pd.read_csv("wals_langs.csv").head(2662)

In [None]:
# Look up the WALS ID(s) for every ISO code in lang_data. Each entry is:
#   - None when no WALS language has that ISO code,
#   - the single ID when exactly one WALS language matches,
#   - the list of IDs when several WALS languages share the code.
wals_codes = []

for iso in lang_data["iso_code"]:
    matches = wals_langs.loc[wals_langs["ISO639P3code"] == iso, "ID"].tolist()
    if not matches:
        # None is written to CSV as an empty cell and read back as NaN, which
        # is what the later "rows without a WALS code" check relies on; an
        # empty list would be written as the string "[]" instead.
        wals_codes.append(None)
    elif len(matches) == 1:
        wals_codes.append(matches[0])
    else:
        # NOTE(review): ambiguous codes are kept as a list, which becomes a
        # string such as "['a', 'b']" once saved to CSV - confirm how these
        # should be resolved.
        wals_codes.append(matches)

In [None]:
# Attach the looked-up WALS codes as a new column
lang_data = lang_data.assign(wals_code=wals_codes)

In [None]:
# Save the table with the WALS codes added
lang_data.to_csv(path_or_buf="lang_data.csv", index=False)

## WALS Family & Genus

In [None]:
# Reload the language table (now including wals_code)
lang_data = pd.read_csv(filepath_or_buffer="lang_data.csv")

In [None]:
# Load the first 2662 rows of the WALS language table and rename the columns
# that are joined onto lang_data.
# NOTE(review): the reason for the 2662-row cutoff is not visible here - confirm.
wals_column_names = {"ID": "wals_code", "Family": "family", "Genus": "genus"}
wals_langs = pd.read_csv("wals_langs.csv").head(2662).rename(columns=wals_column_names)

In [None]:
# Left-join family and genus onto the language table via the WALS code
genus_family = wals_langs.loc[:, ["wals_code", "family", "genus"]]
lang_data = pd.merge(lang_data, genus_family, how="left", on="wals_code")

In [None]:
# Save the table with family and genus added
lang_data.to_csv(path_or_buf="lang_data.csv", index=False)

## WALS Morphological Features

In [None]:
# Load the language table and the WALS feature values
lang_data, wals_feats = (
    pd.read_csv(csv_name) for csv_name in ("lang_data.csv", "wals_feats.csv")
)

In [None]:
# Drop feature rows whose language does not appear in lang_data
is_known_language = wals_feats["Language_ID"].isin(lang_data["wals_code"])
wals_feats = wals_feats[is_known_language]

In [None]:
# WALS parameter IDs of the morphological features to keep (as per Park et al.)
MORPH_FEATS = [
    "20A",
    "21A", "21B",
    "22A",
    "23A",
    "24A",
    "25A", "25B",
    "26A",
    "27A",
    "28A",
    "29A",
]

# Keep only the rows that belong to one of the morphological features
is_morph_feature = wals_feats["Parameter_ID"].isin(MORPH_FEATS)
wals_feats = wals_feats[is_morph_feature]

In [None]:
# Add one column per morphological feature to lang_data. The value for a
# language is looked up in wals_feats by the ID "<feature>-<wals_code>".
for idx, row in lang_data.iterrows():
    wals_code = row["wals_code"]
    if isinstance(wals_code, float):  # NaN: this row has no WALS code
        continue
    for feat in MORPH_FEATS:
        value_id = f"{feat}-{wals_code}"
        values = wals_feats.loc[wals_feats["ID"] == value_id, "Value"].values
        # Check the length explicitly: truth-testing a NumPy array is ambiguous
        # (it fails for empty or multi-element arrays and would treat a single
        # falsy value as missing).
        value = values[0] if len(values) > 0 else None
        lang_data.loc[idx, feat] = value

# Persist the feature columns: the following sections reload lang_data.csv,
# so without this save the columns added above would be lost.
lang_data.to_csv("lang_data.csv", index=False)

## Complexity Measures

[Paper](https://www.degruyter.com/document/doi/10.1515/lingvan-2021-0007/html?lang=en#j_lingvan-2021-0007_ref_059) | [Repository](https://github.com/coltekin/mcomplexity)

In [None]:
# Reload the language table before adding the complexity scores
lang_data = pd.read_csv(filepath_or_buffer="lang_data.csv")

In [None]:
# Read the tab-separated complexity scores, rename the join key to match
# lang_data, and drop the treebank column.
treebank_scores = pd.read_csv("complexity-scores.txt", sep="\t")
treebank_scores = treebank_scores.rename(columns={"WALS_code": "wals_code"})
treebank_scores = treebank_scores.drop(columns=["treebank"])

# Languages with multiple treebanks are collapsed to one row by averaging
compl_scores = treebank_scores.groupby("wals_code", as_index=False).mean()

In [None]:
# Left-join the complexity scores onto the language table via the WALS code
lang_data = pd.merge(lang_data, compl_scores, how="left", on="wals_code")

In [None]:
# Save the table with the complexity scores added
lang_data.to_csv(path_or_buf="lang_data.csv", index=False)

## Type-Token Ratio on FLORES

In [None]:
from lexicalrichness import LexicalRichness

In [None]:
# Reload the language table before computing type-token ratios
lang_data = pd.read_csv(filepath_or_buffer="lang_data.csv")

In [None]:
# Type-token ratio of each language's FLORES+ dev file, in lang_data row order
ttr_flores = []

for lang_tag in lang_data["lang"]:
    dev_path = f"../floresp-v2.0-rc.2/dev/dev.{lang_tag}"
    with open(dev_path, encoding="utf-8") as dev_file:
        dev_text = dev_file.read()
    # The file is already closed here; only the text is needed for the ratio
    ttr_flores.append(LexicalRichness(dev_text).ttr)

In [None]:
# Attach the per-language type-token ratios as a new column
lang_data = lang_data.assign(ttr_flores=ttr_flores)

In [None]:
# Save the table with the FLORES type-token ratios added
lang_data.to_csv(path_or_buf="lang_data.csv", index=False)

## Syntactic Distances

In [None]:
git clone https://github.com/antonisa/lang2vec
cd lang2vec
python3 setup.py install

In [1]:
import lang2vec.lang2vec as l2v

In [13]:
# Reload the language table before adding syntactic distances
lang_data = pd.read_csv(filepath_or_buffer="lang_data.csv")

In [18]:
# lang2vec "syntactic" distance between "eng" and every language in lang_data
iso_codes = lang_data["iso_code"].tolist()

d_syn = []
for target_code in iso_codes:
    d_syn.append(l2v.distance("syntactic", "eng", target_code))

lang_data = lang_data.assign(d_syn=d_syn)

In [23]:
# Save the table with the syntactic distances added
lang_data.to_csv(path_or_buf="lang_data.csv", index=False)

## Word Order Entropy

In [10]:
# Load the word order entropy table
entropy_data = pd.read_csv(filepath_or_buffer="word_order_entropy.csv")

In [39]:
# Reload the language table before adding word order entropies
lang_data = pd.read_csv(filepath_or_buffer="lang_data.csv")

In [23]:
# Mapping from short language codes to three-letter ISO 639-3 style codes.
# Most keys are two-letter ISO 639-1 codes; the three-letter keys are mapped
# to themselves.
# NOTE(review): some targets (e.g. "ara", "fas", "zho", "lav", "nor") look like
# macrolanguage codes - confirm they match the iso_code values in lang_data,
# otherwise a join on iso_code finds no match for them.
_TWO_LETTER_CODES = {
    "af": "afr", "ar": "ara", "be": "bel", "bg": "bul", "ca": "cat", "cs": "ces",
    "cu": "chu", "da": "dan", "de": "deu", "el": "ell", "en": "eng", "es": "spa",
    "et": "est", "eu": "eus", "fa": "fas", "fi": "fin", "fr": "fra", "ga": "gle",
    "gl": "glg", "he": "heb", "hi": "hin", "hr": "hrv", "hu": "hun", "id": "ind",
    "it": "ita", "ja": "jpn", "kk": "kaz", "ko": "kor", "la": "lat", "lt": "lit",
    "lv": "lav", "mr": "mar", "nl": "nld", "no": "nor", "pl": "pol", "pt": "por",
    "ro": "ron", "ru": "rus", "sa": "san", "sk": "slk", "sl": "slv", "sr": "srp",
    "sv": "swe", "ta": "tam", "te": "tel", "tr": "tur", "ug": "uig", "uk": "ukr",
    "ur": "urd", "vi": "vie", "zh": "zho",
}
_UNCHANGED_CODES = ("bxr", "cop", "got", "grc", "hsb", "kmr", "sme", "swl", "yue")

ISO_MAP = {**_TWO_LETTER_CODES, **{code: code for code in _UNCHANGED_CODES}}

In [None]:
# Column names of the dependency entropies
DEP_COLS = [
    "adp_Noun",
    "aux_Verb",
    "nsubjNOUN_Pred", "nsubjPRON_Pred",
    "objNOUN_Pred", "objPRON_Pred",
    "oblNOUN_Pred", "oblPRON_Pred",
    "nmodNOUN_Noun", "nmodPRON_Noun",
    "amod_Noun",
    "advmod_Verb", "advmod_Adj",
    "cop_Pred",
    "nummod_Noun",
    "det_Noun",
    "advcl_Main",
    "acl_Noun",
    "ccomp_Main",
    "csubj_Main",
    "mark_advcl", "mark_ccomp",
]

# Column names of the codependency entropies
CODEP_COLS = ["nsubj_obj", "obj_obl"]

In [19]:
# Translate the "Lang" codes through ISO_MAP; codes missing from the map
# become NaN.
entropy_data = entropy_data.assign(iso_code=entropy_data["Lang"].map(ISO_MAP))

In [None]:
# Split the entropy table into dependency and codependency parts. Copies are
# taken so the average columns are added to independent frames rather than to
# slices of entropy_data.
h_dep = entropy_data[[*DEP_COLS, "iso_code"]].copy()  # Dependency entropy
h_codep = entropy_data[[*CODEP_COLS, "iso_code"]].copy()  # Codependency entropy

# Row-wise average over the entropy columns only: iso_code is a string label
# and must not take part in the mean (recent pandas versions raise instead of
# silently dropping non-numeric columns).
h_dep["h_dep_avg"] = h_dep[DEP_COLS].mean(axis=1)
h_codep["h_codep_avg"] = h_codep[CODEP_COLS].mean(axis=1)

In [48]:
# Left-join both entropy averages onto the language table via the ISO code
for averages in (h_dep[["iso_code", "h_dep_avg"]],
                 h_codep[["iso_code", "h_codep_avg"]]):
    lang_data = pd.merge(lang_data, averages, how="left", on="iso_code")

In [51]:
# Save the table with the word order entropy averages added
lang_data.to_csv(path_or_buf="lang_data.csv", index=False)

## Column Reordering

In [52]:
# Reload the complete language table for the final column ordering
lang_data = pd.read_csv(filepath_or_buffer="lang_data.csv")

In [54]:
# Final column order, built up group by group. Selecting with this list also
# drops any column that is not named in it.
ordered_columns = [
    # Identification
    "lang", "iso_code", "wals_code", "script", "variety", "name",
    "family", "genus",
    # Distance to English and training-data estimate
    "d_syn", "train_frac_estimate",
    # WALS morphological features
    "20A", "21A", "21B", "22A", "23A", "24A", "25A", "25B", "26A", "27A",
    "28A", "29A",
    # Type-token ratio on FLORES
    "ttr_flores",
    # Complexity scores and their "_sd" companions
    "ttr", "ttr_sd", "msp", "msp_sd", "ws", "ws_sd", "wh", "wh_sd",
    "lh", "lh_sd", "is", "is_sd", "mfh", "mfh_sd", "-ia", "-ia_sd",
    # Word order entropy averages
    "h_dep_avg", "h_codep_avg",
]
lang_data = lang_data[ordered_columns]

In [55]:
# Save the final table with the reordered columns
lang_data.to_csv(path_or_buf="lang_data.csv", index=False)