# Language Selection

In [1]:
from pathlib import Path

import pandas as pd
import scipy.stats as stats

In [2]:
# Load the per-language table that the cells below filter into subsets.
lang_data = pd.read_csv(filepath_or_buffer="../lang_data/lang_data.csv")

## Union of Languages

In [3]:
# Columns of lang_data that make up the morphological-feature set
# (IDs look like WALS chapter codes — TODO confirm).
MORPH_FEATS = [
    "20A", "21A", "21B", "22A",
    "23A", "24A", "25A", "25B",
    "26A", "27A", "28A", "29A",
]

In [4]:
# Build one subset of lang_data per measure: a language belongs to a subset
# when the relevant column(s) hold a non-missing value.

# At least one of the MORPH_FEATS columns is filled in.
has_morph_feat = lang_data[MORPH_FEATS].notna().any(axis=1)
morph_feat_langs = lang_data[has_morph_feat]

# Single-column subsets: drop the rows where that column is missing.
compl_measure_langs = lang_data.dropna(subset=["ttr"])
h_dep_langs = lang_data.dropna(subset=["h_dep_avg"])
h_codep_langs = lang_data.dropna(subset=["h_codep_avg"])

In [5]:
# Report how many languages each subset contains, one count per line, in the
# order: morphological features, ttr, h_dep_avg, h_codep_avg.
for subset in (morph_feat_langs, compl_measure_langs, h_dep_langs, h_codep_langs):
    print(len(subset))

125
34
45
44


In [14]:
# Union of the four subsets: stack them in a fixed order, then collapse rows
# that occur in more than one subset (the first occurrence is kept, so the
# stacking order determines the row order of the result).
feature_subsets = [morph_feat_langs, compl_measure_langs, h_dep_langs, h_codep_langs]
stacked_langs = pd.concat(feature_subsets, ignore_index=True)
any_feat_langs = stacked_langs.drop_duplicates(ignore_index=True)

In [15]:
# Persist the union next to the notebook, without the positional index column.
any_feat_langs.to_csv(path_or_buf="any_feat_langs.csv", index=False)

In [18]:
# Complement of the union: lang_data rows whose "lang" value does not occur
# in any_feat_langs.
has_any_feature = lang_data["lang"].isin(any_feat_langs["lang"])
no_feat_langs = lang_data[~has_any_feature]

In [20]:
# Persist the languages without any feature, without the index column.
no_feat_langs.to_csv(path_or_buf="no_feat_langs.csv", index=False)

In [25]:
# NOTE(review): ttr_langs.csv is not written by any cell visible in this
# notebook; presumably it is produced elsewhere — confirm before a clean re-run.
ttr_langs = pd.read_csv(filepath_or_buffer="ttr_langs.csv")

In [27]:
# Languages from the union that have no entry in ttr_langs (matched on the
# "lang" column); these are the ones that get a translation job below.
already_in_ttr = any_feat_langs["lang"].isin(ttr_langs["lang"])
langs_to_translate = any_feat_langs[~already_in_ttr]

In [29]:
# Persist the translation work list, without the index column.
langs_to_translate.to_csv(path_or_buf="langs_to_translate.csv", index=False)

## Creating Translation Jobs

In [37]:
# Batch-script template with #SBATCH directives; one script is rendered per
# language via str.format, which fills `{lang}` in both the job name and the
# `-t` argument passed to translate.py. Because str.format is used, any
# literal brace added to this template must be doubled (`{{` / `}}`).
# NOTE(review): `--time=00:60:00` presumably means a 60-minute limit;
# `01:00:00` would be the conventional spelling — confirm with the scheduler.
JOB_TEMPLATE = """#!/bin/bash

#SBATCH --time=00:60:00
#SBATCH --partition=gpu
#SBATCH --job-name={lang}
#SBATCH --mem=64G
#SBATCH --gpus-per-node=1

source ~/thesis/venv/bin/activate
python ~/thesis/translate.py -t {lang}
deactivate

"""

In [38]:
# Render one batch script per language in langs_to_translate and write it to
# ../translation/jobs/translate_<lang>.sh (existing scripts are overwritten).
jobs_dir = Path("../translation/jobs")
# Create the target directory up front so a fresh checkout does not fail with
# FileNotFoundError on the first open().
jobs_dir.mkdir(parents=True, exist_ok=True)

for lang in langs_to_translate["lang"]:
    job_path = jobs_dir / f"translate_{lang}.sh"
    # Explicit UTF-8 and LF line endings: without newline="\n", text mode on
    # Windows would emit CRLF, which breaks the "#!/bin/bash" line when the
    # script is later run on a POSIX host.
    with open(job_path, "w", encoding="utf-8", newline="\n") as f:
        f.write(JOB_TEMPLATE.format(lang=lang))

In [34]:
# Reorder the work list into ascending order of its "lang" column.
# NOTE(review): this cell sits after job generation and its execution count
# precedes the cells above; consider sorting where langs_to_translate is built.
langs_to_translate = langs_to_translate.sort_values(by="lang")

In [36]:
# Overwrite the earlier langs_to_translate.csv with the sorted work list.
langs_to_translate.to_csv(path_or_buf="langs_to_translate.csv", index=False)