# Initial multilingual baseline (no training)

In [2]:
from models import models

# Resolve the checkpoint name registered under the "multilingual" key; the bare
# expression on the last line displays it as the cell output.
model_name = models["multilingual"]
model_name

'FacebookAI/xlm-roberta-base'

In [None]:
from pair_encoder import PairEncoder

# Build the PairEncoder evaluated in this section (no training is run on it here).
model = PairEncoder(
    model_name=model_name, max_length=200, device="cuda", seed=42
)

In [4]:
from util import get_langs
# Language codes to evaluate; the same list drives every per-language loop below.
langs = get_langs()
langs

['amh', 'arq', 'ary', 'eng', 'esp', 'hau', 'kin', 'mar', 'tel']

In [None]:
from pair_encoder.evaluation import CorrelationEvaluator, get_correlation
import pandas as pd


from util import get_data, get_pairs, eval_and_submit

def eval_lang(model, lang, save_name):
    """Compute dev/test correlations for one language, then call ``eval_and_submit``.

    Args:
        model: Pair encoder handed to ``get_correlation`` and ``eval_and_submit``.
        lang: Language code forwarded to ``get_data`` and ``eval_and_submit``.
        save_name: Suffix of the ``"{lang}-test-{save_name}"`` timestamp passed
            to ``eval_and_submit``.

    Returns:
        dict: ``{"dev": ..., "test": ...}`` holding whatever ``get_correlation``
        returns for the respective split.

    Note:
        Reads the notebook-level ``model_name`` instead of taking it as an
        argument, so that variable must be defined before this is called.
    """
    # Load both splits first, then score them, in the same order as before.
    dev_pairs = get_pairs(get_data(lang=lang, train=False))
    test_pairs = get_pairs(get_data(lang=lang, test=True))

    dev_corr = get_correlation(test=dev_pairs, pair_encoder=model)
    test_corr = get_correlation(test=test_pairs, pair_encoder=model)

    eval_and_submit(
        pair_encoder=model,
        lang=lang,
        model_name=model_name,
        timestamp=f"{lang}-test-{save_name}",
        evaluation_phase=True,
    )
    return {"dev": dev_corr, "test": test_corr}

# Evaluate the untrained encoder on every language; printing the code gives
# coarse progress feedback while eval_lang runs.
baseline_no_train = {}
for lang in langs:
    print(lang)
    baseline_no_train[lang] = eval_lang(model, lang, save_name="baseline-notrain")

In [6]:
# Rows are languages, columns are the splits returned by eval_lang; every column
# is tagged with the run name so the tables of different runs can be stacked later.
df_prefix = "xlm-r-notrain"
df_baseline_notrain = pd.DataFrame(baseline_no_train).T.add_prefix(f"{df_prefix}_")
df_baseline_notrain.T

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
xlm-r-notrain_dev,-0.053004,-0.111146,-0.285475,-0.120565,0.223809,-0.091913,-0.181296,-0.151347,-0.166747
xlm-r-notrain_test,0.124478,-0.01099,-0.281222,-0.042338,,-0.007503,0.01929,-0.102428,-0.146766


In [7]:
# Drop the notebook's reference to the untrained encoder; the remaining cells
# work with the `encoder` returned by train_encoder instead.
del model

# Train on all data

In [8]:
import warnings
from data_prep import generate_data

# NOTE(review): this silences every warning for the rest of the kernel session,
# not only those raised by generate_data() — confirm that is intended.
warnings.filterwarnings('ignore')
# Presumably (re)creates the CSV files read below — see the "Created ..." output.
generate_data()

df_train = pd.read_csv("data/train.csv")
df_eval = pd.read_csv("data/eval.csv")
df_test = pd.read_csv("data/test.csv")

# Preview the first rows of the training split.
df_train.head()

Created 15123 train, 1390 eval and 5800 test samples.
Created 12098 holdout train and 3025 holdout test samples.


Unnamed: 0,PairID,Score,s1,s2,language
0,Pair_ID_amh_train_1,0.88,መግለጫውን የተከታተለው የአዲስ አበባው ዘጋቢያችን ሰሎሞን ሙጬ ዝርዝር ዘ...,በስፍራው ተገኝቶ የተከታተለው የአዲስ አበባው ዘጋቢያችን ሰሎሞን ሙጬ ያጠ...,amh
1,Pair_ID_amh_train_2,0.25,የኛ ዴሞክራሲ የእንግሊዝ ስርዓት ነው ፤ ለጊዜው ሊያገለግል ይችላል ።,( ሀ) “በሕጉ ዙሪያ ያለው አጥር ” ከንቱ ሆኖ የቀረው ለምንድን ነው ?,amh
2,Pair_ID_amh_train_3,0.36,በዛሬው ጊዜ ከፍቅራዊ ደግነቱ መጠቀም የምንችለውስ እንዴት ነው ?,ጥንቃቄ ማድረግ የምንችለውስ እንዴት ነው ?,amh
3,Pair_ID_amh_train_4,0.27,ግን ምን አይነት ቁልፍ ?,ምን አይነት ፍቅር ነዉ ?,amh
4,Pair_ID_amh_train_5,0.56,ምን አይነት ተንኮለኛ ነው ?,አጭር ሰው ግን የምር ተንኮለኛ ነው ?,amh


In [None]:
# Train one PairEncoder on the full training split (data/train.csv), with an
# evaluator built from the eval split.
from pair_encoder import train_encoder

train_pairs = get_pairs(df_train)
eval_pairs = get_pairs(df_eval)
test_pairs = get_pairs(df_test)

evaluator = CorrelationEvaluator.load(eval_pairs)

# Hyperparameters of this run, gathered in one place.
train_all_kwargs = dict(
    model_name=model_name,
    upscaling_samples=None,
    similarity_model=None,
    batch_size=32,
    learning_rate=2e-5,
    max_grad_norm=1.0,
    epochs=5,
    eval_steps=0,
    max_length=200,
    k=0,
    weak_training_epochs=2,  # used if k > 0
    seed=42,
    save_to=None,
    verbose=True,
    device="cuda",
)

encoder, history = train_encoder(
    train_samples=train_pairs,
    evaluator=evaluator,
    timestamp=f"{model_name}-train-all",
    **train_all_kwargs,
)

In [10]:
# Evaluate the encoder trained on all data, one language at a time; the printed
# code shows which language is currently being processed.
baseline_train_all = {}
for lang in langs:
    print(lang)
    baseline_train_all[lang] = eval_lang(encoder, lang, save_name="baseline-train-all")

amh
arq
ary
eng
esp
hau
kin
mar
tel


In [11]:
# Same layout as the no-training table: languages as rows, run-tagged split
# names as columns; the transposed view is displayed.
df_prefix = "xlm-r-all"
df_baseline_all = pd.DataFrame(baseline_train_all).T.add_prefix(f"{df_prefix}_")
df_baseline_all.T

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
xlm-r-all_dev,0.865688,0.592243,0.839578,0.808798,0.712409,0.763927,0.598966,0.842387,0.803438
xlm-r-all_test,0.834218,0.598772,0.830448,0.836942,,0.707398,0.674758,0.859937,0.8575


# Train per language

In [None]:
baseline_train_lang = {}

# Settings shared by every per-language run; only the data and the timestamp
# change from one language to the next.
per_lang_kwargs = dict(
    model_name=model_name,
    upscaling_samples=None,
    similarity_model=None,
    batch_size=32,
    learning_rate=2e-5,
    max_grad_norm=1.0,
    epochs=5,
    eval_steps=0,
    max_length=200,
    k=0,
    weak_training_epochs=2,  # used if k > 0
    seed=42,
    save_to=None,
    verbose=True,
    device="cuda",
)

for lang in langs:
    print(f"Training on {lang}...")
    lang_train = get_data(lang=lang, train=True)
    lang_dev = get_data(lang=lang, train=False)

    train_pairs = get_pairs(lang_train)
    eval_pairs = get_pairs(lang_dev)
    evaluator = CorrelationEvaluator.load(eval_pairs)

    # `encoder` is rebound on every iteration, replacing the encoder trained
    # on all data in the earlier cell.
    encoder, _ = train_encoder(
        train_samples=train_pairs,
        evaluator=evaluator,
        timestamp=f"{model_name}-train-{lang}",
        **per_lang_kwargs,
    )

    baseline_train_lang[lang] = eval_lang(encoder, lang, save_name="baseline-train-lang")

In [13]:
# Per-language results in the same layout as the other runs: languages as rows,
# run-tagged split names as columns; the transposed view is displayed.
df_prefix = "xlm-r-lang"
df_baseline_lang = pd.DataFrame(baseline_train_lang).T.add_prefix(f"{df_prefix}_")
df_baseline_lang.T

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
xlm-r-lang_dev,0.833848,0.573243,0.826105,0.81045,0.650131,0.754098,0.48673,0.828575,0.805651
xlm-r-lang_test,0.818965,0.47665,0.822268,0.834596,,0.701724,0.567633,0.858362,0.807783


In [14]:
# Stack the three result tables: one row per run/split, one column per language.
final_df = pd.concat([df_baseline_notrain.T, df_baseline_all.T, df_baseline_lang.T], axis=0)
final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
xlm-r-notrain_dev,-0.053004,-0.111146,-0.285475,-0.120565,0.223809,-0.091913,-0.181296,-0.151347,-0.166747
xlm-r-notrain_test,0.124478,-0.01099,-0.281222,-0.042338,,-0.007503,0.01929,-0.102428,-0.146766
xlm-r-all_dev,0.865688,0.592243,0.839578,0.808798,0.712409,0.763927,0.598966,0.842387,0.803438
xlm-r-all_test,0.834218,0.598772,0.830448,0.836942,,0.707398,0.674758,0.859937,0.8575
xlm-r-lang_dev,0.833848,0.573243,0.826105,0.81045,0.650131,0.754098,0.48673,0.828575,0.805651
xlm-r-lang_test,0.818965,0.47665,0.822268,0.834596,,0.701724,0.567633,0.858362,0.807783


In [15]:
# The locally computed esp test column is empty in the tables above, so the
# esp test scores are filled in by hand from the CodaLab results.
esp_test_scores = {
    "xlm-r-notrain_test": 0.0173,
    "xlm-r-all_test": 0.7139,
    "xlm-r-lang_test": 0.6973,
}
for row_label, score in esp_test_scores.items():
    final_df.loc[row_label, "esp"] = score

final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
xlm-r-notrain_dev,-0.053004,-0.111146,-0.285475,-0.120565,0.223809,-0.091913,-0.181296,-0.151347,-0.166747
xlm-r-notrain_test,0.124478,-0.01099,-0.281222,-0.042338,0.0173,-0.007503,0.01929,-0.102428,-0.146766
xlm-r-all_dev,0.865688,0.592243,0.839578,0.808798,0.712409,0.763927,0.598966,0.842387,0.803438
xlm-r-all_test,0.834218,0.598772,0.830448,0.836942,0.7139,0.707398,0.674758,0.859937,0.8575
xlm-r-lang_dev,0.833848,0.573243,0.826105,0.81045,0.650131,0.754098,0.48673,0.828575,0.805651
xlm-r-lang_test,0.818965,0.47665,0.822268,0.834596,0.6973,0.701724,0.567633,0.858362,0.807783


In [16]:
# Express the correlations as percentages with two decimals.
# DataFrame.round is vectorised and replaces the element-wise
# applymap(lambda x: round(x, 2)) call, which newer pandas releases deprecate.
# NOTE: this cell rebinds final_df to the scaled values, so re-running it
# multiplies by 100 again — run it once per kernel session.
final_df = (final_df * 100).round(2)
final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
xlm-r-notrain_dev,-5.3,-11.11,-28.55,-12.06,22.38,-9.19,-18.13,-15.13,-16.67
xlm-r-notrain_test,12.45,-1.1,-28.12,-4.23,1.73,-0.75,1.93,-10.24,-14.68
xlm-r-all_dev,86.57,59.22,83.96,80.88,71.24,76.39,59.9,84.24,80.34
xlm-r-all_test,83.42,59.88,83.04,83.69,71.39,70.74,67.48,85.99,85.75
xlm-r-lang_dev,83.38,57.32,82.61,81.05,65.01,75.41,48.67,82.86,80.57
xlm-r-lang_test,81.9,47.66,82.23,83.46,69.73,70.17,56.76,85.84,80.78


In [17]:
# Column (language) order used for the exported tables.
columns = ["arq", "amh", "eng", "hau", "kin", "mar", "ary", "esp", "tel"]

final_df = final_df.loc[:, columns]
final_df

Unnamed: 0,arq,amh,eng,hau,kin,mar,ary,esp,tel
xlm-r-notrain_dev,-11.11,-5.3,-12.06,-9.19,-18.13,-15.13,-28.55,22.38,-16.67
xlm-r-notrain_test,-1.1,12.45,-4.23,-0.75,1.93,-10.24,-28.12,1.73,-14.68
xlm-r-all_dev,59.22,86.57,80.88,76.39,59.9,84.24,83.96,71.24,80.34
xlm-r-all_test,59.88,83.42,83.69,70.74,67.48,85.99,83.04,71.39,85.75
xlm-r-lang_dev,57.32,83.38,81.05,75.41,48.67,82.86,82.61,65.01,80.57
xlm-r-lang_test,47.66,81.9,83.46,70.17,56.76,85.84,82.23,69.73,80.78


In [18]:
# Short row labels for the exported tables; they are assigned positionally, so
# they rely on the rows being ordered notrain, all, lang.
index = "base all lang".split()

# Pick rows by the split name contained in their label.
dev_rows = final_df.index.str.contains("dev")
test_rows = final_df.index.str.contains("test")

dev_df = final_df[dev_rows]
test_df = final_df[test_rows]

dev_df.index = index
test_df.index = index

In [19]:
# Emit the dev-split table as LaTeX source, formatting every value with two decimals.
print(dev_df.to_latex(float_format="%.2f"))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & arq & amh & eng & hau & kin & mar & ary & esp & tel \\
\midrule
base & -11.11 & -5.30 & -12.06 & -9.19 & -18.13 & -15.13 & -28.55 & 22.38 & -16.67 \\
all & 59.22 & 86.57 & 80.88 & 76.39 & 59.90 & 84.24 & 83.96 & 71.24 & 80.34 \\
lang & 57.32 & 83.38 & 81.05 & 75.41 & 48.67 & 82.86 & 82.61 & 65.01 & 80.57 \\
\bottomrule
\end{tabular}



In [20]:
# Emit the test-split table as LaTeX source, formatting every value with two decimals.
print(test_df.to_latex(float_format="%.2f"))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & arq & amh & eng & hau & kin & mar & ary & esp & tel \\
\midrule
base & -1.10 & 12.45 & -4.23 & -0.75 & 1.93 & -10.24 & -28.12 & 1.73 & -14.68 \\
all & 59.88 & 83.42 & 83.69 & 70.74 & 67.48 & 85.99 & 83.04 & 71.39 & 85.75 \\
lang & 47.66 & 81.90 & 83.46 & 70.17 & 56.76 & 85.84 & 82.23 & 69.73 & 80.78 \\
\bottomrule
\end{tabular}

