In [1]:
%load_ext autoreload
%autoreload 2

# Initial baseline (no training)

In [2]:
from util import get_langs
model_name = "intfloat/multilingual-e5-base"
langs = get_langs()
langs

  from tqdm.autonotebook import tqdm, trange


['amh', 'arq', 'ary', 'eng', 'esp', 'hau', 'kin', 'mar', 'tel']

In [13]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from util import get_data, do_evaluation, get_spearman

def eval_lang(model, lang, submission_folder):
    data = {
        "dev": get_data(lang=lang, train=False, prefix="query:"),
        "test": get_data(lang=lang, test=True, prefix="query:")
    }
    correlations = {}
    for split, lang_df in data.items():
        if lang == "esp" and split == "test":
            do_evaluation(
                model=model,
                lang=lang,
                df=lang_df,
                submit=split == "test",
                timestamp=f"{submission_folder}-{split}",
                model_name=model_name
            )
            correlations[split] = "-"
            continue
        preds = do_evaluation(
            model=model,
            lang=lang,
            df=lang_df,
            submit=split == "test",
            timestamp=f"{submission_folder}-{split}",
            model_name=model_name
        )
        spearmanc = get_spearman(
            gold_scores=lang_df.Score.values,
            pred_scores=preds
        )
        correlations[split] = spearmanc
    return correlations
        
def eval_all(model, df_prefix="baseline"):
    scores = {}
    for lang in langs:
        scores[lang] = eval_lang(model, lang, submission_folder=df_prefix)
        
    _df = pd.DataFrame(scores).T
    _df.columns = [f"{df_prefix}_{c}" for c in _df.columns]
    return _df.T
        
print("Evaluating baseline model (no training) for all languages...")
model = SentenceTransformer(model_name)
df_baseline = eval_all(model)
df_baseline

Evaluating baseline model (no training) for all languages...
Evaluating amh on dev split...
Spearman correlation for amh on dev split: 0.618234541251654
Evaluating amh on test split...
Spearman correlation for amh on test split: 0.7256326276475369
Evaluating arq on dev split...
Spearman correlation for arq on dev split: 0.396957017567906
Evaluating arq on test split...
Spearman correlation for arq on test split: 0.45324679732323686
Evaluating ary on dev split...
Spearman correlation for ary on dev split: 0.442104837913758
Evaluating ary on test split...
Spearman correlation for ary on test split: 0.40143450894729094
Evaluating eng on dev split...
Spearman correlation for eng on dev split: 0.7830662981686548
Evaluating eng on test split...
Spearman correlation for eng on test split: 0.8039033607994176
Evaluating esp on dev split...
Spearman correlation for esp on dev split: 0.6263002622450242
Evaluating esp on test split...
Evaluating hau on dev split...
Spearman correlation for hau on 

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
baseline_dev,0.618235,0.396957,0.442105,0.783066,0.6263,0.45009,0.277994,0.724796,0.773459
baseline_test,0.725633,0.453247,0.401435,0.803903,-,0.512341,0.513835,0.773696,0.774269


# Train on all data

In [14]:
import warnings
import sys
sys.path.append("..")
from src.data.create import generate_data

warnings.filterwarnings('ignore')
generate_data()

df_train = pd.read_csv("data/train.csv")
df_eval = pd.read_csv("data/eval.csv")
df_test = pd.read_csv("data/test.csv")

df_train.head()

Created 15123 train, 1390 eval and 5800 test samples.
Created 12098 holdout train and 3025 holdout test samples.


Unnamed: 0,PairID,Score,s1,s2,language
0,Pair_ID_amh_train_1,0.88,መግለጫውን የተከታተለው የአዲስ አበባው ዘጋቢያችን ሰሎሞን ሙጬ ዝርዝር ዘ...,በስፍራው ተገኝቶ የተከታተለው የአዲስ አበባው ዘጋቢያችን ሰሎሞን ሙጬ ያጠ...,amh
1,Pair_ID_amh_train_2,0.25,የኛ ዴሞክራሲ የእንግሊዝ ስርዓት ነው ፤ ለጊዜው ሊያገለግል ይችላል ።,( ሀ) “በሕጉ ዙሪያ ያለው አጥር ” ከንቱ ሆኖ የቀረው ለምንድን ነው ?,amh
2,Pair_ID_amh_train_3,0.36,በዛሬው ጊዜ ከፍቅራዊ ደግነቱ መጠቀም የምንችለውስ እንዴት ነው ?,ጥንቃቄ ማድረግ የምንችለውስ እንዴት ነው ?,amh
3,Pair_ID_amh_train_4,0.27,ግን ምን አይነት ቁልፍ ?,ምን አይነት ፍቅር ነዉ ?,amh
4,Pair_ID_amh_train_5,0.56,ምን አይነት ተንኮለኛ ነው ?,አጭር ሰው ግን የምር ተንኮለኛ ነው ?,amh


In [15]:
from torch.optim import AdamW

config = {
    "epochs": 5,
    "scheduler": "WarmupLinear",
    "optimizer_class": AdamW,
    "optimizer_params": {"lr": 2e-5},
    "weight_decay": 0.01,
    "save_best_model": True,
    "max_grad_norm": 1,
    "use_amp": False,
    "show_progress_bar": True,
    "checkpoint_save_total_limit": 1,
}


In [16]:
from sentence_transformer_train import train_on_df, evaluator_from_df

# evaluator = evaluator_from_df(df_eval)
# trained_model = train_on_df(
#     model=model_name, df=df_train, evaluator=evaluator, batch_size=16, config=config
# )

In [17]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("trained-models/trained-bi-encoder")
trained_all_df = eval_all(model, df_prefix="trained_all")
trained_all_df

Evaluating amh on dev split...
Spearman correlation for amh on dev split: 0.7847016894362107
Evaluating amh on test split...
Spearman correlation for amh on test split: 0.8206199084055144
Evaluating arq on dev split...
Spearman correlation for arq on dev split: 0.5426262953960982
Evaluating arq on test split...
Spearman correlation for arq on test split: 0.5928168840828099
Evaluating ary on dev split...
Spearman correlation for ary on dev split: 0.7883691189121346
Evaluating ary on test split...
Spearman correlation for ary on test split: 0.7826506274700165
Evaluating eng on dev split...
Spearman correlation for eng on dev split: 0.8143942356957292
Evaluating eng on test split...
Spearman correlation for eng on test split: 0.8353184072337156
Evaluating esp on dev split...
Spearman correlation for esp on dev split: 0.6854410828754922
Evaluating esp on test split...
Evaluating hau on dev split...
Spearman correlation for hau on dev split: 0.7327382021915627
Evaluating hau on test split..

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
trained_all_dev,0.784702,0.542626,0.788369,0.814394,0.685441,0.732738,0.625477,0.815611,0.822729
trained_all_test,0.82062,0.592817,0.782651,0.835318,-,0.683583,0.716118,0.872732,0.832485


In [None]:
all_lang_scores = {}

for lang in langs:
    print(lang)
    lang_train = get_data(lang=lang, train=True)
    lang_dev = get_data(lang=lang, train=False)
    
    model = train_on_df(
        model=model_name,
        df=lang_train,
        config=config,
        batch_size=16,
        model_name=lang,
        evaluator=evaluator_from_df(lang_dev),
    )
    lang_scores = eval_lang(model, lang, submission_folder="trained_lang")
    all_lang_scores[lang] = lang_scores
    

In [46]:
all_lang_df = pd.DataFrame(all_lang_scores).T
all_lang_df.columns = [f"trained_lang_{c}" for c in all_lang_df.columns]

final_df = pd.concat([df_baseline, trained_all_df, all_lang_df.T], axis=0)
final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
baseline_dev,0.618235,0.396957,0.442105,0.783066,0.6263,0.45009,0.277994,0.724796,0.773459
baseline_test,0.725633,0.453247,0.401435,0.803903,-,0.512341,0.513835,0.773696,0.774269
trained_all_dev,0.784702,0.542626,0.788369,0.814394,0.685441,0.732738,0.625477,0.815611,0.822729
trained_all_test,0.82062,0.592817,0.782651,0.835318,-,0.683583,0.716118,0.872732,0.832485
trained_lang_dev,0.777855,0.597132,0.737882,0.820961,0.631627,0.729118,0.656271,0.805556,0.797541
trained_lang_test,0.814582,0.606782,0.777503,0.835542,-,0.699717,0.718709,0.879054,0.822405


In [47]:
# update esp values from codalab
final_df.loc["baseline_test", "esp"] = 0.5875
final_df.loc["trained_all_test", "esp"] = 0.6916
final_df.loc["trained_lang_test", "esp"] = 0.6902

final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
baseline_dev,0.618235,0.396957,0.442105,0.783066,0.6263,0.45009,0.277994,0.724796,0.773459
baseline_test,0.725633,0.453247,0.401435,0.803903,0.5875,0.512341,0.513835,0.773696,0.774269
trained_all_dev,0.784702,0.542626,0.788369,0.814394,0.685441,0.732738,0.625477,0.815611,0.822729
trained_all_test,0.82062,0.592817,0.782651,0.835318,0.6916,0.683583,0.716118,0.872732,0.832485
trained_lang_dev,0.777855,0.597132,0.737882,0.820961,0.631627,0.729118,0.656271,0.805556,0.797541
trained_lang_test,0.814582,0.606782,0.777503,0.835542,0.6902,0.699717,0.718709,0.879054,0.822405


In [48]:
final_df = final_df * 100
final_df = final_df.applymap(lambda x: round(x, 2))
final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
baseline_dev,61.82,39.7,44.21,78.31,62.63,45.01,27.8,72.48,77.35
baseline_test,72.56,45.32,40.14,80.39,58.75,51.23,51.38,77.37,77.43
trained_all_dev,78.47,54.26,78.84,81.44,68.54,73.27,62.55,81.56,82.27
trained_all_test,82.06,59.28,78.27,83.53,69.16,68.36,71.61,87.27,83.25
trained_lang_dev,77.79,59.71,73.79,82.1,63.16,72.91,65.63,80.56,79.75
trained_lang_test,81.46,60.68,77.75,83.55,69.02,69.97,71.87,87.91,82.24


In [52]:
# change order from
# amh	arq	ary	eng	esp	hau	kin	mar	tel
# to 
# arq   & amh   & eng   & hau   & kin   & mar   & ary   & esp   & tel
columns = [
    "arq", "amh", "eng", "hau", "kin", "mar", "ary", "esp", "tel"
]

final_df = final_df[columns]
final_df

Unnamed: 0,arq,amh,eng,hau,kin,mar,ary,esp,tel
baseline_dev,39.7,61.82,78.31,45.01,27.8,72.48,44.21,62.63,77.35
baseline_test,45.32,72.56,80.39,51.23,51.38,77.37,40.14,58.75,77.43
trained_all_dev,54.26,78.47,81.44,73.27,62.55,81.56,78.84,68.54,82.27
trained_all_test,59.28,82.06,83.53,68.36,71.61,87.27,78.27,69.16,83.25
trained_lang_dev,59.71,77.79,82.1,72.91,65.63,80.56,73.79,63.16,79.75
trained_lang_test,60.68,81.46,83.55,69.97,71.87,87.91,77.75,69.02,82.24


In [53]:
# split the df into dev and test, based on the last prefix
index = "base all lang".split()
dev_df = final_df[final_df.index.str.contains("dev")]
dev_df.index = index

test_df = final_df[final_df.index.str.contains("test")]
test_df.index = index

In [54]:
print(dev_df.to_latex(float_format="%.2f"))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & arq & amh & eng & hau & kin & mar & ary & esp & tel \\
\midrule
base & 39.70 & 61.82 & 78.31 & 45.01 & 27.80 & 72.48 & 44.21 & 62.63 & 77.35 \\
all & 54.26 & 78.47 & 81.44 & 73.27 & 62.55 & 81.56 & 78.84 & 68.54 & 82.27 \\
lang & 59.71 & 77.79 & 82.10 & 72.91 & 65.63 & 80.56 & 73.79 & 63.16 & 79.75 \\
\bottomrule
\end{tabular}



In [55]:
print(test_df.to_latex(float_format="%.2f"))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & arq & amh & eng & hau & kin & mar & ary & esp & tel \\
\midrule
base & 45.32 & 72.56 & 80.39 & 51.23 & 51.38 & 77.37 & 40.14 & 58.75 & 77.43 \\
all & 59.28 & 82.06 & 83.53 & 68.36 & 71.61 & 87.27 & 78.27 & 69.16 & 83.25 \\
lang & 60.68 & 81.46 & 83.55 & 69.97 & 71.87 & 87.91 & 77.75 & 69.02 & 82.24 \\
\bottomrule
\end{tabular}



In [66]:
s = "trained_all_test"
# last index of _
split = s[s.rfind("_")+1:]


'test'

In [65]:
index = final_df.index.tolist()



['baseline_dev',
 'baseline_test',
 'trained_all_dev',
 'trained_all_test',
 'trained_lang_dev',
 'trained_lang_test']

In [59]:
# fix index for latex, e.g. trained_all_dev -> trained-all_{dev}

final_df.index = final_df.index.str.replace("_", "-")
# add { } around the last part of the index (trained-all-dev -> trained-all_{dev})
final_df.index = final_df.index.str.replace("trained-all-(.*)", r"trained-all_XD{\1}")
final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
baseline-dev,0.618235,0.396957,0.442105,0.783066,0.6263,0.45009,0.277994,0.724796,0.773459
baseline-test,0.725633,0.453247,0.401435,0.803903,0.6263,0.512341,0.513835,0.773696,0.774269
trained-all-dev,0.784702,0.542626,0.788369,0.814394,0.685441,0.732738,0.625477,0.815611,0.822729
trained-all-test,0.82062,0.592817,0.782651,0.835318,0.685441,0.683583,0.716118,0.872732,0.832485
trained-lang-dev,0.773192,0.595116,0.749828,0.819522,0.638906,0.72921,0.663715,0.810774,0.814084
trained-lang-test,0.810138,0.609037,0.773082,0.836387,0.638906,0.703913,0.715451,0.874836,0.820734


In [52]:
# as latex
print(final_df.to_latex())

\begin{tabular}{lrrrrrrrrr}
\toprule
 & amh & arq & ary & eng & esp & hau & kin & mar & tel \\
\midrule
baseline_dev & 0.618235 & 0.396957 & 0.442105 & 0.783066 & 0.626300 & 0.450090 & 0.277994 & 0.724796 & 0.773459 \\
baseline_test & 0.725633 & 0.453247 & 0.401435 & 0.803903 & 0.626300 & 0.512341 & 0.513835 & 0.773696 & 0.774269 \\
trained_all_dev & 0.784702 & 0.542626 & 0.788369 & 0.814394 & 0.685441 & 0.732738 & 0.625477 & 0.815611 & 0.822729 \\
trained_all_test & 0.820620 & 0.592817 & 0.782651 & 0.835318 & 0.685441 & 0.683583 & 0.716118 & 0.872732 & 0.832485 \\
trained_lang_dev & 0.773192 & 0.595116 & 0.749828 & 0.819522 & 0.638906 & 0.729210 & 0.663715 & 0.810774 & 0.814084 \\
trained_lang_test & 0.810138 & 0.609037 & 0.773082 & 0.836387 & 0.638906 & 0.703913 & 0.715451 & 0.874836 & 0.820734 \\
\bottomrule
\end{tabular}

