# Initial baseline (no training)

In [None]:
from util import get_langs
# Name of the pretrained checkpoint; later cells pass it to SentenceTransformer
# and to train_on_df as the starting model.
model_name = "intfloat/e5-base-v2"
# Languages to process; eval_all and the per-language training loop iterate over this.
langs = get_langs()
langs

In [3]:
from sentence_transformers import SentenceTransformer
import pandas as pd
from util import get_data, do_evaluation, get_spearman

def eval_lang(model, lang, submission_folder):
    """Evaluate ``model`` on the dev and test splits of a single language.

    Args:
        model: Model object forwarded unchanged to ``do_evaluation``.
        lang: Language code passed to ``get_data`` and ``do_evaluation``.
        submission_folder: Prefix of the ``timestamp`` argument given to
            ``do_evaluation``; the split name is appended as
            ``"{submission_folder}-{split}"``.

    Returns:
        dict: Maps ``"dev"`` and ``"test"`` to the value returned by
        ``get_spearman`` for that split. For ``lang == "esp"`` the ``"test"``
        entry is the placeholder string ``"-"``: ``do_evaluation`` is still
        called for it, but no correlation is computed.

    Note:
        Reads the notebook-level ``model_name`` global and forwards it to
        ``do_evaluation``.
    """
    splits = {
        "dev": get_data(lang=lang, train=False, prefix="query:"),
        "test": get_data(lang=lang, test=True, prefix="query:"),
    }
    correlations = {}
    for split, split_df in splits.items():
        # do_evaluation runs for every split (with submit=True only for
        # "test"), so it is called once here instead of once per branch.
        preds = do_evaluation(
            model=model,
            lang=lang,
            df=split_df,
            submit=split == "test",
            timestamp=f"{submission_folder}-{split}",
            model_name=model_name,
        )
        if lang == "esp" and split == "test":
            # No local score for the esp test split; a later cell fills this
            # slot in by hand with the value reported by CodaLab.
            correlations[split] = "-"
            continue
        correlations[split] = get_spearman(
            gold_scores=split_df.Score.values,
            pred_scores=preds,
        )
    return correlations
        
def eval_all(model, df_prefix="baseline"):
    """Run ``eval_lang`` for every entry of ``langs`` and tabulate the results.

    Args:
        model: Model object handed to ``eval_lang`` for each language.
        df_prefix: Used both as ``submission_folder`` for ``eval_lang`` and as
            the prefix of the row labels in the returned table.

    Returns:
        pandas.DataFrame: One column per language and one row per split,
        labelled ``"{df_prefix}_{split}"``.

    Note:
        Iterates over the notebook-level ``langs`` global.
    """
    per_lang = {
        code: eval_lang(model, code, submission_folder=df_prefix)
        for code in langs
    }
    # Languages become rows so the split columns can be prefixed, then the
    # table is flipped back to "splits as rows, languages as columns".
    by_language = pd.DataFrame(per_lang).T
    return by_language.add_prefix(f"{df_prefix}_").T
        
# Score the untrained checkpoint on every language; df_baseline is reused
# later when the comparison table is assembled.
print("Evaluating baseline model (no training) for all languages...")
model = SentenceTransformer(model_name)
df_baseline = eval_all(model)
df_baseline

Evaluating baseline model (no training) for all languages...


Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
baseline_dev,0.072551,0.406675,0.536452,0.8242,0.634023,0.362391,0.328688,0.579424,0.352853
baseline_test,0.090198,0.439407,0.154146,0.826929,-,0.407946,0.482348,0.527602,0.286945


# Train on all data

In [4]:
import warnings
import sys
sys.path.append("..")
from src.data.create import generate_data

# NOTE(review): this silences *every* warning for the rest of the kernel
# session, including deprecation warnings from pandas — consider narrowing it
# to a specific category/module.
warnings.filterwarnings('ignore')
# Project helper; presumably (re)writes the CSV files read below — confirm in
# src/data/create.py.
generate_data()

# Paths are relative to the notebook's working directory.
df_train = pd.read_csv("data/train.csv")
df_eval = pd.read_csv("data/eval.csv")
# NOTE(review): df_test is not referenced by any cell visible in this notebook.
df_test = pd.read_csv("data/test.csv")

df_train.head()

Created 15123 train, 1390 eval and 5800 test samples.
Created 12098 holdout train and 3025 holdout test samples.


Unnamed: 0,PairID,Score,s1,s2,language
0,Pair_ID_amh_train_1,0.88,መግለጫውን የተከታተለው የአዲስ አበባው ዘጋቢያችን ሰሎሞን ሙጬ ዝርዝር ዘ...,በስፍራው ተገኝቶ የተከታተለው የአዲስ አበባው ዘጋቢያችን ሰሎሞን ሙጬ ያጠ...,amh
1,Pair_ID_amh_train_2,0.25,የኛ ዴሞክራሲ የእንግሊዝ ስርዓት ነው ፤ ለጊዜው ሊያገለግል ይችላል ።,( ሀ) “በሕጉ ዙሪያ ያለው አጥር ” ከንቱ ሆኖ የቀረው ለምንድን ነው ?,amh
2,Pair_ID_amh_train_3,0.36,በዛሬው ጊዜ ከፍቅራዊ ደግነቱ መጠቀም የምንችለውስ እንዴት ነው ?,ጥንቃቄ ማድረግ የምንችለውስ እንዴት ነው ?,amh
3,Pair_ID_amh_train_4,0.27,ግን ምን አይነት ቁልፍ ?,ምን አይነት ፍቅር ነዉ ?,amh
4,Pair_ID_amh_train_5,0.56,ምን አይነት ተንኮለኛ ነው ?,አጭር ሰው ግን የምር ተንኮለኛ ነው ?,amh


In [5]:
from torch.optim import AdamW

# Training settings handed to train_on_df as `config`; the same dict is reused
# by the all-language run and by the per-language loop further down.
config = {
    "epochs": 5,
    "scheduler": "WarmupLinear",
    "optimizer_class": AdamW,
    "optimizer_params": {"lr": 2e-5},
    "weight_decay": 0.01,
    "save_best_model": True,
    "max_grad_norm": 1,
    "use_amp": False,
    "show_progress_bar": True,
    "checkpoint_save_total_limit": 1,
}


In [9]:
from sentence_transformer_train import train_on_df, evaluator_from_df

# Fine-tune the base checkpoint on the combined training set, with an
# evaluator built from df_eval.
# NOTE(review): model_name="base-english" looks like the save name — a later
# cell loads "trained-models/trained-bi-encoders/base-english"; confirm in
# train_on_df. The name is misleading given the data covers several languages.
evaluator = evaluator_from_df(df_eval)
trained_model = train_on_df(
    model=model_name, df=df_train, evaluator=evaluator, batch_size=16, config=config, model_name="base-english"
)

Epoch:   0%|          | 0/5 [00:00<?, ?it/s]

Iteration:   0%|          | 0/946 [00:00<?, ?it/s]

Epoch: 0 | Steps: -1 | Score: 0.5059395077859683


Iteration:   0%|          | 0/946 [00:00<?, ?it/s]

Epoch: 1 | Steps: -1 | Score: 0.5320090663995718


Iteration:   0%|          | 0/946 [00:00<?, ?it/s]

Epoch: 2 | Steps: -1 | Score: 0.5515127126751362


Iteration:   0%|          | 0/946 [00:00<?, ?it/s]

Epoch: 3 | Steps: -1 | Score: 0.5514603711288542


Iteration:   0%|          | 0/946 [00:00<?, ?it/s]

Epoch: 4 | Steps: -1 | Score: 0.5603993516804403


In [11]:
from sentence_transformers import SentenceTransformer

# Drop the in-memory model returned by training before loading the saved copy
# from disk. `globals().pop(..., None)` is used instead of
# `if trained_model: del trained_model`, which raised NameError whenever this
# cell was re-run (the name is already gone) or run without the training cell.
globals().pop("trained_model", None)
model = SentenceTransformer("trained-models/trained-bi-encoders/base-english")
trained_all_df = eval_all(model, df_prefix="trained_all")
trained_all_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
trained_all_dev,0.092182,0.589569,0.755919,0.818172,0.646974,0.665888,0.601034,0.660979,0.389768
trained_all_test,0.144815,0.593162,0.772974,0.828843,-,0.618736,0.681478,0.695928,0.4364


In [None]:
# Fine-tune one model per language, starting from the base checkpoint each
# time, and evaluate it on that language only.
all_lang_scores = {}

for lang in langs:
    print(lang)
    # NOTE(review): these loads omit prefix="query:" while eval_lang passes it
    # to get_data — confirm train_on_df / evaluator_from_df add the prefix
    # themselves, otherwise training and evaluation inputs differ.
    lang_train = get_data(lang=lang, train=True)
    lang_dev = get_data(lang=lang, train=False)
    
    # Rebinds the notebook-level `model` on every iteration.
    model = train_on_df(
        model=model_name,
        df=lang_train,
        config=config,
        batch_size=16,
        model_name=lang,
        evaluator=evaluator_from_df(lang_dev),
    )
    lang_scores = eval_lang(model, lang, submission_folder="trained_lang")
    all_lang_scores[lang] = lang_scores
    

In [13]:
# Per-language results with languages as rows and "trained_lang_<split>" columns.
all_lang_df = pd.DataFrame(all_lang_scores).T.add_prefix("trained_lang_")

# Stack the three experiments row-wise; the per-language table is transposed
# so that, like the other two, its rows are splits and its columns languages.
final_df = pd.concat([df_baseline, trained_all_df, all_lang_df.T], axis=0)
final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
baseline_dev,0.072551,0.406675,0.536452,0.8242,0.634023,0.362391,0.328688,0.579424,0.352853
baseline_test,0.090198,0.439407,0.154146,0.826929,-,0.407946,0.482348,0.527602,0.286945
trained_all_dev,0.092182,0.589569,0.755919,0.818172,0.646974,0.665888,0.601034,0.660979,0.389768
trained_all_test,0.144815,0.593162,0.772974,0.828843,-,0.618736,0.681478,0.695928,0.4364
trained_lang_dev,0.051908,0.4489,0.547476,0.828768,0.68562,0.665392,0.565306,0.671489,0.349526
trained_lang_test,0.136983,0.552955,0.361079,0.835366,-,0.636332,0.63596,0.678755,0.342087


In [14]:
# eval_lang stores the placeholder "-" for the esp test split, so the three
# esp test scores are filled in by hand with the values obtained from CodaLab.
# These numbers are not reproduced by re-running the notebook.
final_df.loc["baseline_test", "esp"] = 0.5875
final_df.loc["trained_all_test", "esp"] = 0.6916
final_df.loc["trained_lang_test", "esp"] = 0.6902

final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
baseline_dev,0.072551,0.406675,0.536452,0.8242,0.634023,0.362391,0.328688,0.579424,0.352853
baseline_test,0.090198,0.439407,0.154146,0.826929,0.5875,0.407946,0.482348,0.527602,0.286945
trained_all_dev,0.092182,0.589569,0.755919,0.818172,0.646974,0.665888,0.601034,0.660979,0.389768
trained_all_test,0.144815,0.593162,0.772974,0.828843,0.6916,0.618736,0.681478,0.695928,0.4364
trained_lang_dev,0.051908,0.4489,0.547476,0.828768,0.68562,0.665392,0.565306,0.671489,0.349526
trained_lang_test,0.136983,0.552955,0.361079,0.835366,0.6902,0.636332,0.63596,0.678755,0.342087


In [15]:
# Express the correlations as percentages rounded to two decimals.
# The frame is cast to float first: it may still carry object dtype from the
# "-" placeholders that were overwritten earlier, and DataFrame.round leaves
# non-numeric columns untouched. The vectorised round replaces the
# element-wise applymap(lambda ...), which is deprecated in recent pandas.
# NOTE: not idempotent — re-running this cell multiplies by 100 again.
final_df = (final_df.astype(float) * 100).round(2)
final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
baseline_dev,7.26,40.67,53.65,82.42,63.4,36.24,32.87,57.94,35.29
baseline_test,9.02,43.94,15.41,82.69,58.75,40.79,48.23,52.76,28.69
trained_all_dev,9.22,58.96,75.59,81.82,64.7,66.59,60.1,66.1,38.98
trained_all_test,14.48,59.32,77.3,82.88,69.16,61.87,68.15,69.59,43.64
trained_lang_dev,5.19,44.89,54.75,82.88,68.56,66.54,56.53,67.15,34.95
trained_lang_test,13.7,55.3,36.11,83.54,69.02,63.63,63.6,67.88,34.21


In [16]:
# Reorder the language columns from the alphabetical order
#   amh, arq, ary, eng, esp, hau, kin, mar, tel
# to the order used in the LaTeX tables
#   arq, amh, eng, hau, kin, mar, ary, esp, tel
columns = [
    "arq", "amh", "eng", "hau", "kin", "mar", "ary", "esp", "tel"
]

final_df = final_df[columns]
final_df

Unnamed: 0,arq,amh,eng,hau,kin,mar,ary,esp,tel
baseline_dev,40.67,7.26,82.42,36.24,32.87,57.94,53.65,63.4,35.29
baseline_test,43.94,9.02,82.69,40.79,48.23,52.76,15.41,58.75,28.69
trained_all_dev,58.96,9.22,81.82,66.59,60.1,66.1,75.59,64.7,38.98
trained_all_test,59.32,14.48,82.88,61.87,68.15,69.59,77.3,69.16,43.64
trained_lang_dev,44.89,5.19,82.88,66.54,56.53,67.15,54.75,68.56,34.95
trained_lang_test,55.3,13.7,83.54,63.63,63.6,67.88,36.11,69.02,34.21


In [17]:
# Separate the dev rows from the test rows by the split name in each row label,
# then give both tables the same short row labels. The relabelling is
# positional: it relies on the row order baseline, trained_all, trained_lang.
index = "base all lang".split()
dev_df = final_df[final_df.index.str.contains("dev")].set_axis(index, axis=0)
test_df = final_df[final_df.index.str.contains("test")].set_axis(index, axis=0)

In [18]:
# Emit the dev-split table as LaTeX source, two decimals per value.
print(dev_df.to_latex(float_format="%.2f"))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & arq & amh & eng & hau & kin & mar & ary & esp & tel \\
\midrule
base & 40.67 & 7.26 & 82.42 & 36.24 & 32.87 & 57.94 & 53.65 & 63.40 & 35.29 \\
all & 58.96 & 9.22 & 81.82 & 66.59 & 60.10 & 66.10 & 75.59 & 64.70 & 38.98 \\
lang & 44.89 & 5.19 & 82.88 & 66.54 & 56.53 & 67.15 & 54.75 & 68.56 & 34.95 \\
\bottomrule
\end{tabular}



In [19]:
# Emit the test-split table as LaTeX source, two decimals per value.
print(test_df.to_latex(float_format="%.2f"))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & arq & amh & eng & hau & kin & mar & ary & esp & tel \\
\midrule
base & 43.94 & 9.02 & 82.69 & 40.79 & 48.23 & 52.76 & 15.41 & 58.75 & 28.69 \\
all & 59.32 & 14.48 & 82.88 & 61.87 & 68.15 & 69.59 & 77.30 & 69.16 & 43.64 \\
lang & 55.30 & 13.70 & 83.54 & 63.63 & 63.60 & 67.88 & 36.11 & 69.02 & 34.21 \\
\bottomrule
\end{tabular}

