In [1]:
%load_ext autoreload
%autoreload 2

# E5-base multilingual 

# Initial multilingual baseline (no training)

In [2]:
from util import get_data
# Checkpoint identifier passed to SentenceTransformer(...) and, inside eval_kfold,
# to train_on_df(model=...). It is read there as a notebook-level global.
model_name = "intfloat/multilingual-e5-base"
model_name

  from tqdm.autonotebook import tqdm, trange


'intfloat/multilingual-e5-base'

In [3]:
from util import get_langs
# Language codes iterated by every per-language loop in this notebook.
langs = get_langs()
langs

['amh', 'arq', 'ary', 'eng', 'esp', 'hau', 'kin', 'mar', 'tel']

In [4]:
from torch.optim import AdamW

# Training options handed to train_on_df(config=...) by eval_kfold when fit=True.
# NOTE(review): the key names match keyword arguments of SentenceTransformer.fit;
# presumably train_on_df forwards them there — confirm in sentence_transformer_train.
config = {
    "epochs": 2,
    "scheduler": "WarmupLinear",
    "optimizer_class": AdamW,
    "optimizer_params": {"lr": 2e-5},
    "weight_decay": 0.01,
    "save_best_model": True,
    "max_grad_norm": 1,
    "use_amp": False,
    "show_progress_bar": True,
    "checkpoint_save_total_limit": 0,
}

In [5]:
from util import do_evaluation, get_spearman

def eval_lang(model, df, lang):
    """Score `model` on `df` and return the value computed by `get_spearman`.

    Predictions come from `do_evaluation`, called with `submit=False`; they are
    compared against the gold values in the `Score` column of `df`.

    Args:
        model: Model forwarded to `do_evaluation`.
        df: Frame with a `Score` column holding the gold scores.
        lang: Language code forwarded to `do_evaluation`.

    Returns:
        Whatever `get_spearman` returns for the gold/predicted score pair.
    """
    predictions = do_evaluation(model=model, lang=lang, df=df, submit=False)
    gold = df.Score.values
    return get_spearman(gold_scores=gold, pred_scores=predictions)

In [6]:
from sklearn.model_selection import KFold
from sentence_transformers import SentenceTransformer
from sentence_transformer_train import train_on_df, evaluator_from_df


import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from util import get_data, get_pairs, eval_and_submit


def eval_kfold(
    model, train, lang=None, fit=False, nsplits=5, batch_size=16, random_state=42
):
    """Cross-validate an encoder on `train` and summarise the fold correlations.

    The rows of `train` are split with a shuffled `KFold`. For every fold the
    held-out rows are scored with `eval_lang`; when `fit` is true, a model is
    first trained on the remaining rows with `train_on_df`.

    Reads the notebook-level globals `model_name` and `config` when `fit` is true.

    Args:
        model: Encoder scored when `fit` is false. Ignored when `fit` is true,
            because every fold then trains its own model.
        train: DataFrame with the labelled rows to split.
        lang: Optional language code. When given and `train` has a "language"
            column, each held-out fold is restricted to the rows of that
            language (the training rows are never filtered). It is also
            forwarded to `eval_lang`.
        fit: Train a model per fold instead of scoring `model` as is.
        nsplits: Number of folds.
        batch_size: Batch size forwarded to `train_on_df`.
        random_state: Seed for the fold shuffle.

    Returns:
        Tuple `(mean, std)` of the per-fold correlations, computed with
        `np.mean` and `np.std`.
    """
    folds = KFold(n_splits=nsplits, random_state=random_state, shuffle=True)
    correlations = []
    for train_index, test_index in tqdm(folds.split(train), total=nsplits):
        train_df = train.iloc[train_index]
        test_df = train.iloc[test_index]
        # Keep only the held-out rows of the requested language, if the frame
        # carries a language column at all.
        if lang and "language" in test_df.columns:
            tmpsize = len(test_df)
            test_df = test_df[test_df["language"].isin([lang])]
            newsize = len(test_df)
            print(
                f"Reduced test set from {tmpsize} to {newsize} to match {lang}"
            )

        if fit:
            # train_on_df receives the checkpoint name, so no SentenceTransformer
            # is instantiated here: the previous per-fold load was overwritten
            # by this return value before it was ever used.
            model = train_on_df(
                model=model_name, df=train_df, batch_size=batch_size, config=config
            )

        correlations.append(eval_lang(model=model, df=test_df, lang=lang))
    return np.mean(correlations), np.std(correlations)

In [7]:
# Untrained encoder; scored as is by the no-training baseline (eval_kfold with fit=False).
model = SentenceTransformer(model_name)

In [None]:
# Score the untrained encoder with 5-fold cross-validation, one language at a time.
baseline_no_train = {}
for lang in tqdm(langs):
    train = get_data(lang=lang, train=True)
    mean_correlation, std_correlation = eval_kfold(
        model, train, lang=None, nsplits=5
    )  # lang=None: no per-fold language filter, the data is already fetched per language
    baseline_no_train[lang] = (mean_correlation, std_correlation)

In [10]:
# Raw (mean, std) fold correlations per language for the untrained encoder.
baseline_no_train

{'amh': (0.7586214183180987, 0.018783143557279034),
 'arq': (0.5040962424367389, 0.028206664256620186),
 'ary': (0.3603218474023298, 0.015926143512239702),
 'eng': (0.8072301739182689, 0.008707607084371858),
 'esp': (0.6030328794208057, 0.01402867346989389),
 'hau': (0.5237558277213593, 0.019255448083088905),
 'kin': (0.462020717813219, 0.052973172534683206),
 'mar': (0.7700211291469171, 0.013342558995495777),
 'tel': (0.7528091890045397, 0.014858794111326325)}

In [11]:
def process_df(data, prefix=""):
    """Turn `{key: (mean, std)}` results into a one-row table of formatted strings.

    Both values are multiplied by 100 and rendered with two decimals as
    "mean (std)". The returned frame has a single row labelled `prefix` and one
    column per key of `data`.

    Args:
        data: Mapping from a key (here a language code) to a `(mean, std)` pair.
        prefix: Label of the resulting row.

    Returns:
        DataFrame with one row of "mean (std)" strings.
    """
    table = pd.DataFrame(data).T
    table.columns = ["mean", "std"]
    table = table * 100
    mean_text = table["mean"].map("{:.2f}".format)
    std_text = table["std"].map("({:.2f})".format)
    table[f"{prefix}"] = mean_text + " " + std_text
    return table.drop(columns=["mean", "std"]).T

# One-row summary for the untrained encoder; process_df scales the values by 100.
df_notrain = process_df(baseline_no_train, prefix="e5-multi-notrain")
df_notrain

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
e5-multi-notrain,75.86 (1.88),50.41 (2.82),36.03 (1.59),80.72 (0.87),60.30 (1.40),52.38 (1.93),46.20 (5.30),77.00 (1.33),75.28 (1.49)


# All languages

In [None]:
import pandas as pd

# Load the combined training data used for the "all languages" runs.
df = pd.read_csv("data/train.csv")
print(df.shape)
# Explicit display: a bare `df.head()` in the middle of a cell is evaluated and
# then discarded, so the preview was never shown.
display(df.head())

# NOTE(review): eval_kfold shuffles with a fixed seed and never filters the
# training rows by language, so every language below retrains on the same
# three training folds; only the held-out rows differ (and only if this frame
# has a "language" column).
baseline_all = {}

for lang in langs:
    print(lang)
    mean_correlation, std_correlation = eval_kfold(model, df, lang, fit=True, nsplits=3)
    baseline_all[lang] = (mean_correlation, std_correlation)

In [14]:
# One-row summary for the models trained on the combined training data.
df_all = process_df(baseline_all, prefix="e5-all")
df_all

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
e5-all,84.52 (0.88),59.45 (2.34),81.20 (1.44),86.43 (0.55),67.16 (2.44),69.01 (0.19),69.08 (3.43),84.62 (1.35),80.14 (0.97)


# Train per language

In [None]:
# Train and cross-validate (3 folds) a separate model for each language, using
# only the data returned by get_data for that language.
baseline_lang = {}
for lang in langs:
    print(lang)
    # NOTE(review): this rebinds `df`, replacing the frame read from data/train.csv earlier.
    df = get_data(lang=lang, train=True)
    mean_correlation, std_correlation = eval_kfold(model, df, lang, fit=True, nsplits=3)
    baseline_lang[lang] = (mean_correlation, std_correlation)

In [17]:
# One-row summary for the per-language models.
df_lang = process_df(baseline_lang, prefix="e5-lang")
df_lang

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
e5-lang,82.27 (2.35),59.50 (3.25),75.73 (1.02),86.72 (1.02),67.21 (0.39),68.43 (2.10),63.04 (3.56),82.89 (0.32),77.94 (1.23)


In [18]:
# Stack the three one-row summaries into a single comparison table (one row per setup).
final_df = pd.concat([df_notrain, df_all, df_lang], axis=0)
final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
e5-multi-notrain,75.86 (1.88),50.41 (2.82),36.03 (1.59),80.72 (0.87),60.30 (1.40),52.38 (1.93),46.20 (5.30),77.00 (1.33),75.28 (1.49)
e5-all,84.52 (0.88),59.45 (2.34),81.20 (1.44),86.43 (0.55),67.16 (2.44),69.01 (0.19),69.08 (3.43),84.62 (1.35),80.14 (0.97)
e5-lang,82.27 (2.35),59.50 (3.25),75.73 (1.02),86.72 (1.02),67.21 (0.39),68.43 (2.10),63.04 (3.56),82.89 (0.32),77.94 (1.23)


In [19]:
# Reorder the language columns from the alphabetical order
#   amh arq ary eng esp hau kin mar tel
# to the order used in the LaTeX table:
#   arq & amh & eng & hau & kin & mar & ary & esp & tel
columns = [
    "arq", "amh", "eng", "hau", "kin", "mar", "ary", "esp", "tel"
]

final_df = final_df[columns]
# The cells are already "mean (std)" strings produced by process_df, so
# float_format does not alter this output; it is kept to match the other
# to_latex calls in this notebook.
print(final_df.to_latex(float_format="%.2f"))

\begin{tabular}{llllllllll}
\toprule
 & arq & amh & eng & hau & kin & mar & ary & esp & tel \\
\midrule
e5-multi-notrain & 50.41 (2.82) & 75.86 (1.88) & 80.72 (0.87) & 52.38 (1.93) & 46.20 (5.30) & 77.00 (1.33) & 36.03 (1.59) & 60.30 (1.40) & 75.28 (1.49) \\
e5-all & 59.45 (2.34) & 84.52 (0.88) & 86.43 (0.55) & 69.01 (0.19) & 69.08 (3.43) & 84.62 (1.35) & 81.20 (1.44) & 67.16 (2.44) & 80.14 (0.97) \\
e5-lang & 59.50 (3.25) & 82.27 (2.35) & 86.72 (1.02) & 68.43 (2.10) & 63.04 (3.56) & 82.89 (0.32) & 75.73 (1.02) & 67.21 (0.39) & 77.94 (1.23) \\
\bottomrule
\end{tabular}



In [None]:
# Split final_df into dev and test tables, based on the marker contained in
# each row label, then relabel the rows with short names.
# index = "base lang".split()
index = "base all lang".split()


def _select_split(frame, marker, labels):
    """Return a copy of the rows of `frame` whose label contains `marker`, relabelled with `labels`."""
    subset = frame[frame.index.str.contains(marker)].copy()
    # Fail with an actionable message instead of pandas' generic length-mismatch
    # error when final_df does not carry the expected rows.
    if len(subset) != len(labels):
        raise ValueError(
            f"Expected {len(labels)} rows whose label contains {marker!r}, "
            f"found {len(subset)}: {list(subset.index)}. "
            "final_df must hold one such row per entry of `index`."
        )
    subset.index = labels
    return subset


dev_df = _select_split(final_df, "dev", index)

test_df = _select_split(final_df, "test", index)

In [None]:
# LaTeX export of the dev-split rows.
# NOTE(review): the recorded output has numeric (right-aligned) columns, unlike the
# string-valued final_df assembled earlier in this notebook — it appears to come
# from a different final_df; confirm before re-running.
print(dev_df.to_latex(float_format="%.2f"))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & arq & amh & eng & hau & kin & mar & ary & esp & tel \\
\midrule
base & 26.80 & -11.65 & -12.57 & -20.03 & 5.33 & -1.60 & -48.28 & -6.52 & -33.06 \\
all & 47.51 & 75.95 & 84.72 & 68.00 & 65.11 & 84.78 & 85.45 & 71.88 & 83.40 \\
lang & 52.03 & 77.05 & 85.28 & 66.34 & 33.10 & 75.66 & 79.70 & 72.24 & 77.83 \\
\bottomrule
\end{tabular}



In [None]:
# LaTeX export of the test-split rows.
# NOTE(review): the recorded output has numeric (right-aligned) columns, unlike the
# string-valued final_df assembled earlier in this notebook — it appears to come
# from a different final_df; confirm before re-running.
print(test_df.to_latex(float_format="%.2f"))

\begin{tabular}{lrrrrrrrrr}
\toprule
 & arq & amh & eng & hau & kin & mar & ary & esp & tel \\
\midrule
base & 8.76 & -30.51 & -13.80 & -3.54 & -5.22 & 9.43 & -51.00 & -3.67 & -15.27 \\
all & 54.73 & 86.42 & 85.83 & 75.37 & 71.11 & 86.81 & 76.68 & 63.45 & 80.23 \\
lang & 55.60 & 77.96 & 86.21 & 72.90 & 44.36 & 81.08 & 79.81 & 64.35 & 75.31 \\
\bottomrule
\end{tabular}

