In [1]:
%load_ext autoreload
%autoreload 2

# XLM-R 

# Initial multilingual baseline (no training)

In [2]:
from models import models
from util import get_data
model_name = models["multilingual"]
model_name

  from tqdm.autonotebook import tqdm, trange


'FacebookAI/xlm-roberta-base'

In [3]:
from sklearn.model_selection import KFold

def get_kfold(lang, nsplits=5):
    """Build shuffled K-fold train/test splits of one language's training data.

    Args:
        lang: Language code forwarded to ``get_data(lang=..., train=True)``.
        nsplits: Number of folds passed to ``KFold``.

    Returns:
        A list with one dict per fold; each dict maps ``"train"`` and
        ``"test"`` to the rows selected (via ``.iloc``) for that fold.
    """
    frame = get_data(lang=lang, train=True)
    # Fixed random_state so repeated calls yield the same fold assignment.
    splitter = KFold(n_splits=nsplits, shuffle=True, random_state=42)
    return [
        {"train": frame.iloc[fit_rows], "test": frame.iloc[held_rows]}
        for fit_rows, held_rows in splitter.split(frame)
    ]

# Sanity check: build the folds for "eng" and peek at the first held-out split.
kfold_dfs = get_kfold("eng")
kfold_dfs[0]["test"].head()

Unnamed: 0,PairID,Score,s1,s2
8,ENG-train-0008,1.0,Your parents do not have to like your boyfrien...,"your parents dont have to like your bf, you do."
12,ENG-train-0012,1.0,"You might find out later, that you lost the pe...",u might find out later you lost the perfect one.
15,ENG-train-0015,1.0,Guys can be weird. I should know.,Guys can be wierd; I should know.
17,ENG-train-0017,1.0,Fritz Laband was a German footballer .,Fritz Laband is a former German football player .
23,ENG-train-0023,1.0,"If you happen to see me before I return, pleas...",If you should see me before I get back...pleas...


In [4]:
from pair_encoder.model import PairEncoder

# No fine-tuning happens in this cell. The load warning printed below reports
# that the classifier head weights are newly initialized, so this `model`
# serves as the untrained baseline in the cells that follow.
model = PairEncoder(
    model_name=model_name, max_length=200, device="cuda", seed=42
)

Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/xlm-roberta-base and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
from util import get_langs
langs = get_langs()
langs

['amh', 'arq', 'ary', 'eng', 'esp', 'hau', 'kin', 'mar', 'tel']

In [6]:
from pair_encoder.evaluation import (
    CorrelationEvaluator,
    get_correlation,
)
import pandas as pd
from tqdm.notebook import tqdm
import numpy as np

from util import get_data, get_pairs, eval_and_submit


def eval_lang(model, df_test):
    """Return ``get_correlation`` of ``model`` on the pairs built from ``df_test``.

    Args:
        model: Passed to ``get_correlation`` as ``pair_encoder``.
        df_test: Frame converted to pairs with ``get_pairs``.
    """
    test_pairs = get_pairs(df_test)
    return get_correlation(test=test_pairs, pair_encoder=model)

from pair_encoder import train_encoder


def eval_kfold(model, train, lang=None, fit=False, nsplits=5):
    """Cross-validate a pair encoder and summarise the per-fold correlations.

    Args:
        model: Encoder that is scored on each held-out fold when ``fit`` is
            False. When ``fit`` is True it is not used for scoring.
        train: Frame to split; rows are selected with ``.iloc``.
        lang: If truthy and the frame has a ``"language"`` column, each
            held-out fold is restricted to rows whose language is ``lang``
            before scoring. The training rows are never filtered.
        fit: If True, a fresh encoder is trained on every fold's training
            rows with ``train_encoder`` (using the ``model_name`` found in
            the enclosing scope) and that encoder is scored instead.
        nsplits: Number of folds.

    Returns:
        ``(mean, std)`` of the per-fold values returned by ``eval_lang``,
        computed with ``np.mean`` and ``np.std``.
    """
    splitter = KFold(n_splits=nsplits, shuffle=True, random_state=42)
    fold_scores = []
    for fit_rows, held_rows in tqdm(splitter.split(train), total=nsplits):
        fold_train = train.iloc[fit_rows]
        # Rebinding here also drops the reference to the previous fold's
        # trained encoder before the next one is created.
        scorer = model
        if fit:
            scorer, _history = train_encoder(
                train_samples=get_pairs(fold_train),
                upscaling_samples=None,
                model_name=model_name,
                similarity_model=None,
                batch_size=32,
                learning_rate=2e-5,
                max_grad_norm=1.0,
                epochs=2,
                eval_steps=0,
                max_length=200,
                k=0,
                weak_training_epochs=2,  # used if k > 0
                seed=42,
                verbose=True,
                device="cuda"
            )

        held_out = train.iloc[held_rows]
        if lang and "language" in held_out.columns:
            size_before = len(held_out)
            held_out = held_out[held_out["language"].isin([lang])]
            size_after = len(held_out)
            print(f"Reduced test set from {size_before} to {size_after} to match {lang}")

        fold_scores.append(eval_lang(scorer, held_out))

    return np.mean(fold_scores), np.std(fold_scores)



In [None]:
# Score the untrained `model` with 5-fold evaluation on each language's own
# training data. `fit` is left at its default (False), so nothing is trained.
baseline_no_train = {}
for lang in tqdm(langs):
    train = get_data(lang=lang, train=True)
    mean_correlation, std_correlation = eval_kfold(
        model, train, lang=None, nsplits=5
    )  # lang=None: `train` was fetched for this language only, so no test-fold filtering is requested
    baseline_no_train[lang] = (mean_correlation, std_correlation)

In [8]:
baseline_no_train

{'amh': (-0.10748305201014174, 0.07669711348214014),
 'arq': (-0.026382370849709902, 0.05068177088865072),
 'ary': (-0.18984242375081886, 0.04879139408238526),
 'eng': (-0.15489859894399388, 0.033273925759626864),
 'esp': (0.007996704266400315, 0.04941575521451324),
 'hau': (-0.04180103964122549, 0.06069975855303336),
 'kin': (0.010304412975683698, 0.03896655639164298),
 'mar': (-0.07649096988779235, 0.07489661421882177),
 'tel': (-0.11982415619942152, 0.09324279592832001)}

In [9]:
def process_df(data, prefix=""):
    """Format ``{key: (mean, std)}`` results as a one-row table of strings.

    Each pair is scaled by 100 and rendered as ``"<mean> (<std>)"`` with two
    decimals, e.g. ``(0.25, 0.125)`` becomes ``"25.00 (12.50)"``.

    Args:
        data: Mapping from column label (e.g. a language code) to a
            two-element ``(mean, std)`` sequence.
        prefix: Label of the single result row.

    Returns:
        A DataFrame with one row labelled ``prefix`` and one column per key
        of ``data``, in the mapping's iteration order. An empty mapping
        yields a one-row frame with no columns.
    """
    # Build the row directly instead of going through temporary "mean"/"std"
    # columns: that way a prefix named "mean" or "std" cannot collide with
    # them, and an empty mapping does not fail on a column-count mismatch.
    cells = {
        key: f"{mean * 100:.2f} ({std * 100:.2f})"
        for key, (mean, std) in data.items()
    }
    return pd.DataFrame(cells, index=[prefix])

df_notrain = process_df(baseline_no_train, prefix="XLMR-notrain")
df_notrain

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
XLMR-notrain,-10.75 (7.67),-2.64 (5.07),-18.98 (4.88),-15.49 (3.33),0.80 (4.94),-4.18 (6.07),1.03 (3.90),-7.65 (7.49),-11.98 (9.32)


In [10]:
baseline_no_train

{'amh': (-0.10748305201014174, 0.07669711348214014),
 'arq': (-0.026382370849709902, 0.05068177088865072),
 'ary': (-0.18984242375081886, 0.04879139408238526),
 'eng': (-0.15489859894399388, 0.033273925759626864),
 'esp': (0.007996704266400315, 0.04941575521451324),
 'hau': (-0.04180103964122549, 0.06069975855303336),
 'kin': (0.010304412975683698, 0.03896655639164298),
 'mar': (-0.07649096988779235, 0.07489661421882177),
 'tel': (-0.11982415619942152, 0.09324279592832001)}

# Train on all languages, evaluate per language

In [None]:
import pandas as pd

# Load the combined training data for all languages.
df = pd.read_csv("data/train.csv")
print(df.shape)
# Not the last expression of the cell, so this preview is not displayed.
df.head()

baseline_all = {}

# For every language: train on the folds of the combined data (fit=True) and
# score on the held-out rows of that language only (eval_kfold filters the
# test fold on the "language" column when it is present).
# NOTE(review): every call receives the same `df` and eval_kfold uses fixed
# seeds, so each language appears to retrain on identical folds -- confirm
# whether the trained encoders could be shared across languages.
for lang in langs:
    print(lang)
    mean_correlation, std_correlation = eval_kfold(model, df, lang, fit=True, nsplits=5)
    baseline_all[lang] = (mean_correlation, std_correlation)

In [12]:
df_all = process_df(baseline_all, prefix="XLMR-all")
df_all

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
XLMR-all,84.56 (1.53),58.23 (5.30),82.01 (3.04),83.63 (1.36),64.73 (3.23),72.25 (0.66),59.70 (4.09),83.44 (2.56),77.96 (3.87)


In [19]:
df_all = process_df(baseline_all, prefix="XLMR-all")
df_all

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
XLMR-all,84.56 (1.53),58.23 (5.30),82.01 (3.04),83.63 (1.36),64.73 (3.23),72.25 (0.66),59.70 (4.09),83.44 (2.56),77.96 (3.87)


# Train per language

In [None]:
# Train and evaluate a separate encoder per language, using only that
# language's training data for both the training and the held-out folds.
baseline_lang = {}
for lang in langs:
    print(lang)
    # NOTE: rebinds `df`, which an earlier cell used for the combined data.
    df = get_data(lang=lang, train=True)
    # NOTE(review): the per-language preview shown earlier has no "language"
    # column, so the `lang` filter in eval_kfold is presumably skipped here --
    # confirm against get_data.
    mean_correlation, std_correlation = eval_kfold(model, df, lang, fit=True, nsplits=5)
    baseline_lang[lang] = (mean_correlation, std_correlation)

In [18]:
df_lang = process_df(baseline_lang, prefix="XLMR-lang")
df_lang

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
XLMR-lang,73.22 (3.02),39.03 (4.49),69.14 (4.16),83.27 (0.89),58.72 (8.00),63.57 (1.96),31.40 (7.34),74.31 (3.02),71.40 (3.99)


In [20]:
# Stack the three result tables row-wise into one comparison table.
final_df = pd.concat([df_notrain, df_all, df_lang], axis=0)
final_df

Unnamed: 0,amh,arq,ary,eng,esp,hau,kin,mar,tel
XLMR-notrain,-10.75 (7.67),-2.64 (5.07),-18.98 (4.88),-15.49 (3.33),0.80 (4.94),-4.18 (6.07),1.03 (3.90),-7.65 (7.49),-11.98 (9.32)
XLMR-all,84.56 (1.53),58.23 (5.30),82.01 (3.04),83.63 (1.36),64.73 (3.23),72.25 (0.66),59.70 (4.09),83.44 (2.56),77.96 (3.87)
XLMR-lang,73.22 (3.02),39.03 (4.49),69.14 (4.16),83.27 (0.89),58.72 (8.00),63.57 (1.96),31.40 (7.34),74.31 (3.02),71.40 (3.99)


In [21]:
# Reorder the language columns from
#   amh arq ary eng esp hau kin mar tel
# to
#   arq amh eng hau kin mar ary esp tel
# before exporting the table as LaTeX.
columns = [
    "arq", "amh", "eng", "hau", "kin", "mar", "ary", "esp", "tel"
]

final_df = final_df[columns]
# float_format is kept as in the original call; the cells printed below are
# already formatted strings, so it presumably has no effect on them.
print(final_df.to_latex(float_format="%.2f"))

\begin{tabular}{llllllllll}
\toprule
 & arq & amh & eng & hau & kin & mar & ary & esp & tel \\
\midrule
XLMR-notrain & -2.64 (5.07) & -10.75 (7.67) & -15.49 (3.33) & -4.18 (6.07) & 1.03 (3.90) & -7.65 (7.49) & -18.98 (4.88) & 0.80 (4.94) & -11.98 (9.32) \\
XLMR-all & 58.23 (5.30) & 84.56 (1.53) & 83.63 (1.36) & 72.25 (0.66) & 59.70 (4.09) & 83.44 (2.56) & 82.01 (3.04) & 64.73 (3.23) & 77.96 (3.87) \\
XLMR-lang & 39.03 (4.49) & 73.22 (3.02) & 83.27 (0.89) & 63.57 (1.96) & 31.40 (7.34) & 74.31 (3.02) & 69.14 (4.16) & 58.72 (8.00) & 71.40 (3.99) \\
\bottomrule
\end{tabular}

