# Preparation

In [None]:
import torch
import sacrebleu
import pickle
import glob
import itertools
import numpy as np
import statistics
from tqdm import tqdm
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.tree import *
from sklearn.metrics import explained_variance_score
from numpy.random import randn
from numpy import mean
from numpy import std
from scipy import stats

In [None]:
def convert_bleu(spbleu_data):
    """Flatten BLEU results into a flat ``{feature_name: value}`` dict.

    Args:
        spbleu_data: Mapping with the keys ``"bleu_max1"`` .. ``"bleu_max4"``.
            Every value must expose ``precisions``, ``totals`` and ``counts``
            sequences; the ``"bleu_max4"`` value must additionally expose
            ``score``, ``bp``, ``ref_len`` and ``sys_len`` (``ratio`` is
            optional).

    Returns:
        dict with, for n in 1..4, ``"bleu_max{n}-score"`` (the n-th entry of
        ``precisions``, not the overall score), ``"bleu_max{n}-totals"`` and
        ``"bleu_max{n}-counts"``, plus ``"bleu-ratio"``,
        ``"bleu_final-score"``, ``"bleu-bp"``, ``"bleu-ref_len"`` and
        ``"bleu-sys_len"`` taken from the ``"bleu_max4"`` entry.
    """
    spbleu_dict = dict()
    for i, k in enumerate(["bleu_max1", "bleu_max2", "bleu_max3", "bleu_max4"]):
        entry = spbleu_data[k]
        # Entry "bleu_max{n}" contributes only its n-gram-order-n statistics.
        spbleu_dict[f"{k}-score"] = entry.precisions[i]
        spbleu_dict[f"{k}-totals"] = entry.totals[i]
        spbleu_dict[f"{k}-counts"] = entry.counts[i]

    final = spbleu_data["bleu_max4"]
    try:
        ratio = final.ratio
    except AttributeError:
        # Score objects without a ``ratio`` attribute: derive it from the
        # lengths, guarding against an empty reference.
        ratio = final.sys_len / final.ref_len if final.ref_len else 0
    spbleu_dict["bleu-ratio"] = ratio
    spbleu_dict["bleu_final-score"] = final.score
    spbleu_dict["bleu-bp"] = final.bp
    spbleu_dict["bleu-ref_len"] = final.ref_len
    spbleu_dict["bleu-sys_len"] = final.sys_len
    return spbleu_dict

def convert_spbleu(spbleu_data):
    """Flatten spBLEU results into a flat ``{feature_name: value}`` dict.

    Args:
        spbleu_data: Mapping with the keys ``"spbleu_max1"`` ..
            ``"spbleu_max4"``. Every value must expose ``precisions``,
            ``totals`` and ``counts`` sequences; the ``"spbleu_max4"`` value
            must additionally expose ``score``, ``bp``, ``ref_len`` and
            ``sys_len`` (``ratio`` is optional).

    Returns:
        dict with, for n in 1..4, ``"spbleu_max{n}-score"`` (the n-th entry
        of ``precisions``, not the overall score), ``"spbleu_max{n}-totals"``
        and ``"spbleu_max{n}-counts"``, plus ``"spbleu-ratio"``,
        ``"spbleu_final-score"``, ``"spbleu-bp"``, ``"spbleu-ref_len"`` and
        ``"spbleu-sys_len"`` taken from the ``"spbleu_max4"`` entry.
    """
    spbleu_dict = dict()
    for i, k in enumerate(["spbleu_max1", "spbleu_max2", "spbleu_max3", "spbleu_max4"]):
        entry = spbleu_data[k]
        # Entry "spbleu_max{n}" contributes only its n-gram-order-n statistics.
        spbleu_dict[f"{k}-score"] = entry.precisions[i]
        spbleu_dict[f"{k}-totals"] = entry.totals[i]
        spbleu_dict[f"{k}-counts"] = entry.counts[i]

    final = spbleu_data["spbleu_max4"]
    try:
        ratio = final.ratio
    except AttributeError:
        # Score objects without a ``ratio`` attribute: derive it from the
        # lengths, guarding against an empty reference.
        ratio = final.sys_len / final.ref_len if final.ref_len else 0
    spbleu_dict["spbleu-ratio"] = ratio
    spbleu_dict["spbleu_final-score"] = final.score
    spbleu_dict["spbleu-bp"] = final.bp
    spbleu_dict["spbleu-ref_len"] = final.ref_len
    spbleu_dict["spbleu-sys_len"] = final.sys_len
    return spbleu_dict

def convert_chrf(chrf_data):
    """Collect the chrF scores into a ``{"chrf_max{n}-score": value}`` dict.

    The input mapping is keyed ``"spbleu_max1"`` .. ``"spbleu_max6"``; each
    value's ``score`` attribute is stored under the matching
    ``"chrf_max{n}-score"`` key.
    """
    source_keys = (
        "spbleu_max1", "spbleu_max2", "spbleu_max3",
        "spbleu_max4", "spbleu_max5", "spbleu_max6",
    )
    return {
        f"{key.replace('spbleu', 'chrf')}-score": chrf_data[key].score
        for key in source_keys
    }

def convert_ter(ter_data):
    """Return ``{"ter-score": <score>}`` read from ``ter_data["ter"].score``."""
    return {"ter-score": ter_data["ter"].score}

def convert_bertscore(bertscore_data):
    """Reduce BERTScore outputs to mean precision / recall / F, scaled by 100.

    Args:
        bertscore_data: Mapping whose ``"bertscore"`` entry is an indexable
            triple; each element must support ``.mean().item()``.
            Presumably these are the (P, R, F1) tensors returned by
            ``bert_score.score`` -- confirm against the code that wrote the
            pickles.

    Returns:
        dict with the keys ``"bertscore-p"``, ``"bertscore-r"`` and
        ``"bertscore-f"``, each the mean of the corresponding element
        multiplied by 100.
    """
    precision, recall, f_score = (
        bertscore_data["bertscore"][idx].mean().item() * 100 for idx in range(3)
    )
    return {
        "bertscore-p": precision,
        "bertscore-r": recall,
        # Index 2 is the F component; the previous code read index 1 (recall)
        # here, so "bertscore-f" silently duplicated "bertscore-r".
        "bertscore-f": f_score,
    }

# Dispatch table: metric-file suffix -> function that flattens the unpickled
# metric object into a {feature_name: value} dict.
READ_FEATURE = dict(
    bleu=convert_bleu,
    spbleu=convert_spbleu,
    chrf=convert_chrf,
    ter=convert_ter,
    bertscore=convert_bertscore,
)

In [None]:
def train_multilinear_regressor(x, y):
    """Fit a scikit-learn ``LinearRegression`` on ``(x, y)`` and return it."""
    # ``fit`` returns the fitted estimator itself, so it can be returned directly.
    return LinearRegression().fit(x, y)

In [None]:

def evaluate_regressor(x, y, regressor):
    """Return the mean prediction of ``regressor`` on ``x`` and the mean of ``y``.

    Args:
        x: Sequence of feature rows.
        y: Sequence of target values.
        regressor: Either one of the scikit-learn regressors listed below
            (used through ``predict``) or any other callable, which is fed a
            ``torch`` tensor built from ``x``.

    Returns:
        Tuple ``(y_pred_mean, y_mean)``.
    """
    sklearn_regressors = (
        Ridge, LinearRegression, Lasso, ElasticNet,
        GradientBoostingRegressor, DecisionTreeRegressor, RandomForestRegressor,
    )
    # isinstance (rather than an exact type match) also accepts subclasses of
    # these estimators, which expose the same ``predict`` API.
    if isinstance(regressor, sklearn_regressors):
        y_pred = regressor.predict(list(x)).tolist()
        y_pred_mean = np.array(y_pred).mean()
    else:
        # Torch-style callable: try the transposed layout first, then fall
        # back to the untransposed input. The inputs are built from the
        # original ``x`` each time; the previous code rebound ``x`` to the
        # transposed tensor, so its fallback retried the same layout.
        try:
            y_pred_mean = regressor(torch.tensor(x).t()).mean().cpu().detach().numpy()
        except Exception:
            y_pred_mean = regressor(torch.tensor(x)).mean().cpu().detach().numpy()

    y_mean = np.array(y).mean()
    return (y_pred_mean, y_mean)

# Read Data (adapt the paths and file layout to your own setup)

In [None]:
def read_all_data(KS, dataset="local_flores_101/*",model_name="mbart50-m2m", split="train"):
    """Load pickled metric files and build regression features and targets.

    Folders are discovered with ``glob`` as ``{dataset}/{model_name}/``. In
    each folder the files whose path contains ``split`` form a group; groups
    with fewer than four files are ignored. For every prefix in
    ``self_src``, ``self_tgt``, ``trans_from_src``, ``trans_from_tgt`` and
    every metric named in ``KS``, the file
    ``{split}_{prefix}_corpus_{metric}.pkl`` is unpickled and flattened with
    ``READ_FEATURE[metric]``.

    Each usable folder yields two samples:
      * features from ``self_src``, target from ``trans_from_tgt``, labelled
        with the language-pair directory name;
      * features from ``self_tgt``, target from ``trans_from_src``, labelled
        with that name reversed around ``"_"``.

    Args:
        KS: List of ``(feature, attribute)`` pairs, looked up as
            ``"{feature}-{attribute}"``. The part of ``feature`` before the
            first ``"_"`` selects the metric file. The last pair is also
            used as the regression target.
        dataset: Glob pattern for the language-pair directories.
        model_name: Sub-directory of each language-pair directory to read.
        split: Substring selecting files and the file-name prefix.

    Returns:
        Tuple ``(x1, y, lang)`` of equally long lists.

    Note:
        The language pair is taken from the directory that directly contains
        the ``model_name`` folder. Folders that fail to load are skipped with
        a printed message instead of being dropped silently.
    """
    # Function-scope import: ``os`` is not imported at the top of this file.
    import os

    lang, x1, x2, y = [], [], [], []
    print(f"{dataset}/{model_name}/")
    all_folders = glob.glob(f"{dataset}/{model_name}/")
    all_data_groups = [[f_name for f_name in glob.glob(f"{folder}/*") if split in f_name]
                       for folder in all_folders]

    # Loop-invariant: metric files to open (deduplicated, order preserved)
    # and the flattened feature keys to extract.
    suffixes = list(dict.fromkeys(k1.split("_")[0] for k1, _ in KS))
    feature_keys = [f"{k1}-{k2}" for k1, k2 in KS]

    for group_name in all_data_groups:
        if len(group_name) < 4:
            continue
        # os.path handles both "\\" and "/" separators; the previous code split
        # on "\\" only and therefore found no data on POSIX systems.
        folder = os.path.dirname(group_name[0])
        pair = os.path.basename(os.path.dirname(folder))
        try:
            tmp_data = dict()
            for prefix in ["self_src", "self_tgt", "trans_from_src", "trans_from_tgt"]:
                for suffix in suffixes:
                    file_path = os.path.join(folder, f"{split}_{prefix}_corpus_{suffix}.pkl")
                    # NOTE: pickle.load executes arbitrary code from the file;
                    # only read metric files you produced yourself.
                    with open(file_path, "rb") as f:
                        data = READ_FEATURE[suffix](pickle.load(f))
                    tmp_data.setdefault(prefix, {}).update(data)
            src_features = [tmp_data["self_src"][key] for key in feature_keys]
            tgt_features = [tmp_data["self_tgt"][key] for key in feature_keys]
            target_key = feature_keys[-1]
            target_from_tgt = tmp_data["trans_from_tgt"][target_key]
            target_from_src = tmp_data["trans_from_src"][target_key]
        except Exception as err:
            # Best effort: skip unreadable or incomplete folders, but say so.
            print(f"Skipping {folder}: {type(err).__name__}: {err}")
            continue

        # Append only after everything loaded, so x1 / x2 / y / lang can never
        # get out of step when a folder fails half-way through.
        x1.extend([src_features, tgt_features])
        x2.extend([tgt_features, src_features])
        y.extend([target_from_tgt, target_from_src])
        lang.extend([pair, "_".join(reversed(pair.split("_")))])

    # Choose the way you want to read features
    return x1, y,lang
#     return [xx1+xx2 for xx1,xx2 in zip(x1,x2)], y,lang

# Flores-101

In [None]:
# Constants: language codes used to select Flores-101 language pairs.
#   REGION_1 -> the seen languages
#   REGION_2 -> the unseen languages
REGION_1 = [
    'en', 'es', 'fr', 'de', 'pt', 'ru', 'it', 'nl', 'tr', 'pl',
    'zh', 'ro', 'el', 'ja', 'ta', 'kk', 'km', 'ha', 'ps', 'gu',
]
REGION_2 = [
    'lv', 'hi', 'jv', 'is', 'az', 'hy', 'cs',
    'fi', 'bg', 'lt', 'et', 'ur', 'my',
]

In [None]:
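# Define the features you want to read.
# KS is a list of (feature, attribute) pairs; each pair is looked up as
# "<feature>-<attribute>" in the converted metric data. Valid choices:
#
#         feature in ["chrf_max1", "chrf_max2", "chrf_max3", "chrf_max4", "chrf_max5", "chrf_max6"]
#         with attribute in
#         ["score"]
#
#         and/or
#         feature in ["bleu_max1", "bleu_max2", "bleu_max3", "bleu_max4"]
#         with attribute in
#         ["score", "counts", "totals"]
#
#         and/or
#         feature in ["spbleu_max1", "spbleu_max2", "spbleu_max3", "spbleu_max4"]
#         with attribute in
#         ["score", "counts", "totals"]
#
#         and/or
#         feature in ["bleu_final"]
#         with attribute in
#         ["score"]
#         (the related values are stored under feature "bleu":
#          ["bp", "ref_len", "sys_len", "ratio"])
#
#         and/or
#         feature in ["spbleu_final"]
#         with attribute in
#         ["score"]
#         (the related values are stored under feature "spbleu":
#          ["bp", "ref_len", "sys_len", "ratio"])
#
#         and/or
#         feature in ["ter"]
#         with attribute in
#         ["score"]
#
#         and/or
#         feature in ["bertscore"]
#         with attribute in
#         ["p", "r", "f"]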

KS = [
    ("spbleu_final", "score"),
]
# Available model names: "mbart50-m2m", "m2m-100-base", "m2m-100-large", "google_drive"
model_n = "m2m-100-base"

In [None]:
# Train the linear regression model, keeping only the samples whose
# language codes all belong to REGION_1.
train_x, train_y, l = read_all_data(KS, dataset="local_flores_101/*", split="train")
train_mask = [all(code in REGION_1 for code in pair.split("_")) for pair in l]
train_x = [features for features, keep in zip(train_x, train_mask) if keep]
train_y = [target for target, keep in zip(train_y, train_mask) if keep]
model = train_multilinear_regressor(train_x, train_y)

In [None]:
# Test data selection: adjust to your needs.


# Example 1: Region 1 data (both languages of the pair are in REGION_1)
test_x, test_y, l = read_all_data(KS, dataset="local_flores_101/*", split="test")
region_pairs = set(itertools.product(REGION_1, REGION_1))
test_mask = [tuple(pair.split("_")) in region_pairs for pair in l]
test_x = [features for features, keep in zip(test_x, test_mask) if keep]
test_y = [target for target, keep in zip(test_y, test_mask) if keep]
l = [pair for pair, keep in zip(l, test_mask) if keep]

# Example 2: Region 2 data (both languages of the pair are in REGION_2)
# test_x, test_y, l = read_all_data(KS, dataset="../local_flores_101/*", model_name=model_n, split="test")
# region_pairs = set(itertools.product(REGION_2, REGION_2))
# test_mask = [tuple(pair.split("_")) in region_pairs for pair in l]
# test_x = [features for features, keep in zip(test_x, test_mask) if keep]
# test_y = [target for target, keep in zip(test_y, test_mask) if keep]
# l = [pair for pair, keep in zip(l, test_mask) if keep]

In [None]:
# Evaluate the model one test sample at a time, then print summary statistics.
avg_mae = []  # absolute error per test sample
pres = []     # predictions, floored at 0

model = train_multilinear_regressor(train_x, train_y)
for features, target, pair in zip(test_x, test_y, l):
    predicted_mean, target_mean = evaluate_regressor([features], [target], model)
    clamped = max(predicted_mean, 0)  # negative predictions are reported as 0

    print(pair)
    print(clamped, target_mean)
    print()

    avg_mae.append(abs(clamped - target_mean))
    pres.append(clamped)

# Mean absolute error
print(mean(avg_mae))
# Root mean squared error
squared_errors = (np.array(pres) - np.array(test_y)) ** 2
print(np.sqrt(np.mean(squared_errors)))
# Number of samples with absolute error <= 2, and the total sample count
print(sum(1 for err in avg_mae if err <= 2), len(avg_mae))
# Pearson correlation between predictions and targets
print(stats.pearsonr(pres, test_y))