# Results inspection
Loads the results of the 5 bootstrap sub-sample validation runs and, for every metric, computes the mean and the standard deviation across those runs.

In [4]:
%load_ext autoreload
%autoreload 2

In [5]:
import json
from collections import defaultdict
from typing import Literal
import os

# Run configuration: which model variant and which dataset to inspect.
model_types = dict(single="singlemodel", token="tokenmodel")

# Use model_types["single"] here to inspect the single-model runs instead.
model_type = model_types["token"]
print(model_type)

# Every dataset name maps to the sub-folder of the same name.
datasets = {name: name for name in ("europarl", "opensub", "tatoeba")}

dataset = datasets["opensub"]

RESULTS_FOLDER = "results_1000samples_5bootstrap"

folder_id = f"{dataset}_{model_type}"

# Collect all run folders of the dataset (dataset_{single/token}model.N),
# ordered by their numeric run suffix N.
folders = sorted(
    (
        entry
        for entry in os.listdir(os.path.join("..", RESULTS_FOLDER, dataset))
        if dataset in entry
    ),
    key=lambda entry: int(entry.rsplit(".", 1)[-1]),
)
print(folders)
def get_combined(_type: Literal["metrics", "translations"]="metrics"):
    """Merge the per-run result files of the selected model type, grouped by language.

    Iterates over the notebook-level ``folders`` list, skips every folder whose
    name does not contain ``model_type``, and loads each JSON file in the
    remaining folders whose file name contains ``_type``. The language key is
    the part of the file name before the first dot.

    Args:
        _type: Which kind of result file to load, ``"metrics"`` or
            ``"translations"``.

    Returns:
        A dict ``{lang: {model_key: value}}``.

        * ``"metrics"``: ``value`` is a ``defaultdict(list)`` mapping each
          metric name to the list of values collected across the folders,
          in ``folders`` order.
        * ``"translations"``: ``value`` is whatever the JSON file stores under
          ``model_key``. Later folders overwrite earlier ones, so only the
          last folder's translations are kept.
    """
    combined = {}
    for folder in folders:
        if model_type not in folder:
            continue
        print(f"Fetching from {folder}")
        folder_path = os.path.join("..", RESULTS_FOLDER, dataset, folder)
        for file_name in sorted(os.listdir(folder_path)):
            if _type not in file_name:
                continue
            lang = file_name.split(".")[0]
            per_lang = combined.setdefault(lang, {})

            # Read as UTF-8 explicitly: the translations contain non-ASCII text
            # and the platform default encoding is not guaranteed to be UTF-8.
            with open(os.path.join(folder_path, file_name), encoding="utf-8") as f:
                data = json.load(f)

            # Top-level keys of each file: baseline, 0.5, 0.6, ..., 1.0
            if _type == "metrics":
                for model_key, metric_values in data.items():
                    per_model = per_lang.setdefault(model_key, defaultdict(list))
                    for metric_key, metric_value in metric_values.items():
                        per_model[metric_key].append(metric_value)
            elif _type == "translations":
                # Last folder wins: entries from earlier folders are replaced.
                per_lang.update(data)
    return combined

# Load both kinds of result files for the configured dataset and model type.
all_metrics = get_combined(_type="metrics")
all_transl = get_combined(_type="translations")

tokenmodel
['opensub_tokenmodel.0', 'opensub_singlemodel.0', 'opensub_tokenmodel.1', 'opensub_singlemodel.1', 'opensub_tokenmodel.2', 'opensub_singlemodel.2', 'opensub_tokenmodel.3', 'opensub_singlemodel.3', 'opensub_tokenmodel.4', 'opensub_singlemodel.4']
Fetching from opensub_tokenmodel.0
Fetching from opensub_tokenmodel.1
Fetching from opensub_tokenmodel.2
Fetching from opensub_tokenmodel.3
Fetching from opensub_tokenmodel.4
Fetching from opensub_tokenmodel.0
Fetching from opensub_tokenmodel.1
Fetching from opensub_tokenmodel.2
Fetching from opensub_tokenmodel.3
Fetching from opensub_tokenmodel.4


In [8]:
import pandas as pd
import numpy as np
import sys
sys.path.append("../")
from evaluation_tools import print_metrics
# calc the mean + std for each list of the N metric values

# When True, cells hold "mean (std)" strings; otherwise the rounded mean as a float.
ADD_STD = False
# Only these model keys (baseline + selected compression levels) are shown.
model_types_to_visualize = ["baseline", "0.5", "0.7", "1.0"]
ignored_metrics = ["chrf++", "rl",]

# Reduce each list of per-run metric values to a single presentable value.
# Loop variables are deliberately NOT called `model_type` / `metrics`, so the
# notebook-level `model_type` chosen in the configuration cell is not overwritten.
presentable_metrics = {lang: {} for lang in all_metrics.keys()}
for lang, lang_metrics in all_metrics.items():
    for compression_level, metric_lists in lang_metrics.items():
        # The model key is the compression level (or "baseline").
        if compression_level not in model_types_to_visualize:
            continue
        presentable_metrics[lang][compression_level] = {}
        for metric_name, metric_values in metric_lists.items():
            if metric_name in ignored_metrics:
                continue
            values = np.array(metric_values)
            mean = np.mean(values)
            if ADD_STD:
                std = np.std(values)
                presentable_metrics[lang][compression_level][metric_name] = f"{mean:.2f} ({std:.2f})"
            else:
                presentable_metrics[lang][compression_level][metric_name] = float(mean.round(2))


# One DataFrame per language: columns are model keys, rows are metric names.
dataframes = {}

for lang, lang_metrics in presentable_metrics.items():
    dataframes[lang] = pd.DataFrame(lang_metrics)
print(dataframes.keys())

# Collect the per-language tables returned by print_metrics; each is stored
# transposed so the next cell can concatenate them side by side.
dfs = []
for lang in dataframes.keys():
    df = print_metrics(_lang=lang, _metrics=dataframes[lang], std=ADD_STD, transpose=False, just_return=True)
    # Prefix every index label with the language so rows stay unique once combined.
    df.index = [f"{lang} {i}" for i in df.index]
    print(df.index)
    dfs.append(df.T)

dict_keys(['eu', 'fr', 'hu', 'lt', 'ms', 'no', 'pl', 'sq'])
Index(['eu baseline', 'eu 0.5', 'eu 0.7', 'eu 1.0'], dtype='object')
Index(['fr baseline', 'fr 0.5', 'fr 0.7', 'fr 1.0'], dtype='object')
Index(['hu baseline', 'hu 0.5', 'hu 0.7', 'hu 1.0'], dtype='object')
Index(['lt baseline', 'lt 0.5', 'lt 0.7', 'lt 1.0'], dtype='object')
Index(['ms baseline', 'ms 0.5', 'ms 0.7', 'ms 1.0'], dtype='object')
Index(['no baseline', 'no 0.5', 'no 0.7', 'no 1.0'], dtype='object')
Index(['pl baseline', 'pl 0.5', 'pl 0.7', 'pl 1.0'], dtype='object')
Index(['sq baseline', 'sq 0.5', 'sq 0.7', 'sq 1.0'], dtype='object')


In [9]:
# Display names for the metric columns, in the order of the combined table.
# The first column holds BLEU (listed as `bleu` in the per-language tables),
# not BERT; BERTScore is a separate column further right.
columns = "BLEU ROUGE-1 ROUGE-2 ChrF METEOR BERTScore Length".split()
# dfs holds transposed per-language tables; concatenating along axis=1 and
# transposing back stacks all "<lang> <model>" rows into one table.
megadf = pd.concat(dfs, axis=1).T
megadf.columns = columns
print(megadf.to_latex(float_format="%.2f"))
megadf


\begin{tabular}{lrrrrrrr}
\toprule
 & BERT & ROUGE-1 & ROUGE-2 & ChrF & METEOR & BERTScore & Length \\
\midrule
eu baseline & 2.77 & 0.19 & 0.03 & 24.35 & 0.24 & 0.72 & 3.18 \\
eu 0.5 & 6.44 & 0.27 & 0.06 & 26.79 & 0.30 & 0.75 & 2.46 \\
eu 0.7 & 5.42 & 0.26 & 0.05 & 26.97 & 0.30 & 0.75 & 2.60 \\
eu 1.0 & 3.80 & 0.24 & 0.04 & 26.31 & 0.28 & 0.74 & 2.90 \\
fr baseline & 3.97 & 0.21 & 0.08 & 25.16 & 0.29 & 0.71 & 3.28 \\
fr 0.5 & 9.39 & 0.32 & 0.12 & 28.71 & 0.35 & 0.77 & 1.65 \\
fr 0.7 & 6.66 & 0.29 & 0.11 & 27.91 & 0.33 & 0.75 & 2.17 \\
fr 1.0 & 4.96 & 0.25 & 0.10 & 26.89 & 0.32 & 0.73 & 2.72 \\
hu baseline & 3.07 & 0.22 & 0.09 & 23.42 & 0.26 & 0.69 & 3.51 \\
hu 0.5 & 10.07 & 0.30 & 0.13 & 25.97 & 0.32 & 0.76 & 1.60 \\
hu 0.7 & 6.19 & 0.27 & 0.11 & 24.93 & 0.29 & 0.75 & 2.15 \\
hu 1.0 & 4.52 & 0.24 & 0.10 & 24.11 & 0.28 & 0.73 & 2.73 \\
lt baseline & 2.42 & 0.17 & 0.05 & 20.83 & 0.25 & 0.69 & 3.76 \\
lt 0.5 & 7.59 & 0.26 & 0.08 & 24.33 & 0.31 & 0.77 & 1.76 \\
lt 0.7 & 5.33 & 0.24 & 0.08

Unnamed: 0,BERT,ROUGE-1,ROUGE-2,ChrF,METEOR,BERTScore,Length
eu baseline,2.77,0.19,0.03,24.35,0.24,0.72,3.18
eu 0.5,6.44,0.27,0.06,26.79,0.3,0.75,2.46
eu 0.7,5.42,0.26,0.05,26.97,0.3,0.75,2.6
eu 1.0,3.8,0.24,0.04,26.31,0.28,0.74,2.9
fr baseline,3.97,0.21,0.08,25.16,0.29,0.71,3.28
fr 0.5,9.39,0.32,0.12,28.71,0.35,0.77,1.65
fr 0.7,6.66,0.29,0.11,27.91,0.33,0.75,2.17
fr 1.0,4.96,0.25,0.1,26.89,0.32,0.73,2.72
hu baseline,3.07,0.22,0.09,23.42,0.26,0.69,3.51
hu 0.5,10.07,0.3,0.13,25.97,0.32,0.76,1.6


In [10]:
import sys
sys.path.append("../")
from evaluation_tools import print_metrics

# Hand every per-language metrics frame to print_metrics, one language at a time.
for lang, lang_frame in dataframes.items():
    print_metrics(_lang=lang, _metrics=lang_frame, std=ADD_STD, transpose=False)

Unnamed: 0,bleu,r1,r2,chrF,meteor,bert_f1,len_ratio,score,comp_score
baseline,2.77,0.19,0.03,24.35,0.24,0.72,3.18,52,43
0.5,6.44,0.27,0.06,26.79,0.3,0.75,2.46,67,67
0.7,5.42,0.26,0.05,26.97,0.3,0.75,2.6,64,61
1.0,3.8,0.24,0.04,26.31,0.28,0.74,2.9,59,51



\begin{tabular}{lrrrrrrrrr}
\toprule
 & bleu & r1 & r2 & chrF & meteor & bert_f1 & len_ratio & score & comp_score \\
\midrule
baseline & 2.77 & 0.19 & 0.03 & 24.35 & 0.24 & 0.72 & 3.18 & 52 & 43 \\
0.5 & 6.44 & 0.27 & 0.06 & 26.79 & 0.30 & 0.75 & 2.46 & 67 & 67 \\
0.7 & 5.42 & 0.26 & 0.05 & 26.97 & 0.30 & 0.75 & 2.60 & 64 & 61 \\
1.0 & 3.80 & 0.24 & 0.04 & 26.31 & 0.28 & 0.74 & 2.90 & 59 & 51 \\
\bottomrule
\end{tabular}



Unnamed: 0,bleu,r1,r2,chrF,meteor,bert_f1,len_ratio,score,comp_score
baseline,3.97,0.21,0.08,25.16,0.29,0.71,3.28,53,31
0.5,9.39,0.32,0.12,28.71,0.35,0.77,1.65,65,65
0.7,6.66,0.29,0.11,27.91,0.33,0.75,2.17,60,47
1.0,4.96,0.25,0.1,26.89,0.32,0.73,2.72,57,38



\begin{tabular}{lrrrrrrrrr}
\toprule
 & bleu & r1 & r2 & chrF & meteor & bert_f1 & len_ratio & score & comp_score \\
\midrule
baseline & 3.97 & 0.21 & 0.08 & 25.16 & 0.29 & 0.71 & 3.28 & 53 & 31 \\
0.5 & 9.39 & 0.32 & 0.12 & 28.71 & 0.35 & 0.77 & 1.65 & 65 & 65 \\
0.7 & 6.66 & 0.29 & 0.11 & 27.91 & 0.33 & 0.75 & 2.17 & 60 & 47 \\
1.0 & 4.96 & 0.25 & 0.10 & 26.89 & 0.32 & 0.73 & 2.72 & 57 & 38 \\
\bottomrule
\end{tabular}



Unnamed: 0,bleu,r1,r2,chrF,meteor,bert_f1,len_ratio,score,comp_score
baseline,3.07,0.22,0.09,23.42,0.26,0.69,3.51,53,29
0.5,10.07,0.3,0.13,25.97,0.32,0.76,1.6,64,64
0.7,6.19,0.27,0.11,24.93,0.29,0.75,2.15,58,44
1.0,4.52,0.24,0.1,24.11,0.28,0.73,2.73,55,35



\begin{tabular}{lrrrrrrrrr}
\toprule
 & bleu & r1 & r2 & chrF & meteor & bert_f1 & len_ratio & score & comp_score \\
\midrule
baseline & 3.07 & 0.22 & 0.09 & 23.42 & 0.26 & 0.69 & 3.51 & 53 & 29 \\
0.5 & 10.07 & 0.30 & 0.13 & 25.97 & 0.32 & 0.76 & 1.60 & 64 & 64 \\
0.7 & 6.19 & 0.27 & 0.11 & 24.93 & 0.29 & 0.75 & 2.15 & 58 & 44 \\
1.0 & 4.52 & 0.24 & 0.10 & 24.11 & 0.28 & 0.73 & 2.73 & 55 & 35 \\
\bottomrule
\end{tabular}



Unnamed: 0,bleu,r1,r2,chrF,meteor,bert_f1,len_ratio,score,comp_score
baseline,2.42,0.17,0.05,20.83,0.25,0.69,3.76,51,29
0.5,7.59,0.26,0.08,24.33,0.31,0.77,1.76,64,64
0.7,5.33,0.24,0.08,23.71,0.29,0.75,2.29,61,48
1.0,3.91,0.22,0.06,22.56,0.28,0.73,2.77,56,38



\begin{tabular}{lrrrrrrrrr}
\toprule
 & bleu & r1 & r2 & chrF & meteor & bert_f1 & len_ratio & score & comp_score \\
\midrule
baseline & 2.42 & 0.17 & 0.05 & 20.83 & 0.25 & 0.69 & 3.76 & 51 & 29 \\
0.5 & 7.59 & 0.26 & 0.08 & 24.33 & 0.31 & 0.77 & 1.76 & 64 & 64 \\
0.7 & 5.33 & 0.24 & 0.08 & 23.71 & 0.29 & 0.75 & 2.29 & 61 & 48 \\
1.0 & 3.91 & 0.22 & 0.06 & 22.56 & 0.28 & 0.73 & 2.77 & 56 & 38 \\
\bottomrule
\end{tabular}



Unnamed: 0,bleu,r1,r2,chrF,meteor,bert_f1,len_ratio,score,comp_score
baseline,1.18,0.14,0.02,17.1,0.22,0.71,3.69,43,29
0.5,5.29,0.27,0.06,26.29,0.32,0.77,2.21,65,65
0.7,4.89,0.26,0.06,26.4,0.31,0.76,2.48,65,58
1.0,3.98,0.25,0.05,25.85,0.3,0.75,2.87,61,49



\begin{tabular}{lrrrrrrrrr}
\toprule
 & bleu & r1 & r2 & chrF & meteor & bert_f1 & len_ratio & score & comp_score \\
\midrule
baseline & 1.18 & 0.14 & 0.02 & 17.10 & 0.22 & 0.71 & 3.69 & 43 & 29 \\
0.5 & 5.29 & 0.27 & 0.06 & 26.29 & 0.32 & 0.77 & 2.21 & 65 & 65 \\
0.7 & 4.89 & 0.26 & 0.06 & 26.40 & 0.31 & 0.76 & 2.48 & 65 & 58 \\
1.0 & 3.98 & 0.25 & 0.05 & 25.85 & 0.30 & 0.75 & 2.87 & 61 & 49 \\
\bottomrule
\end{tabular}



Unnamed: 0,bleu,r1,r2,chrF,meteor,bert_f1,len_ratio,score,comp_score
baseline,3.71,0.21,0.07,23.82,0.27,0.71,3.46,52,31
0.5,10.47,0.31,0.11,27.86,0.34,0.78,1.76,65,65
0.7,7.36,0.28,0.1,26.95,0.31,0.76,2.33,60,47
1.0,5.65,0.25,0.09,26.11,0.3,0.74,2.86,57,38



\begin{tabular}{lrrrrrrrrr}
\toprule
 & bleu & r1 & r2 & chrF & meteor & bert_f1 & len_ratio & score & comp_score \\
\midrule
baseline & 3.71 & 0.21 & 0.07 & 23.82 & 0.27 & 0.71 & 3.46 & 52 & 31 \\
0.5 & 10.47 & 0.31 & 0.11 & 27.86 & 0.34 & 0.78 & 1.76 & 65 & 65 \\
0.7 & 7.36 & 0.28 & 0.10 & 26.95 & 0.31 & 0.76 & 2.33 & 60 & 47 \\
1.0 & 5.65 & 0.25 & 0.09 & 26.11 & 0.30 & 0.74 & 2.86 & 57 & 38 \\
\bottomrule
\end{tabular}



Unnamed: 0,bleu,r1,r2,chrF,meteor,bert_f1,len_ratio,score,comp_score
baseline,3.59,0.19,0.07,22.01,0.25,0.71,3.06,57,38
0.5,7.25,0.25,0.1,23.38,0.29,0.75,1.88,66,66
0.7,5.09,0.23,0.09,22.75,0.27,0.74,2.35,61,50
1.0,4.33,0.21,0.08,22.7,0.26,0.73,2.69,59,44



\begin{tabular}{lrrrrrrrrr}
\toprule
 & bleu & r1 & r2 & chrF & meteor & bert_f1 & len_ratio & score & comp_score \\
\midrule
baseline & 3.59 & 0.19 & 0.07 & 22.01 & 0.25 & 0.71 & 3.06 & 57 & 38 \\
0.5 & 7.25 & 0.25 & 0.10 & 23.38 & 0.29 & 0.75 & 1.88 & 66 & 66 \\
0.7 & 5.09 & 0.23 & 0.09 & 22.75 & 0.27 & 0.74 & 2.35 & 61 & 50 \\
1.0 & 4.33 & 0.21 & 0.08 & 22.70 & 0.26 & 0.73 & 2.69 & 59 & 44 \\
\bottomrule
\end{tabular}



Unnamed: 0,bleu,r1,r2,chrF,meteor,bert_f1,len_ratio,score,comp_score
baseline,3.39,0.21,0.07,21.81,0.27,0.71,3.5,54,37
0.5,6.96,0.31,0.11,25.35,0.33,0.77,2.2,66,66
0.7,5.62,0.29,0.1,24.39,0.32,0.75,2.58,62,54
1.0,4.7,0.26,0.09,23.89,0.3,0.74,2.87,59,47



\begin{tabular}{lrrrrrrrrr}
\toprule
 & bleu & r1 & r2 & chrF & meteor & bert_f1 & len_ratio & score & comp_score \\
\midrule
baseline & 3.39 & 0.21 & 0.07 & 21.81 & 0.27 & 0.71 & 3.50 & 54 & 37 \\
0.5 & 6.96 & 0.31 & 0.11 & 25.35 & 0.33 & 0.77 & 2.20 & 66 & 66 \\
0.7 & 5.62 & 0.29 & 0.10 & 24.39 & 0.32 & 0.75 & 2.58 & 62 & 54 \\
1.0 & 4.70 & 0.26 & 0.09 & 23.89 & 0.30 & 0.74 & 2.87 & 59 & 47 \\
\bottomrule
\end{tabular}



In [None]:
# Show the language codes for which a metrics DataFrame exists.
dataframes.keys()

In [12]:
# Take the first language in all_transl and read its keys
# (source, target and one entry per model key).
first_entry = list(all_transl)[0]
translation_keys = all_transl[first_entry].keys()
print(translation_keys)

dict_keys(['source', 'target', 'baseline', '0.5', '0.6', '0.7', '0.8', '0.9', '1.0'])


# Good IDs

## Norwegian (nob)
```
index: 90
source
>>nob<< He is not flesh but spirit. The light of eternal mind.

target
Sinnets evige lys.

baseline
Han er ikke kjøtt, men ånden, lyset til det evige sinnet.

0.5
Det evige sinns lys.

0.6
Det evige sinns lys.

0.7
Det evige sinns lys.

0.8
Han er ikke kjøtt, men ånd, evig sinds lys.

0.9
Han er ikke kjøtt, men ånd, evig sinds lys.

1.0
Han er ikke kjøtt, men ånd, evig sinds lys.

___
index: 279
source
>>nob<< Easy, easy, easy, easy.

target
Rolig...

baseline
Lett, enkelt, enkelt.

0.5
Rolig.
___
index: 843
source
>>nob<< But when he carried you over the threshold, Karl, and he... He laid you down and gently kissed you, didn't he, didn't he say any...

target
Han bar deg over terskelen og kysset deg.

baseline
Men da han bar deg over terskelen, Karl, og han... han la deg ned og forsiktig kysset deg, ikke sant, sa han ikke...

0.5
Men da han bar deg over terskelen, la han deg ned og kysset deg, ikke sant?

0.6
Men da han bar deg over terskelen, la han deg ned og kysset deg forsiktig, sa han ikke...
___


index: 355
source
>>nob<< All right, well...

target
Greit.

baseline
Ok, vel...

0.5
Greit.

0.6
Ok, ok...

0.7
Greit, ja vel...

___


```




In [34]:
import random
def peek(lang: str, index: "int | None" = None):
    """Print one translation sample of ``lang`` for every key in ``translation_keys``.

    Reads the notebook-level ``all_transl`` and ``translation_keys``.

    Args:
        lang: Language key into ``all_transl`` (e.g. ``"no"``).
        index: Position of the sample to show. When omitted, a random position
            within ``all_transl[lang]["source"]`` is drawn, as before. Pass an
            explicit value to revisit a previously noted sample.

    Returns:
        None. The index, then each key followed by its text and a blank line,
        are printed to stdout.
    """
    data = all_transl[lang]
    if index is None:
        index = random.randint(0, len(data["source"]) - 1)
    print("index:", index)

    for key in translation_keys:
        print(key)
        print(data[key][index])
        print()

# Print a sample for the "no" language key.
lang = "no"
peek(lang=lang)

index: 343
source
>>nob<< Your ankle seems to be fine. Just a mild sprain, so this should help. Great.

target
Åh gud.

baseline
Det ser ut til at ankelen din har det fint, bare en mild fordypning, så dette burde hjelpe.

0.5
Ankelen din virker bra.

0.6
Ankelen din virker bra, så dette hjelper.

0.7
Det er bare en mild forsvinning, så dette burde hjelpe.

0.8
Det ser ut til at ankelen din er fin, så dette hjelper.

0.9
Det ser ut til at ankelen din er fin, så dette hjelper.

1.0
Det ser ut til at ankelen din er fin, så dette hjelper.

