# Results inspection
Takes the results from the 5 sub-sample validations and averages + computes the standard deviation of the predictions.

In [1]:
%load_ext autoreload
%autoreload 2

In [12]:
import json
from collections import defaultdict
from typing import Literal
import os

model_types = {
    "single": "singlemodel",
    "token": "tokenmodel",
}

model_type = model_types["single"]
# model_type = model_types["token"]
print(model_type)

# fetch all the related folders (dataset_{single/token}model.N)
datasets = {
    "europarl": "europarl",
    "opensub": "opensub",
    "tatoeba": "tatoeba",
}

dataset = datasets["opensub"]
dataset = datasets["europarl"]

RESULTS_FOLDER = "results_DEC2024"
RESULTS_FOLDER = "results_FINALXXX"
# RESULTS_FOLDER = "results_ORIGINAL"


folder_id = f"{dataset}_{model_type}"
folders = [f for f in os.listdir(os.path.join("..", RESULTS_FOLDER, dataset)) if dataset in f and ".json" not in f]
folders = sorted(folders, key=lambda x: int(x.split(".")[-1]))
print(folders)
def get_combined(_type: Literal["metrics", "translations"]="metrics"):
    combined = {}
    for folder in folders:
        if model_type not in folder:
            continue
        print(f"Fetching from {folder}")
        folder = os.path.join("..", RESULTS_FOLDER, dataset, folder)
        for files in sorted(os.listdir(folder)):
            if _type not in files:
                continue
            lang = files.split(".")[0]
            if lang not in combined:
                combined[lang] = {}
            
            with open(f"{folder}/{files}") as f:
                data = json.load(f)
                # baseline, 0.5, 0.6, ..., 1.0

                if _type == "metrics":
                    for model_key, metric_values in data.items():
                        if model_key not in combined[lang]:
                            combined[lang][model_key] = defaultdict(list)

                        for metric_key, metric_value in metric_values.items():
                            combined[lang][model_key][metric_key].append(metric_value)
                if _type == "translations":
                    for model_key, translation_values in data.items():
                        combined[lang][model_key] = translation_values
    return combined

all_metrics = get_combined("metrics")
all_transl = get_combined("translations")

singlemodel
['europarl_singlemodel.0', 'europarl_tokenmodel.0', 'europarl_tokenmodel.1', 'europarl_singlemodel.1', 'europarl_tokenmodel.2', 'europarl_singlemodel.2', 'europarl_tokenmodel.3', 'europarl_singlemodel.3', 'europarl_tokenmodel.4', 'europarl_singlemodel.4']
Fetching from europarl_singlemodel.0
Fetching from europarl_singlemodel.1
Fetching from europarl_singlemodel.2
Fetching from europarl_singlemodel.3
Fetching from europarl_singlemodel.4
Fetching from europarl_singlemodel.0
Fetching from europarl_singlemodel.1
Fetching from europarl_singlemodel.2
Fetching from europarl_singlemodel.3
Fetching from europarl_singlemodel.4


In [13]:
import pandas as pd
import numpy as np
import sys
sys.path.append("../")
from evaluation_tools import print_metrics
# calc the mean + std for each list of the N metric values

ADD_STD = True
model_types_to_visualize = ["baseline", "0.5", "0.7", "1.0"]
ignored_metrics = ["chrf++", "rl",]
metric_multipliers = {
    # "bleu": 1,
    # "r1": 100,
    # "r2": 100,
    # "rl": 100,
    # "chrf": 1,
    # "meteor": 100,
    # "bert_f1": 100,
}

presentable_metrics = {lang: {} for lang in all_metrics.keys()}
for lang, metrics in all_metrics.items():
    for model_type, model_values in metrics.items():
        # model type is the compressoin level
        if model_type not in model_types_to_visualize:
            continue
        # print(model_type)
        presentable_metrics[lang][model_type] = {}
        for metrics, metric_values in model_values.items():
            multiplier = metric_multipliers.get(metrics, 1)
            
            if metrics in ignored_metrics:
                continue
            values = np.array(metric_values) * multiplier
            mean = np.mean(values)
            std = np.std(values)
            # std_as_percentage = std / mean * 100
            #value_str = f"{mean:.2f} ± {std:.3f}"
            if ADD_STD:
                value_str = f"{mean:.2f} ({std:.2f})"
                #presentable_metrics[lang][model_type][metrics] = (mean.round(2), std.round(3))
                presentable_metrics[lang][model_type][metrics] = value_str
            else:
                presentable_metrics[lang][model_type][metrics] = float(mean.round(2))
                

dataframes = {}

for lang, metrics in presentable_metrics.items():
    dataframes[lang] = pd.DataFrame(metrics)
print(dataframes.keys())

dfs = []
for lang in dataframes.keys():
    df = print_metrics(_lang=lang, _metrics=dataframes[lang], std=ADD_STD, transpose=False, just_return=True)
    # for all the indexes, prefix language
    df.index = [f"{lang} {i}" for i in df.index]
    print(df.index)
    dfs.append(df.T)

columns = "BERT ROUGE-1 ROUGE-2 ChrF METEOR BERTScore Length".split()
megadf = pd.concat(dfs, axis=1).T
megadf.columns = columns
print(megadf.to_latex(float_format="%.2f"))
megadf

dict_keys(['fr', 'hu', 'lt', 'pl'])
Index(['fr baseline', 'fr 0.5', 'fr 0.7', 'fr 1.0'], dtype='object')
Index(['hu baseline', 'hu 0.5', 'hu 0.7', 'hu 1.0'], dtype='object')
Index(['lt baseline', 'lt 0.5', 'lt 0.7', 'lt 1.0'], dtype='object')
Index(['pl baseline', 'pl 0.5', 'pl 0.7', 'pl 1.0'], dtype='object')
\begin{tabular}{llllllll}
\toprule
 & BERT & ROUGE-1 & ROUGE-2 & ChrF & METEOR & BERTScore & Length \\
\midrule
fr baseline & 36.03 (1.17) & 0.65 (0.01) & 0.46 (0.02) & 63.64 (0.64) & 0.61 (0.01) & 0.88 (0.00) & 1.08 \\
fr 0.5 & 3.26 (0.92) & 0.18 (0.01) & 0.04 (0.01) & 30.44 (1.07) & 0.16 (0.01) & 0.80 (0.01) & 0.95 \\
fr 0.7 & 3.36 (0.88) & 0.18 (0.01) & 0.04 (0.01) & 31.03 (0.78) & 0.16 (0.01) & 0.80 (0.01) & 0.99 \\
fr 1.0 & 3.23 (0.94) & 0.19 (0.01) & 0.04 (0.01) & 31.70 (0.88) & 0.16 (0.01) & 0.80 (0.00) & 1.05 \\
hu baseline & 27.86 (1.79) & 0.64 (0.01) & 0.44 (0.01) & 60.85 (1.14) & 0.54 (0.02) & 0.86 (0.01) & 1.03 \\
hu 0.5 & 1.68 (0.85) & 0.11 (0.01) & 0.03 (0.01) & 18.

Unnamed: 0,BERT,ROUGE-1,ROUGE-2,ChrF,METEOR,BERTScore,Length
fr baseline,36.03 (1.17),0.65 (0.01),0.46 (0.02),63.64 (0.64),0.61 (0.01),0.88 (0.00),1.08
fr 0.5,3.26 (0.92),0.18 (0.01),0.04 (0.01),30.44 (1.07),0.16 (0.01),0.80 (0.01),0.95
fr 0.7,3.36 (0.88),0.18 (0.01),0.04 (0.01),31.03 (0.78),0.16 (0.01),0.80 (0.01),0.99
fr 1.0,3.23 (0.94),0.19 (0.01),0.04 (0.01),31.70 (0.88),0.16 (0.01),0.80 (0.00),1.05
hu baseline,27.86 (1.79),0.64 (0.01),0.44 (0.01),60.85 (1.14),0.54 (0.02),0.86 (0.01),1.03
hu 0.5,1.68 (0.85),0.11 (0.01),0.03 (0.01),18.40 (0.54),0.11 (0.01),0.70 (0.01),1.14
hu 0.7,2.16 (1.02),0.15 (0.01),0.05 (0.01),20.19 (0.72),0.13 (0.01),0.71 (0.00),1.23
hu 1.0,3.62 (1.47),0.21 (0.01),0.08 (0.01),25.64 (1.36),0.17 (0.01),0.71 (0.00),1.38
lt baseline,32.02 (2.33),0.61 (0.01),0.40 (0.01),64.14 (1.20),0.58 (0.01),0.87 (0.00),1.1
lt 0.5,4.57 (0.50),0.17 (0.01),0.06 (0.01),27.92 (0.65),0.18 (0.01),0.76 (0.00),1.05


In [None]:
# import sys
# sys.path.append("../")
# from evaluation_tools import print_metrics

# for lang in dataframes.keys():
#     print_metrics(_lang=lang, _metrics=dataframes[lang], std=False, transpose=False)

In [None]:
dataframes.keys()

In [None]:
first_entry = list(all_transl.keys())[0]
translation_keys = all_transl[first_entry].keys()
print(translation_keys)

# Good IDs

## norwegian nob
```
index: 90
source
>>nob<< He is not flesh but spirit. The light of eternal mind.

target
Sinnets evige lys.

baseline
Han er ikke kjøtt, men ånden, lyset til det evige sinnet.

0.5
Det evige sinns lys.

0.6
Det evige sinns lys.

0.7
Det evige sinns lys.

0.8
Han er ikke kjøtt, men ånd, evig sinds lys.

0.9
Han er ikke kjøtt, men ånd, evig sinds lys.

1.0
Han er ikke kjøtt, men ånd, evig sinds lys.

___
index: 279
source
>>nob<< Easy, easy, easy, easy.

target
Rolig...

baseline
Lett, enkelt, enkelt.

0.5
Rolig.
___
index: 843
source
>>nob<< But when he carried you over the threshold, Karl, and he... He laid you down and gently kissed you, didn't he, didn't he say any...

target
Han bar deg over terskelen og kysset deg.

baseline
Men da han bar deg over terskelen, Karl, og han... han la deg ned og forsiktig kysset deg, ikke sant, sa han ikke...

0.5
Men da han bar deg over terskelen, la han deg ned og kysset deg, ikke sant?

0.6
Men da han bar deg over terskelen, la han deg ned og kysset deg forsiktig, sa han ikke...
___


index: 355
source
>>nob<< All right, well...

target
Greit.

baseline
Ok, vel...

0.5
Greit.

0.6
Ok, ok...

0.7
Greit, ja vel...

___


```




In [None]:
import random
def peek(lang: str):
    data = all_transl[lang]
    index = random.randint(0, len(data["source"]) - 1)
    print("index:", index)

    sample = {}
    for key in translation_keys:
        print(key)
        print(data[key][index])
        print()
        sample[key] = data[key][index]
    
    #return sample

lang = "no"
peek(lang)