# Dependencies

In [None]:
!pip install seaborn matplotlib pandas numpy

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import json

# Training Experiment 1

In [None]:
# Experiment 1: loss curves for every training-set size, one panel per size.
root = "results/training_trainsize/"
plot_path = "results/plots/training_trainsize.pdf"

# The file name states which split a curve belongs to, so the legend text and
# colour are fixed per split instead of depending on file order or row count.
split_style = {"train": ("Train", "C1"), "eval": ("Validation", "C0")}

data = {}    # train size (string) -> list of loss DataFrames
styles = {}  # train size (string) -> list of (label, colour), parallel to data[size]
# os.listdir returns entries in arbitrary order; sort for reproducible output.
for f in sorted(os.listdir(root)):
    if not f.endswith(".csv"):
        continue
    name = "".join(f.split("_loss_")).split(".csv")[0]
    if "train" in name:
        split = "train"
    elif "eval" in name:
        split = "eval"
    else:
        # Not a recognised loss file: skip it rather than filing it under the
        # size left over from the previous iteration.
        continue
    size = name.split(split)[1]
    df = pd.read_csv(os.path.join(root, f)).drop(columns=["Wall time"])
    data.setdefault(size, []).append(df)
    styles.setdefault(size, []).append(split_style[split])
# sort by train size
data = dict(sorted(data.items(), key=lambda item: int(item[0])))

fig, ax = plt.subplots(2, 3, figsize=(12, 8), sharey=True, sharex=False)
for idx, (size, curves) in enumerate(data.items()):
    row, col = divmod(idx, 3)
    panel = ax[row][col]
    for df, (label, color) in zip(curves, styles[size]):
        panel.plot(df["Step"], df["Value"], label=label, color=color)
    panel.set_title(f"Train size: {size}%")
    panel.grid(True, axis='y')

# Drop the grid cells that received no curves (the bottom-right one when
# there are five train sizes).
for unused in ax.flat[len(data):]:
    unused.remove()

ax[0][0].set_ylabel("Loss")
ax[1][0].set_ylabel("Loss")
ax[1][0].set_xlabel("Step")
# The top-right panel has no panel beneath it, so it carries its own x label.
ax[0][2].set_xlabel("Step")
ax[0][0].set_xticklabels([])
ax[0][1].set_xticklabels([])
ax[1][1].set_xlabel("Step")
ax[0][2].legend()
fig.suptitle("Loss over time per train size")
# The y axis is shared, so setting the limit on one panel applies to all.
ax[0][0].set_ylim(bottom=0)
fig.tight_layout()
# savefig does not create missing directories.
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path)
plt.show()

# Training Experiment 2

In [None]:
# Experiment 2: load every loss CSV, grouped by run name.
root = "results/training_finetune_scratch/"

# Run name (file name without the "_loss" marker and the "_train"/"_eval"
# suffix) -> list of loss DataFrames belonging to that run.
data = {}
# os.listdir returns entries in arbitrary order; sort so that the insertion
# order of `data` is reproducible between runs and machines.
for f in sorted(os.listdir(root)):
    if not f.endswith(".csv"):
        continue
    name = "".join(f.split("_loss")).split(".csv")[0]
    if "train" in name:
        run_name = name.split("_train")[0]
    elif "eval" in name:
        run_name = name.split("_eval")[0]
    else:
        # Not a recognised loss file: skip it rather than filing it under the
        # run left over from the previous iteration.
        continue
    df = pd.read_csv(os.path.join(root, f)).drop(columns=["Wall time"])
    data.setdefault(run_name, []).append(df)

In [None]:
# Loss curves of the "dialect_*" runs (no pre-training), one panel per run.
plot_path = "results/plots/training_scratch.pdf"

# Sort by run name so the panel order does not depend on directory listing order.
sub_data = {k: data[k] for k in sorted(data) if k.startswith("dialect")}

fig, ax = plt.subplots(2, 3, figsize=(12, 8), sharey=True)
for idx, (run_name, curves) in enumerate(sub_data.items()):
    row, col = divmod(idx, 3)
    panel = ax[row][col]
    dialect = run_name.split("_")[1].title()
    for df in curves:
        # Heuristic: a series with more than five points is treated as the
        # training curve (presumably validation is logged far less often --
        # TODO confirm against the logging setup).
        is_train = len(df) > 5
        panel.plot(
            df["Step"],
            df["Value"],
            label="Train" if is_train else "Validation",
            color="C1" if is_train else "C0",
        )
    panel.set_title(f"Dialect: {dialect}")
    panel.grid(True, axis='y')

# Drop the grid cells that received no curves (the bottom-right one when
# there are five runs).
for unused in ax.flat[len(sub_data):]:
    unused.remove()

ax[0][0].set_ylabel("Loss")
ax[1][0].set_ylabel("Loss")
ax[1][0].set_xlabel("Step")
# The top-right panel has no panel beneath it, so it carries its own x label.
ax[0][2].set_xlabel("Step")
ax[1][1].set_xlabel("Step")
ax[0][1].set_xticklabels([])
ax[0][0].set_xticklabels([])
ax[0][2].legend()

# The y axis is shared, so setting the limit on one panel applies to all.
ax[0][0].set_ylim(bottom=0)
fig.suptitle("Loss over time per dialect (without pre-training)")
fig.tight_layout()
# savefig does not create missing directories.
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path)
plt.show()

In [None]:
# Loss curves of the "finetune_*" runs (with pre-training), one panel per run.
plot_path = "results/plots/training_finetuning.pdf"

# Sort by run name so the panel order does not depend on directory listing order.
sub_data = {k: data[k] for k in sorted(data) if k.startswith("finetune")}

fig, ax = plt.subplots(2, 3, figsize=(12, 8), sharey=True)
for idx, (run_name, curves) in enumerate(sub_data.items()):
    row, col = divmod(idx, 3)
    panel = ax[row][col]
    dialect = run_name.split("_")[1].title()
    for df in curves:
        # Heuristic: a series with more than five points is treated as the
        # training curve (presumably validation is logged far less often --
        # TODO confirm against the logging setup).
        is_train = len(df) > 5
        panel.plot(
            df["Step"],
            df["Value"],
            label="Train" if is_train else "Validation",
            color="C1" if is_train else "C0",
        )
    panel.set_title(f"Dialect: {dialect}")
    panel.grid(True, axis='y')

# Drop the grid cells that received no curves (the bottom-right one when
# there are five runs).
for unused in ax.flat[len(sub_data):]:
    unused.remove()

ax[0][0].set_ylabel("Loss")
ax[1][0].set_ylabel("Loss")
ax[1][0].set_xlabel("Step")
# The top-right panel has no panel beneath it, so it carries its own x label.
ax[0][2].set_xlabel("Step")
ax[1][1].set_xlabel("Step")
ax[0][1].set_xticklabels([])
ax[0][0].set_xticklabels([])
ax[0][2].legend()
# The y axis is shared, so setting the limit on one panel applies to all.
ax[0][0].set_ylim(bottom=0)
fig.suptitle("Loss over time per dialect (with pre-training)")
fig.tight_layout()
# savefig does not create missing directories.
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path)
plt.show()

# Training Experiment 3

In [None]:
# Experiment 3: load every loss CSV, grouped by run name.
root = "results/training_all/"

# Run name (file name without the "_loss" marker and the "_train"/"_eval"
# suffix) -> list of loss DataFrames belonging to that run.
data = {}
# os.listdir returns entries in arbitrary order; sort so that the insertion
# order of `data` is reproducible between runs and machines.
for f in sorted(os.listdir(root)):
    if not f.endswith(".csv"):
        continue
    name = "".join(f.split("_loss")).split(".csv")[0]
    if "train" in name:
        run_name = name.split("_train")[0]
    elif "eval" in name:
        run_name = name.split("_eval")[0]
    else:
        # Not a recognised loss file: skip it rather than filing it under the
        # run left over from the previous iteration.
        continue
    df = pd.read_csv(os.path.join(root, f)).drop(columns=["Wall time"])
    data.setdefault(run_name, []).append(df)

In [None]:
# Experiment 3: loss curves for the pooled dialect data, one panel per run.
plot_path = "results/plots/training_all.pdf"

fig, ax = plt.subplots(1, 2, figsize=(12, 6), sharey=True)
# Sort by run name so the panel order does not depend on directory listing order.
for idx, run_name in enumerate(sorted(data)):
    panel = ax[idx]
    # Runs whose name starts with "finetune" are titled as pre-trained;
    # every other run is titled as trained without pre-training.
    if run_name.split("_")[0].title() == "Finetune":
        training = "With pre-training"
    else:
        training = "Without pre-training"
    for df in data[run_name]:
        # Heuristic: a series with more than five points is treated as the
        # training curve (presumably validation is logged far less often --
        # TODO confirm against the logging setup).
        is_train = len(df) > 5
        panel.plot(
            df["Step"],
            df["Value"],
            label="Train" if is_train else "Validation",
            color="C1" if is_train else "C0",
        )
    panel.set_title(f"{training}")
    panel.grid(True, axis='y')
    panel.set_xlabel("Step")

ax[0].set_ylabel("Loss")
ax[1].legend()
fig.suptitle("Loss over time for pooled dialect data")
# The y axis is shared, so setting the limit on one panel applies to both.
ax[0].set_ylim(bottom=0)
fig.tight_layout()
# savefig does not create missing directories.
os.makedirs(os.path.dirname(plot_path), exist_ok=True)
fig.savefig(plot_path)
plt.show()