# Dependencies

In [None]:
!pip install seaborn matplotlib pandas numpy

In [None]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import json

# Experiment 1: The Effect of MSA Training-Set Size

In [None]:
root = "results/ex_trainsize/"
file_marker = "results_common_voice_"

# Collect one result dict per run, keyed by the run name embedded in the file name.
data = {}
for fname in os.listdir(root):
    # Skip anything that is not a result file; without the marker check the
    # name extraction below would raise IndexError on unrelated .json files.
    if not fname.endswith(".json") or file_marker not in fname:
        continue
    name = fname.split(file_marker)[1].split(".json")[0]
    with open(os.path.join(root, fname), "r", encoding="utf-8") as file:
        data[name] = json.load(file)

# Rows = runs (ordered from no fine-tuning to full data, then the large model),
# columns = WER / CER. The runtime/loss entries are not metrics of interest here.
df_trainsize = (
    pd.DataFrame(data)
    .drop(["eval_samples_per_second", "eval_steps_per_second", "eval_runtime", "eval_loss"])
    .loc[:, ["whisper-small", "0.2", "0.4", "0.6", "0.8", "1.0", "whisper-large-v3"]]
    .T
    .rename(
        index={"whisper-small": "whisper-small (0%)", "0.2": "20%", "0.4": "40%", "0.6": "60%", "0.8": "80%", "1.0": "100%"},
        columns={"eval_wer": "WER", "eval_cer": "CER"},
    )
)
df_trainsize

In [None]:
# One bar per row of df_trainsize, with the WER and CER columns stacked inside each bar.
trainsize_ax = df_trainsize.plot(kind='bar', stacked=True, figsize=(10, 5), rot=0, colormap='Set2')
trainsize_ax.set_title("Model performance on MSA dataset with different training sizes", fontweight='bold')
trainsize_ax.set_ylabel("Percentage")
trainsize_ax.set_xlabel("Percentage of MSA training data used")
trainsize_ax.legend(title="Metric")
trainsize_ax.grid(axis='y')
trainsize_ax.set_yticks(np.arange(0, 101, 10))

trainsize_fig = trainsize_ax.get_figure()
trainsize_fig.tight_layout()
trainsize_fig.savefig("results/plots/ex_trainsize.pdf")
plt.show()

# Experiment 2: Comparison with and without pre-training

In [None]:
def load_results(root: str, prefix: str) -> pd.DataFrame:
    """Load every JSON result file in ``root`` into one WER/CER table and export it to LaTeX.

    Files are expected to be named ``...results_whisper-small-{prefix}_<name>.json``;
    ``<name>`` (title-cased) becomes the model label. A ``.json`` file whose name
    does not contain that marker raises ``IndexError`` at the name extraction.

    Args:
        root: Directory containing the result files. It must contain a ``/``,
            because the LaTeX label is built from its second ``/``-separated part.
        prefix: Variant tag that appears in the file names (e.g. ``"finetune"``).

    Returns:
        DataFrame indexed by ``(metric, model)`` with all ``"WER"`` rows first and
        all ``"CER"`` rows second, the ``"All"`` model leading each group. Columns
        are the title-cased top-level JSON keys (``"Msa"`` renamed to ``"MSA"``);
        values are floats rounded to two decimals.

    Side effects:
        Writes the table to ``f"{root}.tex"``.
    """
    data = {}
    dfs = []
    files = os.listdir(root)
    # Sort so the per-file frames are concatenated in a deterministic order.
    files.sort()
    for f in files:
        if not f.endswith(".json"):
            continue
        # Model name = text between the file-name marker and the ".json" suffix.
        name = f.split(f"results_whisper-small-{prefix}_")[1].split(".json")[0]
        with open(os.path.join(root, f), "r") as file:
            data[name] = json.load(file)
        # presumably each JSON maps test-set name -> dict of eval_* values; verify against the result files.
        # The runtime/loss rows are dropped; the later "_" split assumes only eval_wer/eval_cer-style rows remain.
        df = pd.DataFrame(data[name]).drop(["eval_samples_per_second", "eval_steps_per_second", "eval_runtime", "eval_loss"])
        # Carry the model label along as an extra (last) column so it survives the concat.
        df["model"] = name.title()
        df.columns = df.columns.str.title()
        df.rename(columns={"Msa": "MSA"}, inplace=True)
        dfs.append(df)

    # After the transpose: one column per (file, metric) pair; rows are the JSON keys plus "Model".
    df = pd.concat(dfs).T
    # "eval_wer" -> ("eval", "wer"); the constant "eval" level is then discarded.
    df.columns = pd.MultiIndex.from_tuples([tuple(c.split("_")) for c in df.columns])
    df.columns = df.columns.droplevel(0)
    # Relies on the "Model" row being last: its values become the outer column level.
    df.columns = pd.MultiIndex.from_tuples([(df.iloc[-1].iloc[i], c) for i, c in enumerate(df.columns)], names=["model", "metric"])
    # The label row has served its purpose; remove it so only numeric rows remain.
    df = df.drop(df.index[-1])
    df = df.T
    # Split by metric and re-stack so the metric becomes the outer index level.
    df_wer = df.xs("wer", level="metric")
    df_cer = df.xs("cer", level="metric")
    df = pd.concat([df_wer, df_cer], keys=["WER", "CER"], names=["metric"])
    df = df.astype(float).round(2)
    # Put the "All" model first within each metric; the rest keep the index-level order.
    df = df.reindex([("WER", "All")] + [("WER", k) for k in list(df.index.levels[1]) if k != "All"] + [("CER", "All")] + [("CER", k) for k in list(df.index.levels[1]) if k != "All"])
    df.to_latex(f"{root}.tex", multirow=True, multicolumn=True, multicolumn_format="c", bold_rows=True, caption="Model performance on each test set", label=f"tab:{root.split('/')[1]}", float_format="%.2f")
    return df

In [None]:
# Load both variants (each call also writes its own LaTeX table).
df_scratch = load_results("results/ex_scratch", "dialect")
df_finetune = load_results("results/ex_finetune", "finetune")

# Stack the two tables under a "Pre-Training" level: With = ex_finetune, Without = ex_scratch.
df_full = pd.concat([df_finetune, df_scratch], keys=["With", "Without"], names=["Pre-Training"])
df_full.index.names = ["Pre-Training", "Metric", "Train set"]

# The table holds both variants, so the caption must not claim "without pre-training" only.
df_full.to_latex("results/table.tex", multirow=True, multicolumn=True, multicolumn_format="c", bold_rows=True, caption="Model performance on each test set with and without pre-training", label="tab:all_res", float_format="%.2f")
df_full

In [None]:
# One panel per column of df_full; the "MSA" column is looked up under the "All" model row.
names = ["All" if col == "MSA" else col for col in df_full.columns]

for res in ("WER", "CER"):
    fig, ax = plt.subplots(2, 3, figsize=(12, 8), sharey=True)
    finetune_res = df_finetune.loc[res]
    scratch_res = df_scratch.loc[res]

    for idx, name in enumerate(names):
        # Fill the 2x3 grid row by row.
        row, col = divmod(idx, 3)
        panel = ax[row][col]
        df_new = pd.concat(
            [finetune_res.loc[[name]], scratch_res.loc[[name]]],
            keys=["With", "Without"],
            names=["Pre-Training"],
        ).T
        df_new.plot(kind='bar', ax=panel, colormap='Set2', rot=45)
        panel.legend().remove()
        panel.set_title(f"Dialect: {name}")
        panel.grid(True, axis='y')

    # Only the left-most panels carry a y label; only the top-left one carries the legend.
    for row in range(2):
        ax[row][0].set_ylabel(f"{res} (%)")
    ax[0][0].legend(ncol=1, fancybox=True, shadow=True, title="Pre-Training", loc="upper left", labels=["With", "Without"])

    # Tick labels and the x label are shown on the bottom row only.
    for col in range(3):
        ax[1][col].set_xlabel("Dialect (Test set)")
        ax[0][col].set_xticklabels([])

    plt.ylim(bottom=0)
    fig.suptitle("Model performance on each test set with and without pre-training", fontweight='bold')
    plt.tight_layout()

    plt.savefig(f"results/plots/ex_comparison_{res.lower()}.pdf")
    plt.show()

## Statistical tests

In [None]:
from scipy import stats

# WER of every trained model on every test set; the 'Whisper-Small' row is excluded.
scratch_dist = df_scratch.loc['WER'].drop('Whisper-Small')
finetune_dist = df_finetune.loc['WER'].drop('Whisper-Small')

# Column-wise (per test set) summary statistics; pandas uses the sample std (ddof=1).
mean_scratch, std_scratch = scratch_dist.mean(), scratch_dist.std()
mean_finetune, std_finetune = finetune_dist.mean(), finetune_dist.std()

# Paired Wilcoxon signed-rank test, one test per column (axis=0 default).
# The returned statistic is the signed-rank sum W, not a z-score, so it is labelled as such.
w_stat, p_value = stats.wilcoxon(scratch_dist, finetune_dist)

test_df = pd.DataFrame({
    'Mean Without': mean_scratch,
    'Std Without': std_scratch,
    'Mean With': mean_finetune,
    'Std With': std_finetune,
    'W-statistic': w_stat,
    'p-value': p_value,
}, index=mean_scratch.index).round(2)

# Collapse mean and std into a single "mean ± std" text column per variant.
test_df['Mean Without'] = test_df['Mean Without'].astype(str) + ' ± ' + test_df['Std Without'].astype(str)
test_df['Mean With'] = test_df['Mean With'].astype(str) + ' ± ' + test_df['Std With'].astype(str)
test_df = (
    test_df
    .drop(columns=['Std Without', 'Std With'])
    .rename(columns={'Mean Without': 'Without pre-training', 'Mean With': 'With pre-training'})
)
test_df.to_latex("results/wilcoxon.tex", bold_rows=True, caption="Wilcoxon results comparing the performance of models with and without pre-training", label="tab:wilcoxon", float_format="%.2f")
test_df

In [None]:
# Pool all (model, test set) cells into one paired sample per variant.
scratch_all_dist = scratch_dist.values.flatten()
finetune_all_dist = finetune_dist.values.flatten()

# wilcoxon returns the signed-rank statistic W (not a z-score).
w_stat, p_value = stats.wilcoxon(scratch_all_dist, finetune_all_dist)
print(f"W-statistic: {w_stat:.2f}, p-value: {p_value:.2f}")
print(f"Mean without pre-training: {scratch_all_dist.mean():.2f}, Mean with pre-training: {finetune_all_dist.mean():.2f}")
# ddof=1 matches the sample standard deviation pandas reports in the per-test-set table
# (numpy defaults to the population std, ddof=0).
print(f"Standard deviation without pre-training: {scratch_all_dist.std(ddof=1):.2f}, Standard deviation with pre-training: {finetune_all_dist.std(ddof=1):.2f}")
print(f"Degrees of freedom: {finetune_all_dist.size - 1}")

In [None]:
import matplotlib.patches as mpatches
import matplotlib

fig, ax = plt.subplots(figsize=(12, 10))
cmap = matplotlib.colormaps['Set2']
colors = [cmap(i) for i in range(2)]

# A box plot draws one box per *column* (test set), so the positions must be
# sized by the number of columns, not by the number of rows (models).
n_test_sets = scratch_dist.shape[1]
without_positions = np.arange(0, n_test_sets * 2, 2)
with_positions = without_positions + 0.5  # each "With" box sits next to its "Without" box

scratch_dist.plot(kind='box', ax=ax, positions=without_positions, showmeans=True, meanline=True, patch_artist=True, boxprops=dict(facecolor=colors[0]), label="Without")
finetune_dist.plot(kind='box', ax=ax, positions=with_positions, showmeans=True, meanline=True, patch_artist=True, boxprops=dict(facecolor=colors[1]), label="With")

# Each plot call sets its own tick labels; replace them with one labelled tick per test set.
ax.set_xticks(np.arange(0.5, n_test_sets * 2, 2))
ax.set_xticklabels(scratch_dist.columns)
ax.set_ylabel("WER (%)")
ax.set_xlabel("Dialect")
ax.set_title("Model performance on each test set with and without pre-training", fontweight='bold')
ax.grid(axis='y')

# Single legend built from explicit colour patches (box artists give no usable handles here).
ax.legend(
    handles=[mpatches.Patch(facecolor=colors[0], label="Without"), mpatches.Patch(facecolor=colors[1], label="With")],
    loc='upper right', ncol=1, fancybox=True, shadow=True, title="Pre-Training",
)
fig.tight_layout()
fig.savefig("results/plots/ex_boxplot_within.pdf")
plt.show()

# Experiment 3: Dialectal Arabic Fine-Tuning

In [None]:
# Reload both result tables for this experiment (each call also rewrites its LaTeX export).
df_finetune = load_results(root="results/ex_finetune", prefix="finetune")
df_scratch = load_results(root="results/ex_scratch", prefix="dialect")

In [None]:
# For each dialect model: mean WER over the other test sets
# (the MSA column and the model's own dialect column are left out).
wer_finetune = df_finetune.loc["WER"]
for dialect in ("Egyptian", "Levantine", "Gulf", "Iraqi", "Maghrebi"):
    other_sets = wer_finetune.drop(columns=["MSA", dialect])
    new_df = other_sets.loc[dialect]
    print(dialect, new_df.mean().round(2))


In [None]:
def plot_cm(df, metric, pretraining):
    """Draw and save a heatmap of one metric: training dialect (rows) vs. test dialect (columns).

    Args:
        df: Result table indexed by ``(metric, model)`` with one column per test
            set, as produced by ``load_results``. It must contain the models
            ``"All"`` and ``"Whisper-Small"`` and the column ``"MSA"``; all three
            are excluded from the heatmap.
        metric: Outer index label to plot (``"WER"`` or ``"CER"``).
        pretraining: Word inserted in the title; ``"with"`` selects the
            ``finetune`` output file name, any other value selects ``scratch``.

    Raises:
        KeyError: If an excluded label is missing, or if a remaining model has
            no test-set column of the same name.

    Side effects:
        Saves ``results/plots/ex_{finetune|scratch}_cm_{metric}.pdf`` and shows the figure.
    """
    fig, ax = plt.subplots(1, 1, figsize=(8, 6))
    cm_df = df.loc[metric].drop(index=["All", "Whisper-Small"]).drop(columns=["MSA"])
    # Order the test-set columns like the model rows so the diagonal is "train == test".
    # Selecting by label (instead of overwriting the column names) keeps every value
    # under its true test set even when the two orders differ.
    cm_df = cm_df[list(cm_df.index)]

    sns.heatmap(cm_df, annot=True, cmap='viridis_r', fmt='g', cbar=True, ax=ax)
    ax.set_title(f"Model performance in {metric} (%) against each dialect {pretraining} pre-training")
    ax.set_ylabel("Dialect fine-tuned on")
    ax.set_xlabel("Dialect tested on")
    plt.setp(ax.get_xticklabels(), rotation=45)
    plt.setp(ax.get_yticklabels(), rotation=0)
    fig.tight_layout()
    fig.savefig(f"results/plots/ex_{'finetune' if pretraining == 'with' else 'scratch'}_cm_{metric.lower()}.pdf")
    plt.show()


In [None]:
# One heatmap per (variant, metric): fine-tuned results first, then from-scratch results.
for results_table, pretraining_label in ((df_finetune, "with"), (df_scratch, "without")):
    for metric_name in ("WER", "CER"):
        plot_cm(results_table, metric_name, pretraining_label)

In [None]:
def plot_results(df, title, filename):
    """Plot grouped WER (top) and CER (bottom) bars for every model and save the figure.

    Bars are grouped by the row labels of ``df.loc[metric]`` with one bar per
    column. A bar is hatched when its row label equals its column label.

    Args:
        df: Result table whose outer index level contains ``"WER"`` and ``"CER"``.
        title: Figure-level title.
        filename: Path passed to ``plt.savefig``.
    """
    fig, ax = plt.subplots(2, 1, figsize=(15, 10), sharex=True, sharey=False)
    for row, res in enumerate(["WER", "CER"]):
        panel = ax[row]
        metric_df = df.loc[res]
        # The top (WER) panel gets a taller y range than the bottom (CER) panel.
        y_top = 120 + 40 * (row == 0)

        bar_axes = metric_df.plot(kind='bar', ax=panel, colormap='Set2', rot=0)
        panel.set_ylabel(f"{res} (%)")
        panel.set_ylim(0, y_top)
        panel.set_yticks(np.arange(0, y_top, 20))
        panel.grid(axis='y')

        # Each container holds the bars of one column; its label is the column name.
        for container in bar_axes.containers:
            column_label = container.get_label()
            for pos, patch in enumerate(container.get_children()):
                if metric_df.index[pos] == column_label:
                    patch.set(hatch='//')

    ax[1].set_xlabel("Dialect(s) trained on")
    ax[0].legend(loc='upper right', bbox_to_anchor=(1.1, 1.02), ncol=1, fancybox=True, shadow=True, title="Test set")
    ax[1].legend().remove()
    fig.suptitle(title, fontweight='bold')
    plt.tight_layout()
    plt.savefig(filename)
    plt.show()

In [None]:
# Same figure layout for both variants: from-scratch first, then fine-tuned.
for results_table, figure_title, output_path in (
    (df_scratch, "Model performance on each test set without pre-training", "results/plots/ex_scratch.pdf"),
    (df_finetune, "Model performance on each test set with pre-training", "results/plots/ex_finetune.pdf"),
):
    plot_results(results_table, figure_title, output_path)

In [None]:
import matplotlib

fig, ax = plt.subplots(2, 1, figsize=(15, 10), sharex=True, sharey=False)
cmap = matplotlib.colormaps['Set2']
colors = [cmap(i) for i in range(2)]
dfs = []

width = 0.4
for i, res in enumerate(["WER", "CER"]):
    finetune_res = df_finetune.loc[res]
    # Dialect-pooled: the "All" model evaluated on every test set.
    all_res = dict(finetune_res.loc["All"].items())
    test_sets = list(all_res)
    x = np.arange(len(test_sets))
    y1 = [all_res[k] for k in test_sets]
    # Dialect-specific: the model named after the test set, evaluated on that test set.
    # For MSA the value comes from the 100% run of the train-size experiment.
    # Built key by key so y2 stays aligned with y1 wherever "MSA" sits in the column order.
    y2 = [
        df_trainsize.loc["100%", res] if k == "MSA" else finetune_res.loc[k, k]
        for k in test_sets
    ]
    df_diff = pd.DataFrame({"Dialect-pooled": y1, "Dialect-specific": y2}, index=test_sets).round(2)
    mae = np.mean(np.abs(np.array(y1) - np.array(y2)))
    print(f"Mean absolute error between dialect-pooled and dialect-specific models for {res}: {mae:.2f}")

    # Two bars per test set, centred on the tick.
    ax[i].bar(x - width / 2, y1, width, label="Dialect-pooled", color=colors[0])
    ax[i].bar(x + width / 2, y2, width, label="Dialect-specific", color=colors[1])
    ax[i].grid(axis='y')
    ax[i].set_ylim(bottom=0)
    ax[i].set_ylabel(f"{res} (%)")
    dfs.append(df_diff)

# Concatenate the per-metric frames under a "Metric" level and export them to LaTeX.
# The table holds both metrics, so the caption names both rather than the last loop value.
df_diff = pd.concat(dfs, keys=["WER", "CER"], names=["Metric"])
df_diff.to_latex("results/ex_comparison_dialectal.tex", multirow=True, multicolumn=True, multicolumn_format="c", bold_rows=True, caption="Model performance comparison dialect-pooled vs dialect-specific (WER and CER)", label="tab:ex_comparison_dialectal", float_format="%.2f")

# The x axis is shared, so ticks, label and legend are set on the bottom panel only.
ax[1].set_xticks(x)
ax[1].set_xticklabels(test_sets)
ax[1].set_xlabel("Test set")
ax[1].legend(title="Model")
fig.suptitle("Model performance comparison dialect-pooled vs dialect-specific", fontweight='bold')
fig.tight_layout()
fig.savefig("results/plots/ex_comparison_pooled.pdf")
plt.show()