## Imports

In [None]:
# Enable IPython's autoreload extension in mode 2, so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
# Move the kernel's working directory up one level.
# NOTE(review): `%cd ..` is relative, so re-running this cell climbs one more
# directory each time; run it once per kernel session.
%cd ..
import os, sys
# Prepend the directory two levels above the (new) working directory to the import path.
# NOTE(review): presumably meant to make the project packages importable; confirm
# that two levels up (rather than the working directory itself) is intended.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd()))))

In [None]:
from pathlib import Path
from collections import defaultdict

import numpy as np
import pandas as pd

from transformers.models.whisper import WhisperTokenizerFast
from datasets import load_from_disk

import matplotlib.pyplot as plt
import seaborn as sns

from evaluation.eval_dataset_name_to_dataset_group import EVAL_DATASET_NAME_TO_DATASET_GROUP
from evaluation.string_edit_metrics import get_string_edit_metrics_ortho_and_norm
from normalization.whisper_normalization import get_whisper_normalizer
from utils.whisper_hallucinations.get_features import add_features_to_ds, compute_gzip_compression_ratio
from utils.whisper_hallucinations.eval_filter_criterion import eval_filter_criterion
from utils.notebook_utils import listen_to_audio

# Global seaborn styling applied to every figure in this notebook.
sns.set_theme(context="paper", style="ticks")

# Root directory for the saved figures, relative to the current working
# directory; created up front so later cells can write into it.
OUTPUT_DIR = Path("notebooks/outputs/8_1_best_kd/ami_100h")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

## User input

In [None]:
# When True, the analysis DataFrame is read from `pickle_filepath`; otherwise it
# is rebuilt from the dataset stored at `ds_dirpath` and then cached to
# `pickle_filepath`.
load_from_pickle = True

pickle_filepath = "notebooks/data/ami_100h_medium_cached_33p.pkl"
# NOTE(review): absolute, user-specific path; it must be adapted before
# rebuilding the cache on another machine.
ds_dirpath = "/home/tw581/rds/hpc-work/ami_100h_medium_cached_33p"

## Load tokenizer

In [None]:
# Tokenizer used in the data-loading cell to decode the teacher token sequences
# back into text.
pretrained_model_name_or_path = "openai/whisper-tiny"
tokenizer = WhisperTokenizerFast.from_pretrained(pretrained_model_name_or_path, language="english", task="transcribe")

## Load data

In [None]:
# Dataset columns kept in the analysis DataFrame; every other column is dropped.
LIST_FEATURES = [
    'text',
    'teacher_text',
    'n_instant_tokens',
    'max_subarray_length',
    'audio_length',
    'n_tokens_labels',
    'n_tokens_teacher',
    'diff_n_tokens',
    'gzip_ratio',
    'teacher_gzip_ratio',
    'diff_gzip_ratio'
]

if load_from_pickle:
    # NOTE: unpickling can execute arbitrary code. Only load cache files that
    # were produced by this notebook (or come from another trusted source).
    df = pd.read_pickle(pickle_filepath)
else:
    ds = load_from_disk(ds_dirpath)
    # Decode the teacher token ids into text, dropping the special tokens.
    ds = ds.map(lambda x: {"teacher_text": tokenizer.decode(x["teacher_sequences"], skip_special_tokens=True)})
    # Presumably adds the token-count, gzip and timestamp feature columns listed
    # in LIST_FEATURES; verify against `add_features_to_ds`.
    ds = add_features_to_ds(ds)
    df = pd.DataFrame({col: ds[col] for col in ds.features.keys() if col in LIST_FEATURES})
    # Make sure the cache directory exists, otherwise `to_pickle` fails on a
    # fresh checkout and the work done above is lost.
    Path(pickle_filepath).parent.mkdir(parents=True, exist_ok=True)
    df.to_pickle(pickle_filepath)

In [None]:
# Preview the first rows of the analysis DataFrame.
df.head()

In [None]:
# Show the ten rows with the largest `n_tokens_teacher` values.
df.sort_values("n_tokens_teacher", ascending=False)[:10]

## First analysis

In [None]:
# Compare the teacher transcriptions with the references, using the English
# Whisper normalizer as the normalization function.
whisper_norm_fn = get_whisper_normalizer("english")
dict_string_edit_metrics = get_string_edit_metrics_ortho_and_norm(
    references=df["text"],
    predictions=df["teacher_text"],
    norm_fn=whisper_norm_fn,
)

dict_string_edit_metrics

**Observation:** Applying the Whisper normalizer drastically reduces all of the string edit error metrics. Since 1-best operates without any normalization, we will focus on the orthographic WER in this study.

In [None]:
# Distribution of the audio lengths.
fig, ax = plt.subplots(figsize=(5, 3))
df["audio_length"].plot.hist(ax=ax);

In [None]:
# Side-by-side horizontal box plots of the label and teacher token counts.
token_count_cols = ["n_tokens_labels", "n_tokens_teacher"]
fig, ax = plt.subplots(figsize=(12, 3))
sns.boxplot(data=df[token_count_cols], orient="h", ax=ax);

In [None]:
# Label token counts (x) against teacher token counts (y).
x_col, y_col = "n_tokens_labels", "n_tokens_teacher"

grid = sns.jointplot(data=df, x=x_col, y=y_col, alpha=0.3)
joint_ax = grid.ax_joint

# Identity line, drawn from the origin up to the smaller of the two column maxima.
line_max_coord = min(df[x_col].max(), df[y_col].max())
diagonal = [0, line_max_coord]
joint_ax.plot(diagonal, diagonal, 'b--', label=r"$y=x$")
joint_ax.legend()
joint_ax.figure.tight_layout()

# Save figure
savepath = OUTPUT_DIR / "analysis" / "n_tokens_teacher_wrt_n_tokens_label.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
joint_ax.figure.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

In [None]:
# Horizontal box plot of `diff_n_tokens` over its full range.
fig, ax = plt.subplots(figsize=(12, 1.5))
sns.boxplot(data=df[["diff_n_tokens"]], orient="h", ax=ax)

# Save figure
savepath = OUTPUT_DIR / "analysis" / "boxplot_n_diff_tokens.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

In [None]:
# Same box plot, zoomed on the [-50, 50] range.
fig, ax = plt.subplots(figsize=(12, 1.5))
sns.boxplot(data=df[["diff_n_tokens"]], orient="h", ax=ax)
ax.set_xlim(-50, 50);

In [None]:
# Scatter of `diff_n_tokens` against the audio length (the trailing `;` hides the Axes repr).
sns.scatterplot(data=df, x="audio_length", y="diff_n_tokens", alpha=0.3);

In [None]:
# Two panels sharing the y axis: label token counts on the left, teacher token
# counts on the right, both against the audio length.
fig, (labels_ax, preds_ax) = plt.subplots(1, 2, figsize=(8, 3), sharey=True)
sns.scatterplot(data=df, x="audio_length", y="n_tokens_labels", label="Labels", alpha=0.3, ax=labels_ax)
sns.scatterplot(data=df, x="audio_length", y="n_tokens_teacher", label="Predictions", c="coral", alpha=0.3, ax=preds_ax)
labels_ax.set_ylabel("n_tokens")
fig.tight_layout()

# Save figure
savepath = OUTPUT_DIR / "analysis" / "n_tokens_wrt_audio_length.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

## Criteria for filtering

### Exceeding tokens

In [None]:
# Filtering threshold: mean plus one standard deviation of `diff_n_tokens`.
THRESH_DELTA_N_TOKENS = df["diff_n_tokens"].mean() + 1.0 * df["diff_n_tokens"].std()
THRESH_DELTA_N_TOKENS

In [None]:
# Columns of the pair plot: label token counts (x) vs. teacher token counts (y).
x_col, y_col = "n_tokens_labels", "n_tokens_teacher"

# Vertical offset of the threshold line above the diagonal.
delta = THRESH_DELTA_N_TOKENS

# Data bounds reused by the lines, the shaded area and the axis limits.
x_min, x_max = df[x_col].min(), df[x_col].max()
y_max = df[y_col].max()

grid = sns.jointplot(data=df, x=x_col, y=y_col, alpha=0.3)
joint_ax = grid.ax_joint

# Threshold line y = x + delta
joint_ax.plot([x_min, x_max], [x_min + delta, x_max + delta], 'r--', label=r"$y = x + \delta$")

# Shade the region between the threshold line and the largest y value
joint_ax.fill_between([0, x_max], [0 + delta, x_max + delta], y_max, alpha=0.2, color="red")

# Identity line y = x
joint_ax.plot([x_min, x_max], [x_min, x_max], 'k--', label=r"$y = x$")

# Legend and axis limits covering the full data range
joint_ax.legend()
joint_ax.set_xlim(0, x_max)
joint_ax.set_ylim(0, y_max)

joint_ax.figure.tight_layout()

# Save figure
savepath = OUTPUT_DIR / "exceeding_tokens_filtering" / "exceeding_tokens_filter_pairplot.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
joint_ax.figure.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

In [None]:
# Same pair plot as the previous cell, zoomed on a square window (not saved).
x_col, y_col = "n_tokens_labels", "n_tokens_teacher"

# Vertical offset of the threshold line above the diagonal.
delta = THRESH_DELTA_N_TOKENS

# Data bounds reused by the lines and the shaded area.
x_min, x_max = df[x_col].min(), df[x_col].max()
y_max = df[y_col].max()

grid = sns.jointplot(data=df, x=x_col, y=y_col, alpha=0.3)
joint_ax = grid.ax_joint

# Threshold line y = x + delta
joint_ax.plot([x_min, x_max], [x_min + delta, x_max + delta], 'r--', label=r"$y = x + \delta$")

# Shade the region between the threshold line and the largest y value
joint_ax.fill_between([0, x_max], [0 + delta, x_max + delta], y_max, alpha=0.2, color="red")

# Identity line y = x
joint_ax.plot([x_min, x_max], [x_min, x_max], 'k--', label=r"$y = x$")

# Legend, then clip both axes to the smaller of the two column maxima
joint_ax.legend()
line_max_coord = min(x_max, y_max)
joint_ax.set_xlim(0, line_max_coord)
joint_ax.set_ylim(0, line_max_coord)

joint_ax.figure.tight_layout()
plt.show()

In [None]:
fig, ax = plt.subplots(figsize=(6, 6))

# Histogram of the "diff_n_tokens" column
sns.histplot(data=df, x="diff_n_tokens", bins=100, ax=ax)

# Red band from the threshold up to the largest value: the filtered-out samples
ax.axvspan(THRESH_DELTA_N_TOKENS, df["diff_n_tokens"].max(), color='red', alpha=0.2, label="Filtered out")

ax.legend()
fig.tight_layout()

# Save figure
savepath = OUTPUT_DIR / "exceeding_tokens_filtering" / "exceeding_tokens_filter_hist.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

In [None]:
# Flag the rows whose `diff_n_tokens` is strictly above the threshold.
# Note: this adds a column to `df` in place; later cells rely on it.
df["high_diff_n_tokens"] = (df["diff_n_tokens"] > THRESH_DELTA_N_TOKENS)

# Number of flagged vs. unflagged rows.
df["high_diff_n_tokens"].value_counts()

In [None]:
# Rows flagged by the exceeding-tokens criterion.
df_candidates = df[df["high_diff_n_tokens"]]

# Print each candidate's index label, reference and teacher prediction.
for row_label, row in df_candidates.iterrows():
    print(f"Idx = {row_label}")
    print("Reference: ", row["text"])
    print("Prediction: ", row["teacher_text"])
    print()

In [None]:
# `ds` only exists when the data was rebuilt from disk in the data-loading
# cell, so this cell does nothing when the DataFrame came from the pickle cache.
if not load_from_pickle:
    # First five candidates, by index label.
    list_idx_to_listen = df_candidates.index[:5]
    listen_to_audio(ds, list_idx_to_listen, pred_col="teacher_text")

In [None]:
# The mask selects the rows at or below the threshold.
# NOTE(review): presumably `df_filter` marks the rows that are kept; verify in `eval_filter_criterion`.
eval_filter_criterion(df, df_filter=(df["diff_n_tokens"] <= THRESH_DELTA_N_TOKENS))

**Observation:** Good criterion. Note that it also addresses the tendency of the vanilla Whisper models to transcribe even background speech. By removing such examples from the training set, we hope to teach Whisper to focus only on the main speaker.

### gzip compression ratio

#### Analysis

In [None]:
# Sanity check of the intuition on a hand-written pair: a short reference and a
# prediction that keeps repeating the same phrase.
ref = "yeah so uh what we'll do is uh"
pred = "So, what we will do is, we will do is, we will do is, we will do is, we will do is,"

print("gzip ratios:")
for prefix, sentence in (("- reference: ", ref), ("- prediction: ", pred)):
    print(prefix, compute_gzip_compression_ratio(sentence))

In [None]:
# gzip ratio of the reference (x) against the gzip ratio of the teacher text (y).
x_col, y_col = "gzip_ratio", "teacher_gzip_ratio"

grid = sns.jointplot(data=df, x=x_col, y=y_col, alpha=0.3)
joint_ax = grid.ax_joint

# Identity line, drawn from the origin up to the smaller of the two column maxima.
line_max_coord = min(df[x_col].max(), df[y_col].max())
diagonal = [0, line_max_coord]
joint_ax.plot(diagonal, diagonal, 'b--', label=r"$y=x$")
joint_ax.legend()
joint_ax.figure.tight_layout()

# Save figure
savepath = OUTPUT_DIR / "gzip_ratio_filtering" / "pairplot_gzip.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
joint_ax.figure.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

In [None]:
# Same gzip pair plot, zoomed in (not saved).
x_col, y_col = "gzip_ratio", "teacher_gzip_ratio"

grid = sns.jointplot(data=df, x=x_col, y=y_col, alpha=0.3)
joint_ax = grid.ax_joint

# Identity line, drawn from the origin up to the smaller of the two column maxima.
line_max_coord = min(df[x_col].max(), df[y_col].max())
diagonal = [0, line_max_coord]
joint_ax.plot(diagonal, diagonal, 'b--', label=r"$y=x$")
joint_ax.legend()

# Zoom window
joint_ax.set_xlim(0, line_max_coord)
joint_ax.set_ylim(0, 2.5)
joint_ax.figure.tight_layout()

In [None]:
# Horizontal box plot of `diff_gzip_ratio`.
fig, ax = plt.subplots(figsize=(12, 1.5))
sns.boxplot(data=df[["diff_gzip_ratio"]], orient="h", ax=ax)

# Save figure
savepath = OUTPUT_DIR / "gzip_ratio_filtering" / "boxplot_gzip_ratio.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

#### Using only the teacher gzip ratio

In [None]:
# Filtering threshold: mean plus two standard deviations of `teacher_gzip_ratio`.
THRESH_TEACHER_GZIP = df["teacher_gzip_ratio"].mean() + 2 * df["teacher_gzip_ratio"].std()

THRESH_TEACHER_GZIP

In [None]:
fig, ax = plt.subplots(figsize=(6, 6))

# Histogram of the "teacher_gzip_ratio" column
sns.histplot(data=df, x="teacher_gzip_ratio", ax=ax)

# Red band from the threshold up to the largest value: the filtered-out samples
ax.axvspan(THRESH_TEACHER_GZIP, df["teacher_gzip_ratio"].max(), color='red', alpha=0.2, label="Filtered out")

ax.legend()
ax.set_xlim(-1, 5)

fig.tight_layout()

# Save figure
savepath = OUTPUT_DIR / "gzip_ratio_filtering" / "teacher_gzip_ratio" / "hist_teacher_gzip_filter.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

In [None]:
# Rows whose teacher gzip ratio is strictly above the threshold.
df_candidates = df[df["teacher_gzip_ratio"] > THRESH_TEACHER_GZIP]

# Print each candidate's index label, reference and teacher prediction.
for row_label, row in df_candidates.iterrows():
    print(f"Idx = {row_label}")
    print("Reference: ", row["text"])
    print("Prediction: ", row["teacher_text"])
    print()

In [None]:
# The mask selects the rows whose teacher gzip ratio is at or below the threshold.
# NOTE(review): presumably `df_filter` marks the rows that are kept; verify in `eval_filter_criterion`.
eval_filter_criterion(df, df_filter=(df["teacher_gzip_ratio"] <= THRESH_TEACHER_GZIP))

#### Using the absolute difference in GZIP ratio with the ground-truth

In [None]:
# Filtering threshold: mean plus one standard deviation of `diff_gzip_ratio`.
# Later cells apply it symmetrically, on both sides of zero.
THRESH_DIFF_GZIP = df["diff_gzip_ratio"].mean() + 1.0 * df["diff_gzip_ratio"].std()

THRESH_DIFF_GZIP

In [None]:
# Columns of the pair plot: reference gzip ratio (x) vs. teacher gzip ratio (y).
x_col, y_col = "gzip_ratio", "teacher_gzip_ratio"

# Half-width of the accepted band around the diagonal.
delta = THRESH_DIFF_GZIP

# Data bounds reused by the lines, the shaded areas and the axis limits.
x_min, x_max = df[x_col].min(), df[x_col].max()
y_min, y_max = df[y_col].min(), df[y_col].max()

grid = sns.jointplot(data=df, x=x_col, y=y_col, alpha=0.3)
joint_ax = grid.ax_joint

# Upper threshold line y = x + delta
joint_ax.plot([x_min, x_max], [x_min + delta, x_max + delta], 'r--', label=r"$y = x + \delta$")

# Shade the region between the upper line and the largest y value
joint_ax.fill_between([0, x_max], [0 + delta, x_max + delta], y_max, alpha=0.2, color="red")

# Identity line y = x
joint_ax.plot([x_min, x_max], [x_min, x_max], 'k--', label=r"$y = x$")

# Lower threshold line y = x - delta
joint_ax.plot([x_min, x_max], [x_min - delta, x_max - delta], 'r--', label=r"$y = x - \delta$")

# Shade the region between the lower line and the smallest y value
joint_ax.fill_between([0, x_max], [0 - delta, x_max - delta], y_min, alpha=0.2, color="red")

# Legend and axis limits covering the full data range
joint_ax.legend()
joint_ax.set_xlim(0, x_max)
joint_ax.set_ylim(0, y_max)

joint_ax.figure.tight_layout()

# Save figure
# NOTE(review): this path has an extra "analysis" level, whereas the earlier
# gzip figures of this notebook are saved directly under
# OUTPUT_DIR / "gzip_ratio_filtering"; confirm that this is intended.
savepath = OUTPUT_DIR / "analysis" / "gzip_ratio_filtering" / "diff_gzip_ratio" / "pairplot_gzip_filter.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
joint_ax.figure.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

In [None]:
# Same gzip filter pair plot as the previous cell, zoomed on a square window (not saved).
x_col, y_col = "gzip_ratio", "teacher_gzip_ratio"

# Half-width of the accepted band around the diagonal.
delta = THRESH_DIFF_GZIP

# Data bounds reused by the lines and the shaded areas.
x_min, x_max = df[x_col].min(), df[x_col].max()
y_min, y_max = df[y_col].min(), df[y_col].max()

grid = sns.jointplot(data=df, x=x_col, y=y_col, alpha=0.3)
joint_ax = grid.ax_joint

# Upper threshold line y = x + delta
joint_ax.plot([x_min, x_max], [x_min + delta, x_max + delta], 'r--', label=r"$y = x + \delta$")

# Shade the region between the upper line and the largest y value
joint_ax.fill_between([0, x_max], [0 + delta, x_max + delta], y_max, alpha=0.2, color="red")

# Identity line y = x
joint_ax.plot([x_min, x_max], [x_min, x_max], 'k--', label=r"$y = x$")

# Lower threshold line y = x - delta
joint_ax.plot([x_min, x_max], [x_min - delta, x_max - delta], 'r--', label=r"$y = x - \delta$")

# Shade the region between the lower line and the smallest y value
joint_ax.fill_between([0, x_max], [0 - delta, x_max - delta], y_min, alpha=0.2, color="red")

# Legend, then clip both axes to the smaller of the two column maxima
joint_ax.legend()
line_max_coord = min(x_max, y_max)
joint_ax.set_xlim(0, line_max_coord)
joint_ax.set_ylim(0, line_max_coord)

joint_ax.figure.tight_layout()

In [None]:
fig, ax = plt.subplots(figsize=(6, 6))

# Histogram of the "diff_gzip_ratio" column
sns.histplot(data=df, x="diff_gzip_ratio", ax=ax)

# Red bands on both tails (beyond +/- the threshold): the filtered-out samples
diff_gzip = df["diff_gzip_ratio"]
ax.axvspan(THRESH_DIFF_GZIP, diff_gzip.max(), color='red', alpha=0.2, label="Filtered out")
ax.axvspan(-THRESH_DIFF_GZIP, diff_gzip.min(), color='red', alpha=0.2)

ax.legend()
ax.set_xlim(-1, 1)

fig.tight_layout()

# Save figure
# NOTE(review): this path has an extra "analysis" level, whereas the earlier
# gzip figures of this notebook are saved directly under
# OUTPUT_DIR / "gzip_ratio_filtering"; confirm that this is intended.
savepath = OUTPUT_DIR / "analysis" / "gzip_ratio_filtering" / "diff_gzip_ratio" / "hist_gzip_filter.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

In [None]:
# Rows whose gzip ratio difference is strictly above the positive threshold.
df_candidates = df[df["diff_gzip_ratio"] > THRESH_DIFF_GZIP]

# Print each candidate's index label, reference and teacher prediction.
for row_label, row in df_candidates.iterrows():
    print(f"Idx = {row_label}")
    print("Reference: ", row["text"])
    print("Prediction: ", row["teacher_text"])
    print()

In [None]:
# Rows whose gzip ratio difference is strictly below the negative threshold.
df_candidates = df[df["diff_gzip_ratio"] < - THRESH_DIFF_GZIP]

# Print each candidate's index label, reference and teacher prediction.
for row_label, row in df_candidates.iterrows():
    print(f"Idx = {row_label}")
    print("Reference: ", row["text"])
    print("Prediction: ", row["teacher_text"])
    print()

In [None]:
# The mask selects the rows whose absolute gzip ratio difference is at or below the threshold.
# NOTE(review): presumably `df_filter` marks the rows that are kept; verify in `eval_filter_criterion`.
eval_filter_criterion(df, df_filter=(df["diff_gzip_ratio"].abs() <= THRESH_DIFF_GZIP))

### Timestamp overlaps

In [None]:
# Normalize both timestamp-based features by the number of teacher tokens.
# Note: this adds two columns to `df` in place.
n_teacher_tokens = df["n_tokens_teacher"]
df["n_instant_tokens_norm"] = df["n_instant_tokens"].div(n_teacher_tokens)
df["max_subarray_length_norm"] = df["max_subarray_length"].div(n_teacher_tokens)

#### Using the normalized number of instant tokens

In [None]:
# Distribution of the normalized number of instant tokens.
fig, ax = plt.subplots(figsize=(7, 3))
df["n_instant_tokens_norm"].plot.hist(bins=50, ax=ax);

In [None]:
# Normalized instant-token count (y) against the token count difference (x).
x_col, y_col = "diff_n_tokens", "n_instant_tokens_norm"

grid = sns.jointplot(data=df, x=x_col, y=y_col, alpha=0.3, height=6)
joint_fig = grid.ax_joint.figure
joint_fig.tight_layout()

# Save figure
savepath = OUTPUT_DIR / "timestamp_based" / "instant_tokens" / "n_instant_tokens_norm_wrt_diff_n_tokens.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
joint_fig.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

In [None]:
# Distribution of the normalized instant-token count, split by the
# `high_diff_n_tokens` flag: unflagged rows on top, flagged rows below.
x_col = "n_instant_tokens_norm"
hue = "high_diff_n_tokens"

is_high_diff = df[hue]

fig, (low_ax, high_ax) = plt.subplots(2, 1, sharex=True, figsize=(8, 6))
sns.histplot(data=df[~is_high_diff], x=x_col, label="low_diff_n_tokens", ax=low_ax)
low_ax.legend()
sns.histplot(data=df[is_high_diff], x=x_col, label="high_diff_n_tokens", color="coral", ax=high_ax)
high_ax.legend()
fig.tight_layout()

In [None]:
# Rows whose normalized instant-token count is strictly above 0.95.
df_candidates = df[df["n_instant_tokens_norm"] > 0.95]

# Print each candidate's index label, reference and teacher prediction.
for row_label, row in df_candidates.iterrows():
    print(f"Idx = {row_label}")
    print("Reference: ", row["text"])
    print("Prediction: ", row["teacher_text"])
    print()

#### Using the normalized maximum subarray length

In [None]:
# Distribution of the normalized maximum subarray length.
fig, ax = plt.subplots(figsize=(7, 3))
df["max_subarray_length_norm"].plot.hist(bins=50, ax=ax);

In [None]:
# Normalized maximum subarray length (y) against the token count difference (x).
x_col, y_col = "diff_n_tokens", "max_subarray_length_norm"

grid = sns.jointplot(data=df, x=x_col, y=y_col, alpha=0.3, height=6)
joint_fig = grid.ax_joint.figure
joint_fig.tight_layout()

# Save figure
savepath = OUTPUT_DIR / "timestamp_based" / "max_subarray_length" / "max_subarray_length_norm_wrt_diff_n_tokens.png"
savepath.parent.mkdir(parents=True, exist_ok=True)
joint_fig.savefig(savepath)
print(f"Figure saved at `{savepath}`.")

**Observation:** Not promising, because `max_subarray_length_norm` does not even let us discriminate the obvious hallucinations (the points with a high value of `diff_n_tokens`).