## Imports

In [1]:
# Reload imported modules automatically before each cell runs, so that edits to
# the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

# Work from the repository root: every data path used below is relative to it
# (e.g. `notebooks/data/...`). The directory change is guarded so that re-running
# this cell, or starting the kernel at the root, does not climb one level further up.
# NOTE(review): assumes the repo root is the directory that contains `notebooks/`.
if not os.path.isdir("notebooks"):
    os.chdir("..")
print(os.getcwd())

# Put the repository root itself (not one of its ancestors) at the front of
# `sys.path` so the project packages imported below resolve from it.
repo_root = os.getcwd()
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

/Users/Tony/Other Docs/distilling-and-forgetting-in-large-pre-trained-models


In [3]:
from pathlib import Path

import numpy as np
import pandas as pd
from transformers.models.whisper import WhisperTokenizerFast

import matplotlib.pyplot as plt
import seaborn as sns

from evaluation.eval_dataset_name_to_dataset_group import EVAL_DATASET_NAME_TO_DATASET_GROUP
from evaluation.string_edit_metrics import get_string_edit_metrics_ortho_and_norm
from normalization.whisper_normalization import get_whisper_normalizer
from utils.file_io import load_json
from utils.whisper_hallucinations.dataloader import load_dataset
from utils.whisper_hallucinations.get_features import add_features_to_ds

# Global plotting theme for the notebook: "paper" context with the "ticks" style.
sns.set_theme(context="paper", style="ticks")

## Load tokenizer

In [4]:
# Checkpoint whose tokenizer is used to encode both the reference transcripts
# and the teacher predictions further down.
pretrained_model_name_or_path = "openai/whisper-tiny"

# English transcription setup for the Whisper fast tokenizer.
tokenizer = WhisperTokenizerFast.from_pretrained(
    pretrained_model_name_or_path,
    language="english",
    task="transcribe",
)

## Load dataset

In [5]:
# Name of the dataset split analysed in this notebook; also reused below to
# build the path where the processed dataset is cached.
dataset_name = "ami_validation"

# Project helper that resolves `dataset_name` to a dataset object.
# NOTE(review): judging by the logged output this is the AMI "ihm" config loaded
# from the Hugging Face cache — confirm in `utils.whisper_hallucinations.dataloader`.
ds = load_dataset(dataset_name)



Found cached dataset ami (/Users/Tony/.cache/huggingface/datasets/edinburghcstr___ami/ihm/0.0.0/0d128d0aa8145d0f16f3d5b4da86c5d5759dbe9e8f947fda04b25edb56442bd5)
Loading cached processed dataset at /Users/Tony/.cache/huggingface/datasets/edinburghcstr___ami/ihm/0.0.0/0d128d0aa8145d0f16f3d5b4da86c5d5759dbe9e8f947fda04b25edb56442bd5/cache-76a34bc037fa70e6.arrow
Loading cached processed dataset at /Users/Tony/.cache/huggingface/datasets/edinburghcstr___ami/ihm/0.0.0/0d128d0aa8145d0f16f3d5b4da86c5d5759dbe9e8f947fda04b25edb56442bd5/cache-8c6e325cf1e5403b.arrow


## Load predictions

In [6]:
# Cached predictions (with timestamps) for the dataset selected above. The path is
# derived from `dataset_name` so that the predictions always match the loaded
# dataset, instead of silently pointing at a hardcoded split.
cache_preds_filepath = f"notebooks/data/whisper_preds/with_ts/{dataset_name}.json"
assert Path(cache_preds_filepath).is_file(), "`cache_preds_filepath` is incorrect."

# The JSON file holds one prediction record and one reference string per example.
data = load_json(cache_preds_filepath)
results = data["predictions"]
references = data["references"]
print(f"Loaded cached predictions from `{cache_preds_filepath}`.")

# Keep only the decoded text of each prediction record for scoring.
predictions = [x["text"] for x in results]
assert len(predictions) == len(references), (
    f"Got {len(predictions)} predictions for {len(references)} references."
)
# NOTE: When alpha_ce = 0, we shouldn't lowercase the teacher predictions because the goal of 1-best KD is for
#       the student to learn to predict the raw teacher's predictions (without any normalization).

Loaded cached predictions from `notebooks/data/whisper_preds/with_ts/ami_validation.json`.


## Get string edit metrics

In [7]:
# Score the cached predictions against the references. The helper reports each
# metric twice: on the "ortho" text and after applying `norm_fn`.
english_normalizer = get_whisper_normalizer("english")
get_string_edit_metrics_ortho_and_norm(references, predictions, norm_fn=english_normalizer)

{'WER ortho (%)': 46.33450233273304,
 'Sub ortho (%)': 31.775720619674996,
 'Del ortho (%)': 11.933272250481817,
 'Ins ortho (%)': 2.625509462576222,
 'WER (%)': 18.36682750941195,
 'Sub (%)': 6.105307810594411,
 'Del (%)': 8.533856514131184,
 'Ins (%)': 3.727663184686357}

## Add predictions to dataset

In [8]:
def _tokenize_reference_text(batch):
    """Encode a batch of reference transcripts into token ids stored under `labels`."""
    encoded = tokenizer(batch["text"])
    return {"labels": encoded.input_ids}


# Add a `labels` column holding the tokenized reference transcript of each example.
ds = ds.map(_tokenize_reference_text, batched=True)

Loading cached processed dataset at /Users/Tony/.cache/huggingface/datasets/edinburghcstr___ami/ihm/0.0.0/0d128d0aa8145d0f16f3d5b4da86c5d5759dbe9e8f947fda04b25edb56442bd5/cache-8a9aeff0daa6368c.arrow


In [9]:
# Attach the teacher predictions and the derived per-example features to the dataset.
# The teacher text is deliberately not lowercased (`lowercase_teacher=False`).
ds = add_features_to_ds(
    ds,
    results,
    tokenizer=tokenizer,
    lowercase_teacher=False,
)

# Show the resulting schema to check which columns were added.
ds.features

Map:   0%|          | 0/13098 [00:00<?, ? examples/s]

Map:   0%|          | 0/13098 [00:00<?, ? examples/s]

Map:   0%|          | 0/13098 [00:00<?, ? examples/s]

Map:   0%|          | 0/13098 [00:00<?, ? examples/s]

Map:   0%|          | 0/13098 [00:00<?, ? examples/s]

Map:   0%|          | 0/13098 [00:00<?, ? examples/s]

{'text': Value(dtype='string', id=None),
 'audio': Audio(sampling_rate=16000, mono=True, decode=True, id=None),
 'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'teacher_text': Value(dtype='string', id=None),
 'teacher_labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'audio_length': Value(dtype='float64', id=None),
 'n_tokens_labels': Value(dtype='int64', id=None),
 'n_tokens_teacher': Value(dtype='int64', id=None),
 'diff_n_tokens': Value(dtype='int64', id=None),
 'gzip_ratio': Value(dtype='float64', id=None),
 'teacher_gzip_ratio': Value(dtype='float64', id=None),
 'diff_gzip_ratio': Value(dtype='float64', id=None),
 'n_overlaps': Value(dtype='int64', id=None)}

In [10]:
# Inspect the first example to sanity-check the columns added above.
ds[0]

{'text': "but like mobile phones have screens and they're cheap",
 'audio': {'path': None,
  'array': array([-1.22070312e-04, -9.15527344e-05, -9.15527344e-05, ...,
          1.52587891e-04,  3.05175781e-05,  1.83105469e-04]),
  'sampling_rate': 16000},
 'labels': [50258,
  50363,
  5955,
  411,
  6013,
  10216,
  362,
  11171,
  293,
  220,
  13162,
  434,
  7084,
  50257],
 'teacher_text': " But like mobile phones have screens and they're cheap.",
 'teacher_labels': [50258,
  50363,
  583,
  411,
  6013,
  10216,
  362,
  11171,
  293,
  220,
  13162,
  434,
  7084,
  13,
  50257],
 'audio_length': 2.68,
 'n_tokens_labels': 14,
 'n_tokens_teacher': 15,
 'diff_n_tokens': 1,
 'gzip_ratio': 0.7571428571428571,
 'teacher_gzip_ratio': 0.7432432432432432,
 'diff_gzip_ratio': -0.013899613899613916,
 'n_overlaps': 0}

In [11]:
# Persist the feature-augmented dataset so later analyses can reload it
# instead of recomputing the features.
savepath = f"notebooks/data/whisper_hallucinations_cached_ds/{dataset_name}"

# Only the parent folder is created up front; the dataset folder itself is
# left for `save_to_disk` to write.
save_dir = Path(savepath)
save_dir.parent.mkdir(parents=True, exist_ok=True)

ds.save_to_disk(savepath)

print(f"Cached dataset at `{savepath}`")

Saving the dataset (0/3 shards):   0%|          | 0/13098 [00:00<?, ? examples/s]

Cached dataset at `notebooks/data/whisper_hallucinations_cached_ds/ami_validation`
