## Imports

In [1]:
# Load IPython's autoreload extension and switch it to mode 2, which reloads all
# imported modules before each cell executes, so edits to the project's .py files
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
%cd ..
import os, sys
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd()))))

/Users/Tony/Other Docs/distilling-and-forgetting-in-large-pre-trained-models


In [3]:
from functools import partial
from tqdm.auto import tqdm

import torch
from transformers import pipeline
from transformers.models.whisper import (WhisperTokenizer,
                                         WhisperTokenizerFast,
                                         WhisperFeatureExtractor,
                                         WhisperForConditionalGeneration)
from datasets import load_dataset
import evaluate

from dataloader.collator import DataCollatorSpeechSeq2SeqWithPadding
from dataloader.preprocessing_train.preprocessing import prepare_dataset_fct
from evaluation.eval_dataset_name_to_dataset_group import EVAL_DATASET_NAME_TO_DATASET_GROUP

# Select the compute device, preferring CUDA, then Apple's MPS backend, then CPU.
# `device` is always a `torch.device`, so its type does not depend on which
# branch was taken.
# NOTE(review): none of the cells visible in this notebook move the model or the
# inputs to `device` — confirm whether that is intended.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():  # for Apple Silicon
    # The getattr guard keeps this working on torch builds that lack the `mps` backend module.
    device = torch.device("mps")
else:
    device = torch.device("cpu")

## Load model

In [4]:
# Checkpoint identifier shared by the tokenizer, feature extractor and model below.
pretrained_model_name_or_path = "openai/whisper-tiny"

tokenizer = WhisperTokenizerFast.from_pretrained(
    pretrained_model_name_or_path,
    language="english",
    task="transcribe",
)
feature_extractor = WhisperFeatureExtractor.from_pretrained(pretrained_model_name_or_path)
model = WhisperForConditionalGeneration.from_pretrained(pretrained_model_name_or_path)

# Pre-bind the generation arguments every cell below shares, so each call only
# has to spell out the decoding strategy it is demonstrating. Keyword arguments
# passed at call time still override these (standard `functools.partial` behavior).
_generate_defaults = dict(language="english", task="transcribe", max_length=255, use_cache=True)
model.generate = partial(model.generate, **_generate_defaults)

## Load dataset

In [5]:
# Evaluation set to use. Set this to "ami" to run on that dataset instead.
dataset_name = "librispeech_dummy"

# Look up the entry for this name, call it, and pick the dataset of the same
# name out of what it returns.
dataset_group = EVAL_DATASET_NAME_TO_DATASET_GROUP[dataset_name]()
ds = dataset_group[dataset_name]

# For "ami", keep only the first 32 examples.
if dataset_name == "ami":
    first_indices = list(range(32))
    ds = ds.select(first_indices)



Found cached dataset librispeech_asr_dummy (/Users/Tony/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_dummy/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b)


In [6]:
# Bind the tokenizer and feature extractor up front so `map` only has to supply
# the dataset rows.
prepare_dataset = partial(
    prepare_dataset_fct,
    tokenizer=tokenizer,
    feature_extractor=feature_extractor,
)

# Run the preprocessing with 4 worker processes, then switch the dataset's
# output format to PyTorch ("pt").
ds_processed = ds.map(prepare_dataset, num_proc=4)
ds = ds_processed.with_format("pt")

Loading cached processed dataset at /Users/Tony/.cache/huggingface/datasets/hf-internal-testing___librispeech_asr_dummy/clean/2.1.0/d3bc4c2bc2078fcde3ad0f0f635862e4c0fef78ba94c4a34c4c250a097af240b/cache-59ee67852172721a_*_of_00004.arrow


## Predict

In [7]:
# Take the first example of the preprocessed dataset; the generation cells
# below all read `x["input_features"]` from it.
x = ds[0]

# Display the example's `text` field.
x["text"]

'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'

In [8]:
# Beam search with 3 beams, returning all 3 resulting hypotheses.
# Indexing with `None` adds a leading axis of size 1 in front of the example's features.
batch = x["input_features"][None, ...]
outputs = model.generate(batch, num_beams=3, num_return_sequences=3)

# Decode the generated token ids to text, dropping special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.',
 ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.',
 ' Mr. Quilter is the apostle of the Middle Classes and we are glad to welcome his gospel.']

In [12]:
# Seed torch's global RNG so the sampled transcriptions are the same on every
# re-run of this cell (sampling was previously unseeded and not reproducible).
torch.manual_seed(0)

# Draw 3 sampled transcriptions with no explicit truncation arguments.
# NOTE(review): transformers' default generation settings may still apply a
# top-k cutoff when `top_k` is not passed — confirm if unrestricted sampling is intended.
outputs = model.generate(x["input_features"][None, ...], do_sample=True, num_return_sequences=3)

# Decode the generated token ids to text, dropping special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

[' Mr. Krilder is the Apostle of the Middle Classes, and we are glad to welcome his gospel.',
 ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.',
 ' Mister Cilter is the apostle of the middle classes and we are glad to welcome his gospel.']

In [10]:
# Seed torch's global RNG so the sampled transcriptions are the same on every
# re-run of this cell (sampling was previously unseeded and not reproducible).
torch.manual_seed(0)

# Top-k sampling: draw 3 transcriptions, sampling each token from the 40 most
# probable candidates.
outputs = model.generate(x["input_features"][None, ...], do_sample=True, top_k=40, num_return_sequences=3)

# Decode the generated token ids to text, dropping special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

[' Mr. Kylder is the apostle of the Middle Classes, and we are glad to welcome his gospel.',
 ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.',
 ' Mr. Quilter is the apostle of the middle classes and we are gladly to welcome his gospel.']

In [11]:
# Seed torch's global RNG so the sampled transcriptions are the same on every
# re-run of this cell (sampling was previously unseeded and not reproducible).
torch.manual_seed(0)

# Top-k sampling (k=40) with temperature 0.7; a temperature below 1 sharpens the
# token distribution before sampling. Draws 3 transcriptions.
outputs = model.generate(x["input_features"][None, ...], do_sample=True, top_k=40, temperature=0.7, num_return_sequences=3)

# Decode the generated token ids to text, dropping special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

[' Mr. Kfilter is the apostle of the Middle Classes and we are glad to welcome his gospel.',
 ' Mr. Quilder is the apostle of the middle classes and we are glad to welcome his gospel.',
 ' Mr. Kilder, is the apostle of the middle classes and we are glad to welcome his gospel.']

In [9]:
# Seed torch's global RNG so the sampled transcriptions are the same on every
# re-run of this cell (sampling was previously unseeded and not reproducible).
torch.manual_seed(0)

# Nucleus (top-p) sampling with p=0.92; draws 3 transcriptions.
# NOTE(review): transformers' default generation settings may also apply a
# top-k cutoff when `top_k` is not passed — confirm if pure top-p is intended.
outputs = model.generate(x["input_features"][None, ...], do_sample=True, top_p=0.92, num_return_sequences=3)

# Decode the generated token ids to text, dropping special tokens.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

[' Mr. Kwillter is the apostle of the middle classes and we are glad to welcome his Gospel.',
 ' Mr. Quilter is the apostle of the Middle classes and we are glad to welcome his gospel.',
 ' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']