In [None]:
! pip install git+https://github.com/openai/whisper.git

In [None]:
import io
import os
import numpy as np

try:
    import tensorflow  # required in Colab to avoid protobuf compatibility issues
except ImportError:
    pass

import torch
import pandas as pd
import urllib
import tarfile
import whisper
import torchaudio

from scipy.io import wavfile
from tqdm.notebook import tqdm
import ipywidgets as widgets

from transformers import SeamlessM4TModel


In [None]:

# TODO exchange for the proper data set setup

# Maps a locale-style language code to the English language name.
# The names are used as Whisper's `language` option further down.
languages = {
    "af_za": "Afrikaans",
    "am_et": "Amharic",
    "ar_eg": "Arabic",
    "as_in": "Assamese",
    "az_az": "Azerbaijani",
    "be_by": "Belarusian",
    "bg_bg": "Bulgarian",
    "bn_in": "Bengali",
    "bs_ba": "Bosnian",
    "ca_es": "Catalan",
    "cmn_hans_cn": "Chinese",
    "cs_cz": "Czech",
    "cy_gb": "Welsh",
    "da_dk": "Danish",
    "de_de": "German",
    "el_gr": "Greek",
    "en_us": "English",
    "es_419": "Spanish",
    "et_ee": "Estonian",
    "fa_ir": "Persian",
    "fi_fi": "Finnish",
    "fil_ph": "Tagalog",
    "fr_fr": "French",
    "gl_es": "Galician",
    "gu_in": "Gujarati",
    "ha_ng": "Hausa",
    "he_il": "Hebrew",
    "hi_in": "Hindi",
    "hr_hr": "Croatian",
    "hu_hu": "Hungarian",
    "hy_am": "Armenian",
    "id_id": "Indonesian",
    "is_is": "Icelandic",
    "it_it": "Italian",
    "ja_jp": "Japanese",
    "jv_id": "Javanese",
    "ka_ge": "Georgian",
    "kk_kz": "Kazakh",
    "km_kh": "Khmer",
    "kn_in": "Kannada",
    "ko_kr": "Korean",
    "lb_lu": "Luxembourgish",
    "ln_cd": "Lingala",
    "lo_la": "Lao",
    "lt_lt": "Lithuanian",
    "lv_lv": "Latvian",
    "mi_nz": "Maori",
    "mk_mk": "Macedonian",
    "ml_in": "Malayalam",
    "mn_mn": "Mongolian",
    "mr_in": "Marathi",
    "ms_my": "Malay",
    "mt_mt": "Maltese",
    "my_mm": "Myanmar",
    "nb_no": "Norwegian",
    "ne_np": "Nepali",
    "nl_nl": "Dutch",
    "oc_fr": "Occitan",
    "pa_in": "Punjabi",
    "pl_pl": "Polish",
    "ps_af": "Pashto",
    "pt_br": "Portuguese",
    "ro_ro": "Romanian",
    "ru_ru": "Russian",
    "sd_in": "Sindhi",
    "sk_sk": "Slovak",
    "sl_si": "Slovenian",
    "sn_zw": "Shona",
    "so_so": "Somali",
    "sr_rs": "Serbian",
    "sv_se": "Swedish",
    "sw_ke": "Swahili",
    "ta_in": "Tamil",
    "te_in": "Telugu",
    "tg_tj": "Tajik",
    "th_th": "Thai",
    "tr_tr": "Turkish",
    "uk_ua": "Ukrainian",
    "ur_pk": "Urdu",
    "uz_uz": "Uzbek",
    "vi_vn": "Vietnamese",
    "yo_ng": "Yoruba",
}
# Dropdown for choosing the language; every real entry is labelled "Name (code)" and
# carries the language code as its value. The two leading entries are placeholders.
language_choices = sorted((f"{name} ({code})", code) for code, name in languages.items())
selection = widgets.Dropdown(
    options=[("Select language", None), ("----------", None)] + language_choices,
    value="ko_kr",
    description='Language:',
    disabled=False,
)

# Only the last expression of a cell is rendered automatically, so show the widget explicitly.
display(selection)

# NOTE(review): the value is read right after the widget is created, i.e. it is always the
# default above. Move the lines below into their own cell if the selection should be honoured.
lang = selection.value

# Validate before the lookup: indexing `languages` with None would raise a bare KeyError
# and the helpful message below would never be shown.
assert lang is not None, "Please select a language"
language = languages[lang]



Initialise the models used:

In [None]:
# Whisper
asr_model = whisper.load_model("medium")
# Decoding options shared by both Whisper tasks; `language` is set in the language-selection cell.
options = dict(language=language, beam_size=5, best_of=5)
transcribe_options = dict(task="transcribe", **options)
translate_options = dict(task="translate", **options)

# inhouse
#inhouse_model #= torch.hub.load()

# DeltaLM
delta_path= "https://deltalm.blob.core.windows.net/deltalm/deltalm-base.pt"
# torch.load() needs a local path or a file-like object and cannot open a URL, so the
# checkpoint is downloaded (and cached) through torch.hub first and deserialised on the CPU.
# SECURITY: this unpickles remote data - only use it with a source you trust.
# NOTE(review): the result is whatever object was saved in the file (presumably a checkpoint
# dict rather than a ready-to-use model) - confirm before calling it like a module.
delta_model = torch.hub.load_state_dict_from_url(delta_path, map_location="cpu")
#delta_model = torch.hub.load("pytorch/fairseq", "transformer_lm.wmt19.en", tokenizer="moses", bpe="fastbpe")

# SeamlessLM
seamless_model = SeamlessM4TModel.from_pretrained("facebook/hf-seamless-m4t-medium")


Dataset setup:

In [None]:
# `download_asset` is provided by torchaudio.utils; it was called without being imported.
# NOTE(review): confirm that this asset key is actually available from that helper's source.
tar_path = torchaudio.utils.download_asset("dataset/IWSLT23.tst2023.en-de.tar.gz")
tar_item = "IWSLT23.tst2023.en-de/benchmark/en-de/tst2023/wav/ted_13587.wav"
with tarfile.open(tar_path, mode="r") as tarfile_:
    fileobj = tarfile_.extractfile(tar_item)
    # extractfile() returns None when the member is not a regular file or link.
    if fileobj is None:
        raise FileNotFoundError(f"{tar_item} is not a regular file inside {tar_path}")
    waveform, sample_rate = torchaudio.load(fileobj)


#def download(url: str, target_path: str):
#    with urllib.request.urlopen(url) as source, open(target_path, "wb") as output:
#        with tqdm(total=int(source.info().get("Content-Length")), ncols=80, unit='iB', unit_scale=True, unit_divisor=1024) as loop:
#            while True:
#                buffer = source.read(8192)
#                if not buffer:
#                    break

#                output.write(buffer)
#                loop.update(len(buffer))




In [None]:


dataset = 'WMT21_DA_test'
data_root_path = '../data'
src_lang = 'en'
tgt_lang = 'de'

references = []
transcriptions = []
translations = []

# `dataset` currently holds only the name of the data set. Iterating over a string yields
# single characters, which cannot be unpacked into (audio, text), so fail early with a clear
# message until the real samples are loaded.
if isinstance(dataset, str):
    raise TypeError(
        f"`dataset` is the string {dataset!r}; load an iterable of (audio, reference text) "
        "pairs before running this cell"
    )

# Each audio sample is decoded twice by Whisper: once with the "transcribe" task and once
# with the "translate" task.
for audio, text in tqdm(dataset):
    transcription = asr_model.transcribe(audio, **transcribe_options)["text"]
    translation = asr_model.transcribe(audio, **translate_options)["text"]

    transcriptions.append(transcription)
    translations.append(translation)
    references.append(text)



Running the model with whisper medium


In [None]:
asr_model = whisper.load_model("medium")
# Switch the model to inference mode.
asr_model.eval()

options = dict(language=language, beam_size=5, best_of=5)
transcribe_options = dict(task="transcribe", **options)
translate_options = dict(task="translate", **options)
references = []
transcriptions = []
translations = []
outputptobabilities = []

# `dataset` must be an iterable of (audio, reference text) pairs; a plain string would be
# iterated character by character and fail when unpacking.
if isinstance(dataset, str):
    raise TypeError(
        f"`dataset` is the string {dataset!r}; load an iterable of (audio, reference text) "
        "pairs before running this cell"
    )

for audio, text in tqdm(dataset):
    # transcribe() returns a dict containing the decoded text, the segments and the language.
    transcription_result = asr_model.transcribe(audio, **transcribe_options)
    transcription = transcription_result["text"]
    translation = asr_model.transcribe(audio, **translate_options)["text"]

    # The model's forward pass needs both a spectrogram and decoder tokens, so it cannot be
    # called on the audio directly. The confidence is taken from the decoding result instead:
    # one average token log-probability per decoded segment.
    outputptobability = [segment["avg_logprob"] for segment in transcription_result["segments"]]

    outputptobabilities.append(outputptobability)
    transcriptions.append(transcription)
    translations.append(translation)
    references.append(text)

To get the glass-box information (in this case, the weights of the cross-attention layers) from the models, I will need hooks on the layers. They are registered like this and will be called every time after the forward function is called:

Functions for the ASR part that are not hooks:
The translation-probability approach extended to the ASR part, i.e. the sequence-level transcription probability normalised by audio length.
The mean transcription probability of the system for the translation.


In [None]:
def transcriptionProbability(tensor, **options):
    """Return the softmax of ``tensor`` together with its overall mean.

    Args:
        tensor: input tensor; the softmax is taken over its last dimension.
        **options: accepted for call compatibility, not used.

    Returns:
        dict with two entries:
            "translationProb": softmax of ``tensor`` over the last dimension.
            "translationMean": mean of the raw ``tensor`` values (not of the softmax
                output), taken over the last dimension and then over dimension 0.
    """
    softmax_last_dim = torch.softmax(tensor, dim=-1)
    mean_last_dim = tensor.mean(dim=-1)
    return {
        "translationProb": softmax_last_dim,
        "translationMean": mean_last_dim.mean(dim=0),
    }


Functions that are needed for the NMT part of the approach

In [None]:
def standardDeviation(probs):
    """Population standard deviation of ``probs`` along its last dimension.

    Computed as sqrt(E[x^2] - E[x]^2). For (nearly) constant input the difference can
    come out slightly negative due to floating-point rounding, which would make the
    square root NaN, so the variance is clamped at zero first.

    Args:
        probs: floating-point tensor.

    Returns:
        Tensor with the last dimension reduced.
    """
    mean_of_squares = torch.mean(probs ** 2, dim=-1)
    square_of_mean = torch.mean(probs, dim=-1) ** 2
    variance = torch.clamp(mean_of_squares - square_of_mean, min=0)
    return torch.sqrt(variance)

def translationProbabilities(tensor, **options):
    """Log-softmax of ``tensor`` over its last dimension, computed without gradient tracking.

    Args:
        tensor: input tensor.
        **options: accepted for call compatibility, not used.

    Returns:
        Tensor of the same shape as ``tensor`` holding log-probabilities.
    """
    with torch.no_grad():
        log_probs = tensor.log_softmax(dim=-1)
    return log_probs
    
def softmaxEntropy(outputSequence):
    """Entropy of the softmax distributions in ``outputSequence``, divided by its length.

    The softmax is taken over the last dimension; ``p * log(p)`` is summed over the last
    two dimensions, negated and divided by ``len(outputSequence)`` (the size of the first
    dimension).

    The log term comes from ``log_softmax`` rather than ``log(softmax(...))``: when a
    probability underflows to 0 the latter yields ``0 * -inf = NaN``, whereas the
    log-softmax value stays finite and the term correctly contributes 0.

    Args:
        outputSequence: tensor of unnormalised scores.
            NOTE(review): presumably shaped (sequence length, vocabulary) - confirm.

    Returns:
        Tensor holding the length-normalised entropy.
    """
    log_probs = torch.nn.functional.log_softmax(outputSequence, dim=-1)
    probs = log_probs.exp()
    summed = torch.sum(torch.sum(probs * log_probs, dim=-1), dim=-1)
    return -summed / len(outputSequence)


The dropout-based approach functions
These need to calculate the mean and variance over the results of several forward passes with dropout enabled in order to model the uncertainty. The dropout perturbs the model parameters.

TODO: fix the dropout, since it probably doesn't work like this right now.


In [None]:
def DropoutTranslationProbability(probabilities, *, model=None, n_passes=30, dropout_p=0.3, **options):
    """Monte Carlo dropout estimate of the model output and its uncertainty.

    Runs ``n_passes`` stochastic forward passes of the model on the same input with its
    dropout layers switched on, then returns the mean and variance over the passes.

    Args:
        probabilities: input handed unchanged to the model on every pass.
        model: module to evaluate; defaults to the notebook-level ``asr_model``.
            NOTE(review): assumes ``model(probabilities)`` returns a single tensor - confirm
            for the model that is actually used.
        n_passes: number of stochastic forward passes (30 as proposed by the paper; 10
            should also be fine).
        dropout_p: dropout rate applied to the dropout layers during the passes.
        **options: accepted for call compatibility, not used.

    Returns:
        dict with
            "dropoutprobabilty": mean of the outputs over the passes,
            "variance": population variance of the outputs over the passes,
            "D-combo": ``1 - mean / variance`` (infinite or NaN where the variance is 0).

    Raises:
        ValueError: if ``n_passes`` is smaller than 1.
    """
    import warnings

    if n_passes < 1:
        raise ValueError("n_passes must be at least 1")

    net = asr_model if model is None else model

    # Only dropout *modules* can be toggled from the outside.
    dropout_types = (torch.nn.Dropout, torch.nn.Dropout2d, torch.nn.Dropout3d)
    dropout_layers = [layer for layer in net.modules() if isinstance(layer, dropout_types)]
    if not dropout_layers:
        warnings.warn(
            "model has no dropout modules; all passes will be identical and the variance zero"
        )

    # Remember the previous configuration so the model is left exactly as it was found.
    saved_state = [(layer, layer.p, layer.training) for layer in dropout_layers]
    try:
        for layer in dropout_layers:
            layer.p = dropout_p
            layer.train()  # dropout is a no-op in eval mode
        with torch.no_grad():
            passes = torch.stack([net(probabilities) for _ in range(n_passes)])
    finally:
        for layer, previous_p, was_training in saved_state:
            layer.p = previous_p
            layer.train(was_training)

    # Statistics over the pass dimension (dim 0 of the stacked outputs).
    prob = passes.mean(dim=0)
    var = ((passes - prob) ** 2).mean(dim=0)
    return {"dropoutprobabilty": prob, "variance": var, "D-combo": 1 - prob / var}

In [None]:
# Gather the per-sample results in one table, one column per result list.
data = pd.DataFrame(
    {
        "reference": references,
        "transcription": transcriptions,
        "translation": translations,
        "probability": outputptobabilities,
    }
)
data

Hooks if needed

In [None]:
# QKs = [None] * model.dims.n_text_layer

#for i, block in enumerate(model.decoder.blocks):
 #   block.cross_attn.register_forward_hook(
  #      lambda _, ins, outs, index=i: QKs.__setitem__(index, outs[-1])
   # )

