In [1]:
from IPython import get_ipython
import os

if 'google.colab' in str(get_ipython()):
    print("Running in Google Colab, installing requirements...")
    # URL of the requirements file
    requirements_url = "https://raw.githubusercontent.com/willdalh/ml-course/main/requirements.txt"

    # Check if the requirements file already exists
    if not os.path.exists('requirements.txt'):
        # Download the requirements file
        !wget {requirements_url}

    # Install the requirements
    !python -m pip install --user -q -r requirements.txt

# Transkribering med Whisper
Høsten 2022 lanserte OpenAI [Whisper](https://openai.com/research/whisper), en modell som transkriberer tale til tekst. Denne kan vi laste inn og kjøre lokalt.

In [2]:
import whisper

I [repositoryet](https://github.com/openai/whisper) oppgis det hvilke modeller som er tilgjengelige, og hvilke ressurser de krever. Vi prøver ut `tiny.en`.

<table>
<thead>
<tr>
<th align="center">Size</th>
<th align="center">Parameters</th>
<th align="center">English-only model</th>
<th align="center">Multilingual model</th>
<th align="center">Required VRAM</th>
<th align="center">Relative speed</th>
</tr>
</thead>
<tbody>
<tr>
<td align="center">tiny</td>
<td align="center">39 M</td>
<td align="center"><code>tiny.en</code></td>
<td align="center"><code>tiny</code></td>
<td align="center">~1 GB</td>
<td align="center">~32x</td>
</tr>
<tr>
<td align="center">base</td>
<td align="center">74 M</td>
<td align="center"><code>base.en</code></td>
<td align="center"><code>base</code></td>
<td align="center">~1 GB</td>
<td align="center">~16x</td>
</tr>
<tr>
<td align="center">small</td>
<td align="center">244 M</td>
<td align="center"><code>small.en</code></td>
<td align="center"><code>small</code></td>
<td align="center">~2 GB</td>
<td align="center">~6x</td>
</tr>
<tr>
<td align="center">medium</td>
<td align="center">769 M</td>
<td align="center"><code>medium.en</code></td>
<td align="center"><code>medium</code></td>
<td align="center">~5 GB</td>
<td align="center">~2x</td>
</tr>
<tr>
<td align="center">large</td>
<td align="center">1550 M</td>
<td align="center">N/A</td>
<td align="center"><code>large</code></td>
<td align="center">~10 GB</td>
<td align="center">1x</td>
</tr>
</tbody>
</table>

In [3]:
# Load the English-only "tiny" checkpoint and print its module tree under a heading.
model = whisper.load_model("tiny.en")
print("Model architecture:", model, sep="\n")

100%|█████████████████████████████████████| 72.1M/72.1M [00:47<00:00, 1.60MiB/s]


Model architecture:
Whisper(
  (encoder): AudioEncoder(
    (conv1): Conv1d(80, 384, kernel_size=(3,), stride=(1,), padding=(1,))
    (conv2): Conv1d(384, 384, kernel_size=(3,), stride=(2,), padding=(1,))
    (blocks): ModuleList(
      (0-3): 4 x ResidualAttentionBlock(
        (attn): MultiHeadAttention(
          (query): Linear(in_features=384, out_features=384, bias=True)
          (key): Linear(in_features=384, out_features=384, bias=False)
          (value): Linear(in_features=384, out_features=384, bias=True)
          (out): Linear(in_features=384, out_features=384, bias=True)
        )
        (attn_ln): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
        (mlp): Sequential(
          (0): Linear(in_features=384, out_features=1536, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=1536, out_features=384, bias=True)
        )
        (mlp_ln): LayerNorm((384,), eps=1e-05, elementwise_affine=True)
      )
    )
    (ln_post): LayerNor

In [4]:
import os
from IPython.display import Audio
import ipywidgets as widgets
# Directory (relative to the notebook) that holds the example audio clips.
audio_dir = "../res/audio"


def create_dropdown(default_value):
    """Build a dropdown widget listing the audio files found in ``audio_dir``.

    Only regular files that have a file extension are offered. Sub-directories
    and extension-less dotfiles (e.g. ``.DS_Store``) are skipped, and the names
    are sorted so the option order does not depend on the file system.

    Args:
        default_value: File name to preselect in the dropdown.

    Returns:
        A ``widgets.Dropdown`` whose options are the audio file names.
    """
    files = sorted(
        name
        for name in os.listdir(audio_dir)
        # splitext() yields an empty extension for names like ".DS_Store";
        # isfile() rejects directories whose name happens to contain a dot.
        if os.path.splitext(name)[1] and os.path.isfile(os.path.join(audio_dir, name))
    )
    return widgets.Dropdown(options=files, description='Audio file:', value=default_value)


# Show the selector with the English clip preselected.
dropdown = create_dropdown(default_value="english_example.mp3")
dropdown

Dropdown(description='Audio file:', options=('english_example.mp3', 'norwegian_example.mp3'), value='english_e…

In [5]:
# Build the path of whichever file is currently selected in the dropdown widget,
# then hand it to IPython's Audio display object so it becomes the cell output.
file_path = f"{audio_dir}/{dropdown.value}"
Audio(file_path)

In [6]:
# Transcribe the selected file with the Whisper model loaded earlier in the notebook.
# The bare last expression displays the "text" entry of the returned result.
result = model.transcribe(file_path)
result["text"]



' Now I am to come death, a destroyer of worlds.'

## Norsk modell
Nasjonalbiblioteket finjusterer sine egne varianter av Whisper som spesialiserer seg på norsk tale. Vi kan prøve ut beta-versjonene gjennom Hugging Face.

In [7]:
from transformers import pipeline
# Speech-recognition pipeline backed by the NbAiLab "nb-whisper-tiny-beta" checkpoint.
asr = pipeline(task="automatic-speech-recognition", model="NbAiLab/nb-whisper-tiny-beta")

config.json:   0%|          | 0.00/2.25k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/151M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/3.54k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/822 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/836k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.48M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/34.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.08k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


preprocessor_config.json:   0%|          | 0.00/339 [00:00<?, ?B/s]

In [8]:
# Recreate the file selector, this time with the Norwegian clip preselected.
dropdown = create_dropdown("norwegian_example.mp3")
dropdown

Dropdown(description='Audio file:', index=1, options=('english_example.mp3', 'norwegian_example.mp3'), value='…

In [9]:
# Point file_path at the file currently chosen in the dropdown and pass it to
# IPython's Audio display object, which becomes the cell output.
file_path = f"{audio_dir}/{dropdown.value}"
Audio(file_path)

In [10]:
# Re-read the dropdown so this cell picks up a changed selection on its own.
file_path = f"{audio_dir}/{dropdown.value}"

# Generation options forwarded to the model: transcribe, with the language set to 'no'.
decoding_options = {'task': 'transcribe', 'language': 'no'}
asr(file_path, generate_kwargs=decoding_options)

{'text': ' Hvorfor fullfører du ikke løpet? Hvorfor skal jeg sitte og høre på surre prat?'}

Transkriberer vi talen fra `norwegian_example.mp3`, ser vi at modellen fjerner mange unødvendige ord. Dette kommer av at mesteparten av dataene Nasjonalbiblioteket har brukt til trening, er teksting fra NRK, og transkriberingene som gjøres av dem, er gjerne kompakte for å få plass på TV-skjermen.