# Inference examples

In [1]:
# Do not forget to install all dependencies first:
!pip install -Uqq WhisperSpeech

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m630.6/630.6 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m21.3/21.3 MB[0m [31m49.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.7/3.7 MB[0m [31m81.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m117.8/117.8 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m526.7/526.7 kB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for encodec (setup.py) ... [?25l[?25hdone


In [2]:
def is_colab():
    try: import google.colab; return True
    except: return False

import torch
if not torch.cuda.is_available():
    if is_colab(): raise BaseException("Please change the runtime type to GPU. In the menu: Runtime -> Change runtime type (the free T4 instance is enough)")
    else:          raise BaseException("Currently the example notebook requires CUDA, make sure you are running this on a machine with a GPU.")

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
import torch
import torch.nn.functional as F

from IPython.display import Markdown, HTML

## The whole pipeline

In [5]:
# check "7. Pipeline.ipynb"
from whisperspeech.pipeline import Pipeline

In [6]:
# let's start with the fast SD S2A model
pipe = Pipeline(s2a_ref='collabora/whisperspeech:s2a-q4-tiny-en+pl.model')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


t2s-small-en+pl.model:   0%|          | 0.00/856M [00:00<?, ?B/s]

s2a-q4-tiny-en+pl.model:   0%|          | 0.00/80.3M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/503 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/40.4M [00:00<?, ?B/s]

Downloading: "https://dl.fbaipublicfiles.com/encodec/v0/encodec_24khz-d7cc33bc.th" to /root/.cache/torch/hub/checkpoints/encodec_24khz-d7cc33bc.th
100%|██████████| 88.9M/88.9M [00:00<00:00, 107MB/s]


In [7]:
# this is very slow right now since our inference code is not very optimized
# but even without this crucial optimization it is still better than real-time on an RTX 4090
pipe.generate_to_notebook("""
This is the first demo of Whisper Speech, a fully open source text-to-speech model trained by Collabora and Lion on the Juwels supercomputer.
""")



In [8]:
# The model knows how to speak in Polish
pipe.generate_to_notebook("""
To jest pierwszy test naszego modelu. Pozdrawiamy serdecznie.
""", lang='pl')

In [9]:
# We can also mix different languages (e.g. for borrowed words) in a single sentence
stoks = pipe.t2s.generate(["To jest pierwszy test wielojęzycznego ", " Whisper Speech ", ", modelu zamieniającego tekst na mowę, który Collabora i Laion nauczyli na superkomputerze", " Jewels."], lang=['pl', 'en', 'pl', 'en'])
pipe.vocoder.decode_to_notebook(pipe.s2a.generate(stoks, pipe.default_speaker.unsqueeze(0)))

RuntimeError: Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor

In [10]:
stoks = pipe.t2s.generate(["I love to eat eastern european food! Especially ", "pierogi i bigos."], lang=['en', 'pl'], cps=11)
pipe.vocoder.decode_to_notebook(pipe.s2a.generate(stoks, pipe.default_speaker.unsqueeze(0)))

RuntimeError: Number of dimensions of repeat dims can not be smaller than number of dimensions of tensor

## Voice cloning

In [13]:
# we can give it an audio file reference to get zero-shot voice cloning
#
# you can provide a URL or upload your own audio files
pipe.generate_to_notebook("""
This is the first demo of Whisper Speech, a fully open source text-to-speech model trained by Collabora and Lion on the Juwels supercomputer.
""", lang='en', speaker='https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg')

In [None]:
# it works even better (but slower) with the HQ model
pipe = Pipeline()

In [12]:
pipe.generate_to_notebook("""
You currently have zero compute units available. Resources offered free of charge are not guaranteed. Purchase more units here.
At your current usage level, this runtime may last up to 3 hours 10 minutes.
""", lang='en', speaker='https://upload.wikimedia.org/wikipedia/commons/7/75/Winston_Churchill_-_Be_Ye_Men_of_Valour.ogg')