In [None]:
import locale


def _utf8_preferred_encoding(do_setlocale=True):
    """Report UTF-8 as the preferred encoding, whatever the real locale is.

    The ``do_setlocale`` parameter mirrors the signature of the standard
    ``locale.getpreferredencoding`` and is ignored, so callers that pass it
    (e.g. ``locale.getpreferredencoding(False)``) keep working instead of
    raising ``TypeError``.
    """
    return "UTF-8"


# Override the lookup for the rest of this kernel session.
# NOTE(review): presumably a workaround for `!shell` commands failing under a
# non-UTF-8 locale in hosted notebooks -- confirm it is still needed.
locale.getpreferredencoding = _utf8_preferred_encoding

In [None]:
# Install transformers
!pip install git+https://github.com/huggingface/transformers.git

In [None]:
# Fetch the pretrained Bark checkpoint and build the model from it
import torch
from transformers import BarkModel

# NOTE(review): the original comment read "Can be suno/bark", which is the value
# already used -- it probably meant a smaller alternative checkpoint; confirm.
model = BarkModel.from_pretrained(
    "suno/bark",
)

In [None]:
# Run on the first CUDA GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Move the model's weights to the chosen device
model = model.to(device)

In [None]:
# Load the Bark processor, which prepares incoming text for the model
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "suno/bark",
)

In [None]:
# Basic generation: plain text prompt, no voice preset

# Run the prompt through the processor and move the result to the model's device
text_prompt = "Audio is generated with Bark. Hi from Ai Noodle!"
inputs = processor(text_prompt).to(device)

# Generate the speech output from the processed inputs
speech_output = model.generate(**inputs)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [None]:
# Play the generated audio inline in the notebook (e.g. in Colab)
from IPython.display import Audio

# The sample rate is read from the model's generation config
sampling_rate = model.generation_config.sample_rate

# First item of the generated batch, copied to host memory as a NumPy array
Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)

In [None]:
# Save the generated audio to a WAV file.
# The `scipy.io.wavfile` submodule is imported explicitly: a bare `import scipy`
# does not guarantee that submodules are loaded on every SciPy version, which
# would make `scipy.io.wavfile` fail with AttributeError. This form still binds
# the top-level name `scipy`.
import scipy.io.wavfile

scipy.io.wavfile.write(
    "bark_out.wav",
    rate=sampling_rate,
    data=speech_output[0].cpu().numpy(),  # first item of the batch, on the host
)

In [None]:
# Generate the same prompt again, this time with a specific speaker preset
voice_preset = "v2/en_speaker_9"
text_prompt = "Audio is generated with Bark. Hi from Ai Noodle!"

# Process the prompt together with the preset and move the result to the device
inputs = processor(text_prompt, voice_preset=voice_preset).to(device)

# Generate the speech output
speech_output = model.generate(**inputs)

# Play the result inline
Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [None]:
# Multilingual example: a French sentence paired with a French speaker preset
inputs = processor(
    "J'aime la france.",
    voice_preset="v2/fr_speaker_3",
).to(device)

# Generate the speech output
speech_output = model.generate(**inputs)

# Play the result inline
Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


In [None]:
# Non-speech cues are written in square brackets inside the prompt text
inputs = processor(
    "[clears throat] Hello I love AI Noodle [laughter]",
).to(device)

# Generate the speech output
speech_output = model.generate(**inputs)

# Play the result inline
Audio(data=speech_output[0].cpu().numpy(), rate=sampling_rate)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
