In [None]:
import torch
from IPython.display import Audio
from transformers import pipeline

# Build the Bark text-to-speech pipeline. Use the GPU when PyTorch can see one
# and fall back to the CPU otherwise, so the cell still runs on a CUDA-less
# machine (same selection rule as the manual-loading cell later in the notebook).
pipeline_device = "cuda" if torch.cuda.is_available() else "cpu"
synthesiser = pipeline("text-to-speech", "suno/bark", device=pipeline_device)

In [None]:
# Text to be spoken; the parenthesised cue is sent to the model as part of the prompt.
pipeline_prompt = "(clears throat) Hey, my dog is better than your dog!"

# Options handed to the pipeline via `forward_params`
# (presumably forwarded to the model's generation call -- confirm in the pipeline docs).
pipeline_forward_params = {"do_sample": True}

# The result is indexed below by its "audio" and "sampling_rate" keys.
speech = synthesiser(pipeline_prompt, forward_params=pipeline_forward_params)

# Last expression of the cell: renders an inline audio player for the generated clip.
Audio(data=speech["audio"], rate=speech["sampling_rate"])

In [None]:
# Alternative approach: load the Bark processor and model directly instead of using the pipeline helper
from transformers import AutoProcessor, BarkModel
import torch

# Choose where the model will live: the GPU if PyTorch can see one, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")  # shows 'cuda' only when a GPU is visible, otherwise 'cpu'

# Numeric precision requested when loading: 32-bit floating point.
torch_dtype = torch.float32

# Identifier of the pre-trained Bark checkpoint to load.
model_type = "suno/bark"

# The processor prepares the model inputs from the text prompt and voice preset;
# AutoProcessor resolves the right processor class from the checkpoint name.
processor = AutoProcessor.from_pretrained(model_type, torch_dtype=torch_dtype)

# Load the Bark weights and place the model on the selected device in one step.
model = BarkModel.from_pretrained(model_type, torch_dtype=torch_dtype).to(device)

In [None]:
# Generate speech with the manually loaded Bark model, save it as a WAV file
# under out/, and play it inline. Relies on `processor`, `model` and `device`
# created by the model-loading cell.
import os
import time

import scipy.io.wavfile  # explicit submodule import: `import scipy` alone may not load scipy.io.wavfile
from IPython.display import Audio

voice_preset = "v2/en_speaker_9"
text_prompt = '''
[clears throat] Hey bitch, my dog could kick your dog's ass no problem!
'''

# Turn the text prompt and voice preset into model inputs.
inputs = processor(text=text_prompt, voice_preset=voice_preset)

# Run generation with the inputs moved to the same device as the model.
audio_arrays = model.generate(**inputs.to(device))

# Bring the result back to the CPU as a NumPy array and drop size-one axes.
audio_arrays = audio_arrays.cpu().numpy().squeeze()

# Sampling rate comes from the model's generation config
# (the original author notes the default is 24000 Hz).
sample_rate = model.generation_config.sample_rate

# Write the clip to out/<unix timestamp>.wav. time.time() must be *called*:
# interpolating the bare function produced the fixed, invalid-looking name
# "out/<built-in function time>.wav" that was overwritten on every run.
# The directory is created on demand so a fresh checkout does not fail.
os.makedirs("out", exist_ok=True)
output_path = f"out/{time.time()}.wav"
scipy.io.wavfile.write(output_path, rate=sample_rate, data=audio_arrays)

# Last expression of the cell: renders an inline audio player for the clip.
Audio(audio_arrays, rate=sample_rate)