In [1]:
import logging
from transformers import AutoProcessor, AutoModel
import torch
from transformers import AutoProcessor, AutoModelForSeq2SeqLM

  from .autonotebook import tqdm as notebook_tqdm


In [14]:
# Initialise logging
logging.basicConfig(level=logging.INFO)

def initialise_model(model_name="suno/bark-small"):
    """Initialise the processor and model"""
    try:
        processor = AutoProcessor.from_pretrained(model_name)
        model = AutoModel.from_pretrained(model_name)
        logging.info("Model and processor initialised successfully.")
        return processor, model
    except Exception as e:
        logging.error(f"Failed to initialise model and processor: {e}")
        return None, None

def generate_speech(texts, processor, model, voice_preset="v2/en_speaker_6"): # change voice here
    """Generate speech from the input texts"""
    try:
        inputs = processor(
            text=texts,
            return_tensors="pt",
            voice_preset=voice_preset,
            max_length=512,  
        )
        # Generate the attention mask
        inputs["attention_mask"] = inputs["input_ids"].ne(processor.pad_token_id).int()
        
        speech_values = model.generate(
            input_ids=inputs["input_ids"], 
            attention_mask=inputs["attention_mask"], 
            do_sample=True
        )
        logging.info("Speech generated successfully.")
        return speech_values
    except Exception as e:
        logging.error(f"Failed to generate speech: {e}")
        return None


def play_audio(speech_values):
    """Play the generated speech audio."""
    try:
        # save the audio to a file
        sf.write('output.wav', speech_values.numpy(), 22050)

        # Initialise pyaudio
        p = pyaudio.PyAudio()

        # open the audio file and play it
        wf = wave.open('output.wav', 'rb')
        stream = p.open(format=p.get_format_from_width(wf.getsampwidth()),
                        channels=wf.getnchannels(),
                        rate=wf.getframerate(),
                        output=True)
        data = wf.readframes(1024)
        while data:
            stream.write(data)
            data = wf.readframes(1024)

  
        stream.stop_stream()
        stream.close()
        p.terminate()
        logging.info("Audio played successfully.")
    except Exception as e:
        logging.error(f"Failed to play audio: {e}")


def main():
    # Initialise the model and processor
    processor, model = initialise_model()

    if processor and model:
        while True:
            # user input
            text = input("You: ")

            # Generate speech
            speech_values = generate_speech([text], processor, model)

            # Play speech audio
            if speech_values is not None:
                play_audio(speech_values[0])

if __name__ == "__main__":
    main()

INFO:root:Model and processor initialised successfully.


You:  hi


ERROR:root:Failed to generate speech: 'BarkProcessor' object has no attribute 'pad_token_id'


KeyboardInterrupt: Interrupted by user

In [12]:
from IPython.display import Audio
sampling_rate = model.generation_config.sample_rate
Audio(speech_values.cpu().numpy().squeeze(), rate=sampling_rate)

In [None]:
from transformers import BarkModel, AutoProcessor
import torch
from IPython.display import Audio

# Initialise the model and processor
model = BarkModel.from_pretrained("suno/bark-small")
processor = AutoProcessor.from_pretrained("suno/bark")

# check for GPU availability 
device = "cuda:0" if torch.cuda.is_available() else "cpu"
model = model.to(device)

# set the sampling rate
sampling_rate = model.generation_config.sample_rate

def generate_speech(text_prompt):

    inputs = processor(text_prompt)
    

    speech_output = model.generate(**inputs.to(device))
    
    # audio object from the output
    audio = Audio(speech_output[0].cpu().numpy(), rate=sampling_rate)
    
    return audio

# conversation loop
while True:
    # Get a text prompt from the user
    text_prompt = input("You: ")
    
    # Check if the user wants to end the conversation
    if text_prompt.lower() in ["quit", "exit", "bye"]:
        print("Goodbye!")
        break
    
    # Generate speech from the text prompt
    audio = generate_speech(text_prompt)
    

    display(audio)

You:  hi


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.
