# Demonstration of the components involved in the system

## Record

In [1]:
import threading
import sounddevice as sd
from scipy.io.wavfile import write
import numpy as np
import ipywidgets as widgets
from IPython.display import display
import time

# Recording configuration and the shared state touched by the callbacks below.
fs = 44_100  # sampling frequency in Hz
channels = 1  # single (mono) input channel
is_recording = False  # True while a recording session is running
recording = np.empty(0)  # samples captured so far; starts out empty

def record_audio(indata, frames, time, status):
    """Stream callback: append the newly delivered samples to ``recording``.

    Only ``indata`` is used; ``frames``, ``time`` and ``status`` are accepted
    because the stream passes them, but they are ignored here. Note that the
    ``time`` parameter shadows the ``time`` module inside this function.
    """
    global recording
    # Flatten both sides and join them into one 1-D buffer.
    captured_so_far = np.ravel(recording)
    new_samples = np.ravel(indata.copy())
    recording = np.concatenate((captured_so_far, new_samples))

def update_recording_time():
    """Idle in 100 ms steps until the ``is_recording`` flag is cleared.

    The elapsed-time label update that this loop was meant to drive is
    currently disabled, so the loop only sleeps and re-checks the flag.
    """
    poll_interval = 0.1  # seconds between checks of the flag
    while True:
        if not is_recording:
            break
        time.sleep(poll_interval)

# The sd.InputStream that is currently capturing audio, or None when idle.
# Keeping a reference is what lets stop_recording() actually stop the capture.
_active_stream = None


def normalize_to_int16(audio):
    """Scale ``audio`` so its peak magnitude maps to 32767 and cast to int16.

    Args:
        audio: array-like of samples (any shape; it is flattened).

    Returns:
        A 1-D ``int16`` array. Empty input gives an empty array and an
        all-zero (silent) input gives zeros, instead of raising or producing
        NaNs from a division by zero.
    """
    samples = np.asarray(audio, dtype=np.float64).ravel()
    if samples.size == 0:
        return np.zeros(0, dtype=np.int16)
    peak = np.max(np.abs(samples))
    if peak == 0:
        return np.zeros(samples.shape, dtype=np.int16)
    return np.int16(samples / peak * 32767)


def start_recording(button):
    """Button handler: open an input stream and begin capturing audio.

    Args:
        button: the widget that triggered the click (unused).

    A click while a session is already running is ignored, so two streams
    never append to the same buffer at once.
    """
    global is_recording, recording, _active_stream
    if _active_stream is not None:
        print("Already recording.")
        return
    recording = np.array([])  # Reset recording
    # InputStream.start() returns immediately, so no helper thread is needed;
    # the stream is only published once it has started successfully.
    stream = sd.InputStream(callback=record_audio, channels=channels, samplerate=fs)
    stream.start()
    _active_stream = stream
    is_recording = True
    print("Recording started...")
    # Update recording time in a separate thread; daemon so that a session
    # which is never stopped cannot keep the interpreter alive.
    threading.Thread(target=update_recording_time, daemon=True).start()


def stop_recording(button):
    """Button handler: stop capturing, then normalise and save the audio.

    Args:
        button: the widget that triggered the click (unused).

    The capture is written to ``speaker.wav`` as 16-bit PCM. If nothing was
    captured, no file is written.
    """
    global is_recording, _active_stream
    is_recording = False
    # sd.stop() only affects play()/rec()/playrec(); a stream object has to be
    # stopped and closed through its own methods.
    if _active_stream is not None:
        try:
            _active_stream.stop()
        finally:
            _active_stream.close()
            _active_stream = None
    print("Recording stopped.")
    if recording.size == 0:
        print("No audio was captured; nothing saved.")
        return
    # Normalize to 16-bit range and save
    norm_audio = normalize_to_int16(recording)
    filename = 'speaker.wav'
    write(filename, fs, norm_audio)
    print(f"Audio saved as {filename}")
# Build the two control buttons from their captions.
start_button, stop_button = (
    widgets.Button(description=caption)
    for caption in ("Start Recording", "Stop Recording")
)
# time_label = widgets.Label(value="Press 'Start Recording' to begin")

# Wire each button to its click handler.
start_button.on_click(start_recording)
stop_button.on_click(stop_recording)

# Render both buttons in the cell output.
display(start_button, stop_button)



Button(description='Start Recording', style=ButtonStyle())

Button(description='Stop Recording', style=ButtonStyle())

Recording started...
Recording stopped.
Audio saved as speaker.wav


## Audio to text

In [None]:
## Whisper English to English
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from datasets import load_dataset

# load model and processor
processor = WhisperProcessor.from_pretrained("openai/whisper-large")
model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-large")
# Clear the forced decoder prompt ids stored in the config
# (presumably so language/task are not pinned -- confirm against the model card).
model.config.forced_decoder_ids = None

# load dummy dataset and read audio files
ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
sample = ds[0]["audio"]
input_features = processor(sample["array"], sampling_rate=sample["sampling_rate"], return_tensors="pt").input_features

# generate token ids
predicted_ids = model.generate(input_features)

# decode token ids to text, once with and once without the special tokens.
# The two results get distinct names so the first is no longer overwritten
# before it can be inspected.
transcription_with_special_tokens = processor.batch_decode(predicted_ids, skip_special_tokens=False)
transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)

In [1]:
# Whisper usage
import whisper

def transcribe_audio(file_path, model_name="base"):
    """Transcribe an audio file with Whisper and return the recognised text.

    Args:
        file_path: path of the audio file to transcribe.
        model_name: name of the Whisper checkpoint to load (default "base").

    Returns:
        The value stored under the "text" key of the transcription result.
    """
    # Load the Whisper model
    model = whisper.load_model(model_name)

    # Load the audio file
    audio = whisper.load_audio(file_path)

    # The full waveform is passed on: transcribe() does its own 30-second
    # windowing, whereas calling whisper.pad_or_trim() first would cut the
    # recording down to a single 30-second window.
    result = model.transcribe(audio)

    # Return the transcribed text
    return result["text"]

# Example usage
file_path = "speaker.wav"
transcribed_text = transcribe_audio(file_path)
print("Transcribed Text:", transcribed_text)




Transcribed Text: 你好


## Text-to-text translation

In [2]:
from transformers import pipeline
# Translation pipeline backed by the Helsinki-NLP/opus-mt-zh-en checkpoint.
translator = pipeline(task="translation", model="Helsinki-NLP/opus-mt-zh-en")
# Translate the transcription and keep the text of the first result entry.
translated_text = translator(transcribed_text, max_length=512)[0]["translation_text"]

In [3]:
# Show the translation stored in `translated_text`.
print("Translated Text:", translated_text)

Translated Text: Hello.


## Text to speech

### XTTS-V2

In [None]:
# use the model directly
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# torch is imported here so this cell also runs on a fresh kernel; the
# notebook already depends on it in later cells.
import torch

# Load the XTTS configuration and checkpoint.
# NOTE: the paths below are placeholders and must point at a real XTTS download.
config = XttsConfig()
config.load_json("/path/to/xtts/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir="/path/to/xtts/", eval=True)
# Move to the GPU only when CUDA is present; an unconditional model.cuda()
# raises on machines without it.
if torch.cuda.is_available():
    model.cuda()

# Synthesize the sentence using the reference clip as the speaker.
outputs = model.synthesize(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    config,
    speaker_wav="/data/TTS-public/_refclips/3.wav",
    gpt_cond_len=3,
    language="en",
)

In [6]:
# Use the model from api
from IPython.display import Audio
from TTS.api import TTS
# Alternative checkpoints that were tried are kept here for reference.
# tts = TTS("tts_models/multilingual/multi-dataset/your_tts", gpu=False)
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2", gpu=False)
# tts = TTS("tts_models/zh-CN/baker/tacotron2-DDC-GST")

# generate speech by cloning a voice using default settings
# tts.tts_to_file(text="Lucas's presenting issues include difficulties falling asleep, growing reluctance to attend school",
#                 file_path="output.wav",
#                 speaker_wav="speaker.wav",
#                 language="en")
# wav = tts.tts(text=translated_text, speaker_wav="speaker.wav", language="en")
wav = tts.tts(text="Hello, what 's up",speaker_wav = "speaker.wav", language="en")

# Play back at the rate the synthesizer reports for its output rather than a
# hard-coded 22050 Hz, which plays the clip at the wrong speed and pitch
# whenever the model's output rate differs.
Audio(wav, rate=tts.synthesizer.output_sample_rate)

 > tts_models/multilingual/multi-dataset/xtts_v2 is already downloaded.
 > Using model: xtts
 > Text splitted to sentences.
["Hello, what 's up"]
 > Processing time: 4.281552791595459
 > Real-time factor: 1.4126202875071803


### Parler TTS

In [7]:
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
import soundfile as sf
import torch

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler_tts_mini_v0.1").to(device)
tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler_tts_mini_v0.1")

# Text to be spoken: the translation produced earlier in the notebook.
prompt = translated_text

# Voice description that conditions the generation. The pronouns now agree
# with "male speaker"; the previous text mixed "male" with "her"/"She".
description = "A male speaker with a slightly low-pitched voice delivers his words quite expressively, in a very confined sounding environment with clear audio quality. He speaks very fast."

input_ids = tokenizer(description, return_tensors="pt").input_ids.to(device)
prompt_input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
audio_arr = generation.cpu().numpy().squeeze()
# Write the file with the model's own sampling rate; a hard-coded 22050 would
# play the clip at the wrong speed whenever the model's rate differs.
sf.write("overview.wav", audio_arr, model.config.sampling_rate)

Using the model-agnostic default `max_length` (=2580) to control the generation length. We recommend setting `max_new_tokens` to control the maximum length of the generation.


### Bark from SUNO-AI

In [None]:
# from bark import SAMPLE_RATE, generate_audio, preload_models
# from scipy.io.wavfile import write as write_wav
# from IPython.display import Audio

# # download and load all models
# preload_models()

# # print(SAMPLE_RATE)

# # generate audio from text
# text_prompt = """
#      Hello, my name is Suno. And, uh — and I like pizza. [laughs] 
#      But I also have other interests such as playing tic tac toe.
# """
# audio_array = generate_audio(text_prompt)

# # save audio to disk
# write_wav("bark_generation.wav", SAMPLE_RATE, audio_array)
  
# # play text in notebook
# Audio(audio_array, rate=SAMPLE_RATE)

In [None]:
# text_prompt = """
#      ♪ In the jungle, the mighty jungle, the lion barks tonight ♪
# """
# audio_array = generate_audio(text_prompt)

# # save audio to disk
# write_wav("bark_generation.wav", SAMPLE_RATE, audio_array)
  
# # play text in notebook
# Audio(audio_array, rate=SAMPLE_RATE)

## Test MPS availability

In [1]:
import torch
# Print the installed PyTorch version string.
print(torch.__version__)
# Print whether torch.backends.mps reports the MPS backend as available.
print("MPS available: ", torch.backends.mps.is_available())


2.4.0.dev20240410
MPS available:  True
