# Welcome to Hanasu TTS Trainer

In [None]:
import zipfile

# Unpack every member of wavs.zip into the "zipfolder" directory.
with zipfile.ZipFile('wavs.zip', 'r') as archive:
    archive.extractall('zipfolder')

In [None]:
# Environment setup: install Python requirements, build the monotonic_align
# extension, then install espeak-ng and phonemizer.

# Install the dependencies listed in the project's requirements.txt.
%cd /content/drive/MyDrive/hanasu/hanasu
!pip install -r requirements.txt
# Build the monotonic_align extension in place (build_ext --inplace).
# NOTE(review): this path points at vits2_pytorch-main rather than the hanasu
# checkout used above - confirm it is the intended location.
%cd /content/drive/MyDrive/vits2_pytorch-main/monotonic_align
!python setup.py build_ext --inplace
# Step back up to the parent of monotonic_align.
%cd ../
# System package espeak-ng plus the phonemizer Python package.
# Presumably espeak-ng serves as the phonemizer backend - TODO confirm.
!apt-get update && apt-get install -y espeak-ng
!pip install phonemizer

# Preprocess the dataset

In [None]:
# Run the project's filelist preprocessing over the transcript file.
from text import preprocess_filelists

filelists = ["transcript.txt"]
preprocess_filelists(filelists)

# Fix Bugs

In [None]:
# Pin specific package versions. Only the matplotlib pin records its reason
# (see its inline comment); the reasons for the other pins are not stated here.
# Each install is kept as a separate command, in this order.
!pip install librosa==0.9.1
!pip install tensorboard==2.12.0 tensorflow==2.12.0
!pip install matplotlib==3.7.0 # This fixes: 'FigureCanvasAgg' object has no attribute 'tostring_rgb'
!pip install numpy==1.26.4

In [None]:
import os

# Delete every "*.mel.pt" file that sits directly inside the wavs directory
# (presumably cached mel tensors written next to the audio - TODO confirm).
directory = '/content/Yuna/wavs'
cached_files = [entry for entry in os.listdir(directory) if entry.endswith(".mel.pt")]
for entry in cached_files:
    os.remove(os.path.join(directory, entry))

In [None]:
# Relocate every ".wav" file found directly inside source_dir into destination_dir.
import os
import shutil

source_dir = 'source_folder'
destination_dir = 'destination_folder'

# Create the target directory up front; an already existing one is not an error.
os.makedirs(destination_dir, exist_ok=True)

# Select the wav files first, then move them one at a time under the same name.
wav_names = [entry for entry in os.listdir(source_dir) if entry.endswith(".wav")]
for entry in wav_names:
    shutil.move(os.path.join(source_dir, entry), os.path.join(destination_dir, entry))

print(f"Moved all wav files from {source_dir} to {destination_dir}")

# Training

In [None]:
# Change the working directory to the hanasu checkout on Drive; the training
# command in the following cell is launched from here.
%cd /content/drive/MyDrive/hanasu

In [None]:
# Launch train_ms.py with the config file passed via -c and "Yuna" passed via -m
# (presumably the model/run name - TODO confirm against train_ms.py's arguments).
!python train_ms.py -c /content/drive/MyDrive/hanasu/configs/config.json -m Yuna

In [None]:
# Presumably meant to release the hosted runtime once training has finished.
# NOTE(review): the standard-library `os` module has no `runtime` attribute, so
# this call raises AttributeError as written. On Colab the runtime is normally
# released with `from google.colab import runtime; runtime.unassign()` -
# confirm the intended call and replace this line.
import os
os.runtime.unassigned_runtime_shutdown()

# Inference

In [None]:
# Move into the sibling "hanasu" directory, relative to the current working
# directory; the inference cells below use paths relative to it
# (e.g. ./configs/config.json).
%cd ../hanasu

In [None]:
from models import inference, load_model
from scipy.io.wavfile import write
import sounddevice as sd

In [None]:
# Prompt that the inference cells below will synthesize.
text = "Text for generation."
# Alternatively, load the prompt from a file:
# with open("../output.txt", 'r', encoding='utf-8') as f: text = f.read()

In [None]:
# Load the model from the config file and the G_158000 checkpoint onto the "mps" device.
model = load_model(
    "./configs/config.json",
    "/Users/yuki/Downloads/G_158000.pth",
    device="mps",
)

In [None]:
# One-shot synthesis: with stream=False the whole utterance comes back from a
# single call and is then written to disk.
result = inference(
    model=model, text=text,
    noise_scale=0.2, noise_scale_w=1.0, length_scale=1.0,
    device="mps", stream=False,
)

# Save the audio as a WAV file with a 48000 Hz sample rate.
write(filename="sample_vits2.wav", rate=48000, data=result)

In [None]:
# Chunked synthesis: with stream=True the return value is iterated and each
# piece of audio is played as soon as it is produced.
audio_generator = inference(
    model=model, text=text,
    noise_scale=0.17, noise_scale_w=1.0, length_scale=1.0,
    device="mps", stream=True,
)

for audio_chunk in audio_generator:
    sd.play(audio_chunk, samplerate=48000)  # start playback of this chunk
    sd.wait()  # block until the chunk has finished playing
    print(f"Played chunk of {len(audio_chunk)} samples")

In [None]:
# Voice conversion: re-render input.wav from speaker 2 as speaker 1.
from hanasu.models import voice_conversion_inference

converted_audio = voice_conversion_inference(
    model=model, source_wav_path="input.wav",
    source_speaker_id=2, target_speaker_id=1,
    device="mps",
)

# Store the converted audio as a WAV file with a 48000 Hz sample rate.
write(filename="voice_converted_audio.wav", rate=48000, data=converted_audio)

# Export

In [None]:
from hanasu.onnx_utils import export_onnx, synthesize

# Export to ONNX. Judging by the values, the positional arguments are the
# checkpoint path, the config path and the output file name - TODO confirm
# against export_onnx's signature.
export_onnx(
    "/Users/yuki/Downloads/G_15000.pth",
    "/Users/yuki/Documents/Github/hanasu/hanasu/configs/config.json",
    "output.onnx",
)

In [None]:
# Run the exported ONNX model on a short prompt. Judging by the values, the
# positional arguments are the ONNX model path, the config path, the output
# wav path and the text - TODO confirm against synthesize's signature.
synthesize(
    "/Users/yuki/Documents/Github/hanasu/hanasu/output.onnx",
    "/Users/yuki/Documents/Github/hanasu/hanasu/configs/config.json",
    "output.wav",
    "Hello, world!",
    sid=0,
    scales=[0.2, 1.0, 1.0],
)