In [None]:
# Move the working directory up one level before importing.
# NOTE(review): presumably the notebook lives in a subfolder and `inference`
# plus the relative paths used later resolve from the parent directory — confirm.
%cd ..

# Install pandas into the active environment using uv.
# NOTE(review): pandas is not used directly in this notebook; presumably a
# dependency of the inference code — confirm.
!uv pip install pandas
from inference import StyleTTS2

import librosa
import IPython.display as ipd
import torch.cuda

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

### Load models

In [None]:
# Model configuration file passed to StyleTTS2.
config_path = 'Configs/config.yaml'
# Checkpoint file passed to StyleTTS2.
models_path = 'Models/Finetune/base_model.pth'

### Synthesize speech

A few notes:

- You don't need to add language tokens everywhere; espeak can detect and handle them automatically most of the time.

- Reference audio has a huge impact on the result. It is best to select a clip about 10 seconds long that is consistent in both tone and speed.

In [None]:
# Sample rate (Hz) used for loading and playing back the reference audio.
SAMPLE_RATE = 24000
# Reference clips are previewed with at most this many seconds of audio.
MAX_REF_SECONDS = 20
max_samples = SAMPLE_RATE * MAX_REF_SECONDS

# Reference speakers, keyed by speaker id.
speakers = {
    "id_1": {
        "path": "./Demo/reference_audio/vn_1.wav",  # Ref audio path
        "lang": "en-us",                            # Default language
        "speed": 1.0,                               # Speaking speed
    },
    "id_2": {
        "path": "./Demo/reference_audio/vn_2.wav",
        "lang": "en-us",
        "speed": 1.0,
    },
}

# Preview each reference clip: load, trim leading/trailing silence, cap length.
for speaker in speakers.values():
    ref_path = speaker['path']
    print(ref_path)
    # librosa.load already resamples to `sr`, so no separate resample step is
    # needed afterwards.
    wave, sr = librosa.load(ref_path, sr=SAMPLE_RATE)
    audio, _ = librosa.effects.trim(wave, top_db=30)
    # Slicing is a no-op when the clip is already shorter than the cap.
    audio = audio[:max_samples]
    display(ipd.Audio(audio, rate=SAMPLE_RATE, normalize=True))

In [None]:
# Plain-text sample. Not synthesized by default: the generation cell passes
# is_phonemes=True, so `text` must hold phonemes unless that flag is changed.
text_plain = '''
Next to the Bach Dang Port is the Nguyen Hue Avenue, as seen in a photo dated in the 1970s and as seen today. One side of the road leads to the Saigon River, while the other side leads to the Ho Chi Minh City People’s Committee, built from 1898 to 1909. The road spans 700 meters from the People’s Committee headquarters to the Bach Dang Port. In the past, it was the Kinh Lon Channel that brings water from the Saigon River to Gia Dinh. In 1887, the French covered up the channel to build a road, calling it the Chamer Avenue. By 1956, the then-government of the Republic of Vietnam changed the avenue’s name to Nguyen Hue. In 2004, Ho Chi Minh City restored the flower street on the avenue, and renovated it into a walking street as it is today in 2014.
'''

# Use phonemes directly
text_phonemes = "ʃalˈom olˈam! mˈa korˈe? bˈo teʁˈed, toχˈal ktsˈat tˈeʁed. ʔˈejze tχinˈa! jihjˈe tχˈina tovˈa! bˈo niʃtˈe bˈiʁa beʔˈiʁ habiʁˈa! hˈu pitˈa ʔotˈi leʔeχˈol pˈita ʃawˈaʁma!"

# Input that will actually be synthesized.
text = text_phonemes

In [None]:
# Build the model from the config and checkpoint, switch it to eval mode and
# move it to the selected device.
model = StyleTTS2(config_path, models_path)
model = model.eval().to(device)

# STR: default speaker used when no speaker_id is provided in the input.
default_speaker = "[id_1]"
# BOOL: split the ref audio and calculate the avg styles.
avg_style = True
# BOOL: stabilize speaking speed.
stabilize = True
# FLOAT: strength of the denoiser; value range is [0, 1].
denoise = 0.6
# INT: avoid short sentences by merging when a sentence has fewer than n words.
n_merge = 18

In [None]:
import scipy.io.wavfile as wavfile

# Compute the speaker styles and synthesize `text` without tracking gradients.
with torch.no_grad():
    styles = model.get_styles(speakers, denoise, avg_style)
    # Use the configured default speaker instead of a hardcoded id.
    r = model.generate(text, styles, stabilize, n_merge, default_speaker, is_phonemes=True)

print('Synthesized:')
display(ipd.Audio(r, rate=24000, normalize=True))

# Save as 16-bit PCM. Clip first so samples outside [-1, 1] saturate instead of
# wrapping around when cast to int16.
# NOTE(review): assumes `r` is a float NumPy array scaled to [-1, 1] — confirm.
pcm16 = (r.clip(-1.0, 1.0) * 32767).astype("int16")
wavfile.write("audio.wav", 24000, pcm16)
