In [4]:
import os
import re
import torch
from transformers import T5EncoderModel, AutoTokenizer
from pathlib import Path
from f5_tts.infer.utils_infer import (
    infer_process,
    load_vocoder,
    preprocess_ref_audio_text,
    remove_silence_for_generated_wav,
)
from f5_tts.model.cfm import T5Conditioner
from f5_tts.model import DiT, CFM
from f5_tts.model.utils import get_tokenizer
from f5_tts.train.utils import make_html
import torchaudio
import argparse

TRANSFORMERS_NO_TORCHVISION=1

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if torch.cuda.is_available():
    torch.set_float32_matmul_precision("high")

cfg_strength = 3.0
scale_phi = 0.75
ckpt_path = '/workspace/f5tts_clone_qwen_filter_7.pt'

mel_spec_type = "vocos"
vocoder_name = "vocos"
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256
win_length = 1024
n_fft = 1024
target_rms = 0.1
cross_fade_duration = 0.15
ode_method = "euler"
nfe_step = 32  # 16, 32
sway_sampling_coef = -1.0
speed = 1.0
fix_duration = None
vocab_file = '/workspace/tts/ckpts/vocab.txt'
tokenizer = "custom"
ode_method = "euler"

# load model
model_cls = DiT
model_cfg = dict(
    dim=1024, 
    depth=22, 
    heads=16, 
    ff_mult=2, 
    text_dim=512, 
    conv_layers=4
)

vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)
vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=True, local_path="./f5_tts/vocoder")

transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels)
text_conditioner = T5Conditioner(t5_model_name="t5-base", max_length=32).to(device)
text_conditioner.eval()

mel_spec_kwargs=dict(
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    n_mel_channels=n_mel_channels,
    target_sample_rate=target_sample_rate,
    mel_spec_type=mel_spec_type,
)

odeint_kwargs=dict(
    method=ode_method,
)

model = CFM(
    transformer=transformer,
    mel_spec_kwargs=mel_spec_kwargs,
    odeint_kwargs=odeint_kwargs,
    vocab_char_map=vocab_char_map,
).to(device)

dtype = torch.float32
checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True)
model.load_state_dict(checkpoint, strict=False)
del checkpoint
torch.cuda.empty_cache()

Load vocos from local path ./f5_tts/vocoder


In [8]:
script = "Hello everyone"
prefix_path = "/workspace/tts/src/zombie-or-monster-says-i-sound-effect-079567563_nw_prev.mp3"
prefix_script = "I hate you so much"

if prefix_path is not None and prefix_script is not None:
    main_voice = {"ref_audio": prefix_path, "ref_text": prefix_script}
    voices = {"main": main_voice}
    
    for voice in voices:
        voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
            voices[voice]["ref_audio"], voices[voice]["ref_text"]
        )
    ref_audio = voices[voice]["ref_audio"]
    ref_text = voices[voice]["ref_text"]
    no_ref_audio = False
else:
    ref_audio = None
    ref_text = " "
    no_ref_audio = True

audio, final_sample_rate, spectragram = infer_process(
    ref_audio,
    ref_text,
    script, 
    model, 
    vocoder, 
    mel_spec_type=mel_spec_type, 
    speed=speed,
    cfg_strength=cfg_strength,
    no_ref_audio=no_ref_audio,
)
if vocoder_name == "bigvgan":
    array = torch.stack((torch.tensor(audio), torch.tensor(audio)), dim=0).squeeze()
else:
    array = torch.stack((torch.tensor(audio).unsqueeze(dim=0), torch.tensor(audio).unsqueeze(dim=0)), dim=0).squeeze()

Converting audio...
Using custom reference text...
gen_text 0 Hello everyone
Generating audio in 1 batches...


  0%|          | 0/1 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.772 seconds.
Prefix dict has been built successfully.


before start :  torch.Size([1, 69910]) [['I', ' ', 'h', 'a', 't', 'e', ' ', 'y', 'o', 'u', ' ', 's', 'o', ' ', 'm', 'u', 'c', 'h', '.', ' ', ' ', 'H', 'e', 'l', 'l', 'o', ' ', 'e', 'v', 'e', 'r', 'y', 'o', 'n', 'e']] 455 32 False False 1


100%|██████████| 1/1 [00:02<00:00,  2.61s/it]


In [None]:
from audiotools import AudioSignal

caption = ""
script = "You are not allowed to come here! "

# generate 
prefix_path = "/workspace/tts_sfx/src/f5_tts/valid_data/zombie-or-monster-says-i-sound-effect-079567563_nw_prev.mp3"
prefix_script = "I hate you so much"

# AudioSignal(prefix_path, sample_rate=24000).to_mono().widget()

for i in range(3):
    audio = main(script=script)
    AudioSignal(audio, sample_rate=24000).widget()

In [None]:
caption = "a monster saying"
script = "I hate you so much"
prefix_path = "/workspace/tts_sfx/src/f5_tts/valid_data/zombie-or-monster-says-i-sound-effect-079567563_nw_prev.mp3"
prefix_script = "I hate you so much"

AudioSignal(prefix_path, sample_rate=24000).to_mono().widget()
for i in range(10):
    audio = main(prefix_path, prefix_script, caption, script)
    AudioSignal(audio, sample_rate=24000).widget()

In [None]:
from audiotools import AudioSignal
import librosa
import torch

ap = 'zombie-or-monster-says-i-sound-effect-079567563_nw_prev.mp3'
AudioSignal(ap).widget()

audio, sr = librosa.load(ap, sr=24000)

audio = torch.tensor(audio)[24000:48000]
print(audio.shape)

ms = model.mel_spec(audio.unsqueeze(dim=0))
print(ms.shape)

In [None]:
mss = ms.permute(0, 2, 1)

generated_wave = vocoder.decode(ms.to(device))

In [None]:
AudioSignal(generated_wave.cpu().numpy(), sample_rate=24000).widget()

In [1]:
import os
import re
import torch
from transformers import T5EncoderModel, AutoTokenizer
from pathlib import Path
from f5_tts.infer.utils_infer import (
    infer_process,
    load_vocoder,
    preprocess_ref_audio_text,
    remove_silence_for_generated_wav,
)
from f5_tts.model.cfm import T5Conditioner
from f5_tts.model import DiTPrepend, CFM
from f5_tts.model.utils import get_tokenizer
from f5_tts.train.utils import make_html
import torchaudio
import argparse
import requests

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
if torch.cuda.is_available():
    torch.set_float32_matmul_precision("high")

cfg_strength = 2.0
scale_phi = 0.75

mel_spec_type = "vocos"
vocoder_name = mel_spec_type
target_sample_rate = 24000
n_mel_channels = 100
hop_length = 256
win_length = 1024
n_fft = 1024
target_rms = 0.1
cross_fade_duration = 0.15
ode_method = "euler"
nfe_step = 32  # 16, 32
sway_sampling_coef = -1.0
speed = 1.0
fix_duration = None
vocab_file = "./f5_tts/infer/examples/vocab.txt"
tokenizer = "custom"
ode_method = "euler"

# load model
model_cls = DiTPrepend
model_cfg = dict(
    dim=1024, 
    depth=22, 
    heads=16, 
    ff_mult=2, 
    text_dim=512, 
    conv_layers=4
)

In [2]:
vocab_char_map, vocab_size = get_tokenizer(vocab_file, tokenizer)
if vocoder_name == "bigvgan":
    vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=False)
else:
    vocoder = load_vocoder(vocoder_name=vocoder_name, is_local=True, local_path=f"./f5_tts/vocoder")

transformer=model_cls(**model_cfg, text_num_embeds=vocab_size, mel_dim=n_mel_channels)
text_conditioner = T5Conditioner(t5_model_name="t5-base", max_length=32).to(device)
text_conditioner.eval()

mel_spec_kwargs=dict(
    n_fft=n_fft,
    hop_length=hop_length,
    win_length=win_length,
    n_mel_channels=n_mel_channels,
    target_sample_rate=target_sample_rate,
    mel_spec_type=mel_spec_type,
)

odeint_kwargs=dict(
    method=ode_method,
)

print("Load 2")
model = CFM(
    transformer=transformer,
    mel_spec_kwargs=mel_spec_kwargs,
    odeint_kwargs=odeint_kwargs,
    vocab_char_map=vocab_char_map,
).to(device)

dtype = torch.float32

print("Load 3")
ckpt_path = "/workspace/f5tts_clone_qwen_filter_7.pt"
checkpoint = torch.load(ckpt_path, map_location=device, weights_only=True)
model.load_state_dict(checkpoint)

del checkpoint
torch.cuda.empty_cache()

Load vocos from local path ./f5_tts/vocoder
Load 2
Load 3


In [3]:
# generate #
t_inter = 0.0
duplicate_test = False
if input_audio_url is not None and not is_variation:
    # Use this voice
    file_name = input_audio_url.split("/")[-1]
    file_path = f'./src/reflist/{file_name}'
    download_file(input_audio_url, file_path)
    prefix_script = original_script

main_voice = {"ref_audio": file_path, "ref_text": prefix_script}
voices = {"main": main_voice}
for voice in voices:
    voices[voice]["ref_audio"], voices[voice]["ref_text"] = preprocess_ref_audio_text(
        voices[voice]["ref_audio"], voices[voice]["ref_text"]
    )

audio, final_sample_rate, spectragram = infer_process(
    voices[voice]["ref_audio"],
    voices[voice]["ref_text"],
    script, 
    model, 
    vocoder, 
    mel_spec_type=mel_spec_type, 
    speed=speed,
    cfg_strength=cfg_strength,
    no_ref_audio=False,
    scale_phi=scale_phi,
    t_inter=t_inter,
    duplicate_test=duplicate_test,
    batch_size=5
)

print("Adua ", audio.shape)
if vocoder_name == "bigvgan":
    array = torch.stack((torch.tensor(audio), torch.tensor(audio)), dim=0).squeeze()
else:
    array = torch.stack((torch.tensor(audio).unsqueeze(dim=1), torch.tensor(audio).unsqueeze(dim=1)), dim=1).squeeze()
print(array.shape)

for idx, aud in enumerate(array.to(dtype).cpu().detach()):
    print(aud.shape)
    torchaudio.save(
        f"sample_{idx}.wav", aud, sample_rate=target_sample_rate, channels_first=True
    )
delete_file(file_path)

array = array.to(dtype).cpu().detach().numpy()
return array.reshape(-1) # [batch_size, 2, audio_length]

NameError: name 'input_audio_url' is not defined