In [1]:
# Shell escape: print the NVIDIA driver's view of the GPU (model, memory use, utilisation).
!nvidia-smi

Sat Jun  1 17:07:54 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 551.61                 Driver Version: 551.61         CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 4060 ...  WDDM  |   00000000:01:00.0  On |                  N/A |
| N/A   51C    P5              9W /  140W |    1761MiB /   8188MiB |     16%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [16]:
import torch
torch._dynamo.config.cache_size_limit = 64
torch._dynamo.config.suppress_errors = True
torch.set_float32_matmul_precision('high')

import ChatTTS
from IPython.display import Audio

## Load Models

In [17]:
chat = ChatTTS.Chat()
# Load the weights from a local directory. A forward slash is used so the
# literal contains no invalid escape sequence (a backslash followed by 'C')
# and the relative path also resolves on non-Windows systems.
chat.load_models(source='local', local_path='./ChatTTS')

# Use force_redownload=True if the weights updated.
# chat.load_models(force_redownload=True)

# If you download the weights manually, set source='local'.
# chat.load_models(source='local', local_path='YOUR LOCAL PATH')

INFO:ChatTTS.core:Load from local: .\ChatTTS
INFO:ChatTTS.core:use cuda:0
INFO:ChatTTS.core:vocos loaded.
INFO:ChatTTS.core:dvae loaded.
INFO:ChatTTS.core:gpt loaded.
INFO:ChatTTS.core:decoder loaded.
INFO:ChatTTS.core:tokenizer loaded.
INFO:ChatTTS.core:All initialized.


## Inference

### Batch infer

In [6]:
# Assemble a six-item batch: the English sentence three times, followed by
# the Chinese sentence three times.
texts = (
    3 * ["Hello everyone, I'm Mu You. Today, I'd like to share an interesting TTS project with you. In just three days, the project has already reached 11.7K stars."]
    + 3 * ["大家好，我是沐柚，今天给大家分享一个好玩的TTS项目，这个项目在短短三天内，star数量已经来到了十一点七K。"]
)

# Run the whole batch through a single infer() call.
wavs = chat.infer(texts)

INFO:ChatTTS.core:All initialized.
 16%|████████████████████▋                                                                                                               | 60/384 [00:01<00:08, 38.49it/s]
 25%|████████████████████████████████▍                                                                                                 | 511/2048 [00:09<00:29, 52.41it/s]


In [7]:
# Player for batch item 2 (the last English entry, assuming outputs follow the order of `texts`); 24 kHz playback.
Audio(data=wavs[2], rate=24_000, autoplay=False)

In [8]:
# Player for batch item 4 (a Chinese entry, assuming outputs follow the order of `texts`); 24 kHz playback.
Audio(data=wavs[4], rate=24_000, autoplay=False)

In [9]:
# Player for batch item 5 (the last Chinese entry, assuming outputs follow the order of `texts`); 24 kHz playback.
Audio(data=wavs[5], rate=24_000, autoplay=False)

### Custom params

In [29]:
# Options forwarded to the generation stage: a prompt token and a sampling temperature of 0.3.
params_infer_code = dict(prompt='[speed_5]', temperature=.3)
# Options forwarded to the text-refinement stage; random break (pause) tokens are added here.
params_refine_text = dict(prompt='[oral_2][laugh_0][break_6]')
wav = chat.infer(
    '永远不要提前焦虑，也不要提前预知烦恼，更不要被当下的情绪所消耗，那样会对你的身体造成很大的伤害，你只需要好好享受当下。车到山前必有路，关关难过关关过，揪着过去不放，不要不依不饶, 画地为牢，你要允许一切发生，调整好情绪，毕竟我们无法左右的事情太多，握不住的东西和留不住的人也太多，尽心尽力，顺其自然就好！',
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
)

INFO:ChatTTS.core:All initialized.
 42%|███████████████████████████████████████████████████████▎                                                                           | 162/384 [00:03<00:04, 48.54it/s]
 65%|████████████████████████████████████████████████████████████████████████████████████▎                                            | 1338/2048 [00:26<00:14, 50.67it/s]


In [None]:
# Player for the first (and only) clip produced by the preceding infer() call; 24 kHz playback.
Audio(data=wav[0], rate=24_000, autoplay=False)

### Fix random speaker

In [30]:
# Draw one speaker embedding and hand it to infer() through 'spk_emb'.
rand_spk = chat.sample_random_speaker()
params_infer_code = dict(spk_emb=rand_spk)
# Options forwarded to the text-refinement stage; random break (pause) tokens are added here.
params_refine_text = dict(prompt='[oral_2][laugh_0][break_6]')
wav = chat.infer(
    '不要把时间浪费在尚未发生且没有必要的人和事上，更不要在焦虑和烦恼中度过，你要把时间分给睡眠，分给书籍，分给运动，分给音乐，分给花鸟树木山川湖泊，当你开始做时间的主人，那些焦虑和烦恼自然就烟消云散了。',
    params_refine_text=params_refine_text,
    params_infer_code=params_infer_code,
)

INFO:ChatTTS.core:All initialized.
 28%|████████████████████████████████████▏                                                                                              | 106/384 [00:02<00:06, 43.54it/s]
 46%|███████████████████████████████████████████████████████████▉                                                                      | 944/2048 [00:22<00:26, 42.06it/s]


In [None]:
# Player for the first clip generated with the sampled speaker embedding; 24 kHz playback.
Audio(data=wav[0], rate=24_000, autoplay=False)

### Two stage control

In [None]:
# Stage 1: run text refinement only. The cell's value is displayed so the
# inserted break markers can be inspected; no audio is requested here.
text = (
    "我在背后骂人说明他人品有问题 别人在背后骂我更说明他人品有问题 我骂的能是什么好东西骂我的又能是什么好东西 我就是古希腊掌管道德的神 "
)
chat.infer(text, refine_text_only=True)

In [None]:
# Stage 2: disable automatic refinement. Paste the refined text from the
# previous cell here and hand-edit any break markers that are unsuitable.
text = 3 * [
    "我 在 背 后 骂 人 [uv_break] 说 明 他 人 品 有 问 题 别 人 在 背 后 骂 我 更 说 明 他 人 品 有 问 题 [uv_break] 那 我 骂 的 能 是 什 么 好 东 西 [uv_break] 骂 我 的 又 能 是 什 么 好 东 西 [uv_break]  我 就 是 古 希 腊 掌 管 道 德 的 神 [laugh] ",
]
wav = chat.infer(text, skip_refine_text=True)

In [None]:
# Player for clip 0 of the three identical hand-annotated inputs; 24 kHz playback.
Audio(data=wav[0], rate=24_000, autoplay=False)

In [None]:
# Player for clip 1 of the three identical hand-annotated inputs; 24 kHz playback.
Audio(data=wav[1], rate=24_000, autoplay=False)

In [None]:
# Player for clip 2 of the three identical hand-annotated inputs; 24 kHz playback.
Audio(data=wav[2], rate=24_000, autoplay=False)

## Only skip_refine_text

In [22]:
# Five copies of one transcript that already carries [uv_break]/[laugh]
# markers; refinement is skipped so the markers are used as written.
text = 5 * [
    "然后跟演员说[uv_break][uv_break]最有意思的是跟演员说[uv_break]跟千玺说不一镜到底[uv_break][uv_break]哎呀[uv_break][uv_break]哎呀[uv_break][uv_break]就觉着好像就没劲了[laugh] [uv_break][laugh][uv_break][uv_break]跟沈腾说不能一镜到底[uv_break]哎呀就觉得[uv_break]好像就不想演了一样[uv_break][uv_break][uv_break]开玩笑啊[uv_break]开玩笑[laugh][uv_break]就所有人都沮丧而失落[laugh][uv_break][laugh]就我看到大家的反应[uv_break]我也很难过[uv_break][uv_break]就知道吧，很难过[laugh]",
]
wav = chat.infer(text, skip_refine_text=True)

INFO:ChatTTS.core:All initialized.
 64%|██████████████████████████████████████████████████████████████████████████████████▋                                              | 1312/2048 [00:26<00:14, 49.62it/s]


In [24]:
# Player for clip 0 of the five identical pre-annotated inputs; 24 kHz playback.
Audio(data=wav[0], rate=24_000, autoplay=False)

In [25]:
# Player for clip 1 of the five identical pre-annotated inputs; 24 kHz playback.
Audio(data=wav[1], rate=24_000, autoplay=False)

In [26]:
# Player for clip 2 of the five identical pre-annotated inputs; 24 kHz playback.
Audio(data=wav[2], rate=24_000, autoplay=False)

## LLM Call

In [25]:
import getpass
import os

from ChatTTS.experimental.llm import llm_api

# Never hard-code credentials in a notebook: they leak through version
# control and shared copies. The key is read from the DEEPSEEK_API_KEY
# environment variable, with an interactive prompt as the fallback.
# Any key that was ever committed to this notebook should be treated as
# compromised and rotated.
API_KEY = os.environ.get('DEEPSEEK_API_KEY') or getpass.getpass('DeepSeek API key: ')

# Client bound to the DeepSeek endpoint and the "deepseek-chat" model.
client = llm_api(
    api_key=API_KEY,
    base_url="https://api.deepseek.com",
    model="deepseek-chat",
)

In [26]:
# Sample the speaker embedding that the synthesis step further down reuses.
rand_spk = chat.sample_random_speaker()

# Send the question to the LLM client and show the reply it returns.
user_question = '四川有哪些好吃的美食呢?'
text = client.call(user_question, prompt_version='deepseek')
print(text)

# Optional second pass through the 'deepseek_TN' prompt (left disabled):
# text = client.call(text, prompt_version = 'deepseek_TN')
# print(text)

INFO:httpx:HTTP Request: POST https://api.deepseek.com/chat/completions "HTTP/1.1 200 OK"


四川美食可多了, 有麻辣火锅、宫保鸡丁、回锅肉、麻婆豆腐、担担面、夫妻肺片、串串香、龙抄手、宜宾燃面、乐山棒棒鸡等。这些美食口味独特, 辣中带香, 让人回味无穷。


In [27]:
# Synthesize the LLM reply using the sampled speaker embedding and a sampling temperature of 0.3.
params_infer_code = dict(spk_emb=rand_spk, temperature=.3)

wav = chat.infer(text, params_infer_code=params_infer_code)

INFO:ChatTTS.core:All initialized.
 23%|█████████████████████████████▉                                                                                                      | 87/384 [00:01<00:05, 50.18it/s]
 39%|██████████████████████████████████████████████████▌                                                                               | 796/2048 [00:14<00:22, 55.68it/s]


In [28]:
# Player for the first clip synthesized from the LLM reply; 24 kHz playback.
Audio(data=wav[0], rate=24_000, autoplay=False)