-
Notifications
You must be signed in to change notification settings - Fork 27
/
webui.py
375 lines (298 loc) · 16.8 KB
/
webui.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
# Copyright (c) 2024 Alibaba Inc (authors: Xiang Lyu)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Process-wide environment overrides, import-path tweaks, and logging setup.
# The environment variables are assigned before the third-party imports below,
# presumably so that those libraries see them when they are first imported.
import os
# NOTE(review): an empty CURL_CA_BUNDLE presumably disables TLS certificate
# verification in requests-based clients - confirm this is intended.
os.environ['CURL_CA_BUNDLE'] = ''
# NOTE(review): presumably keeps Hugging Face `datasets` from going online - confirm.
os.environ['HF_DATASETS_OFFLINE'] = '1'
# Optional cache-location overrides, currently disabled:
# os.environ['MODELSCOPE_CACHE'] ='./.cache/modelscope'
# os.environ['TORCH_HOME'] = './.cache/torch' # set the torch cache directory
# os.environ["HF_HOME"] = "./.cache/huggingface" # set the transformers cache directory
# os.environ['XDG_CACHE_HOME']="./.cache"
import sys
# Directory containing this file; the bundled third-party packages are
# resolved relative to it and appended to the import path.
ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.append('{}/third_party/AcademiCodec'.format(ROOT_DIR))
sys.path.append('{}/third_party/Matcha-TTS'.format(ROOT_DIR))
import shutil
import argparse
import gradio as gr
import numpy as np
import torch
import torchaudio
import random
import librosa
import ffmpeg
import logging
# Raise matplotlib's logger to WARNING so the DEBUG root level configured
# below does not surface its messages.
logging.getLogger('matplotlib').setLevel(logging.WARNING)
from cosyvoice.cli.cosyvoice import CosyVoice
from cosyvoice.utils.file_utils import load_wav
# Root logger: DEBUG level with "timestamp level message" formatting.
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(levelname)s %(message)s')
def speed_change(input_audio: np.ndarray, speed: float, sr: int):
    """Change the tempo of 16-bit PCM audio with ffmpeg's ``atempo`` filter.

    The samples are piped to ffmpeg as raw ``s16le`` data (declared as a
    single channel) and the processed stream is read back from its stdout.

    Args:
        input_audio: PCM samples; the dtype must be ``np.int16``.
        speed: Tempo multiplier (values above 1 speed up, below 1 slow down).
            Must be a positive, finite number.
        sr: Sample rate of ``input_audio`` in Hz.

    Returns:
        A 1-D ``np.int16`` array decoded from ffmpeg's output.

    Raises:
        ValueError: If ``input_audio`` is not ``np.int16`` or ``speed`` is not
            a positive finite number.

    Any error raised by the ffmpeg ``run`` call propagates to the caller.
    """
    # Validate the input dtype.
    if input_audio.dtype != np.int16:
        raise ValueError("输入音频数据类型必须为 np.int16")

    remaining = float(speed)
    # The chained comparison is False for NaN, infinities, zero and negatives,
    # all of which would otherwise make the staging loops below spin forever
    # or produce an invalid filter argument.
    if not (0 < remaining < float('inf')):
        raise ValueError("speed 必须为正的有限数值")

    # Serialise the samples to a raw byte stream.
    raw_audio = input_audio.tobytes()

    # Configure the ffmpeg input stream.
    stream = ffmpeg.input('pipe:', format='s16le', acodec='pcm_s16le', ar=str(sr), ac=1)

    # A single atempo stage does not accept tempo below 0.5 (and older ffmpeg
    # builds cap it at 2.0), so out-of-range factors are reached by chaining
    # several stages whose product equals the requested speed.
    while remaining < 0.5:
        stream = stream.filter('atempo', 0.5)
        remaining /= 0.5
    while remaining > 2.0:
        stream = stream.filter('atempo', 2.0)
        remaining /= 2.0
    stream = stream.filter('atempo', remaining)

    # Run ffmpeg, writing the result back to a pipe.
    out, _ = (
        stream.output('pipe:', format='s16le', acodec='pcm_s16le')
        .run(input=raw_audio, capture_stdout=True, capture_stderr=True)
    )

    # Decode the piped output into a NumPy array.
    return np.frombuffer(out, np.int16)
# Initial dropdown choices, collected once at import time.
# A missing directory is treated as "no entries" instead of aborting start-up
# with FileNotFoundError.

# Reference-audio file names, preceded by the placeholder entry.
reference_wavs = ["请选择参考音频或者自己上传"]
if os.path.isdir("./参考音频/"):
    reference_wavs.extend(os.listdir("./参考音频/"))

# Saved custom voices (file names with ".pt" removed), preceded by "无" (none).
spk_new = ["无"]
if os.path.isdir("./voices/"):
    spk_new.extend(name.replace(".pt", "") for name in os.listdir("./voices/"))
def refresh_choices():
    """Re-scan ``./voices/`` and build an update payload for the custom-voice dropdown.

    Returns:
        A dict with ``"choices"`` set to ``"无"`` followed by every directory
        entry with ``".pt"`` removed, and ``"__type__"`` set to ``"update"``.
    """
    voice_names = [entry.replace(".pt", "") for entry in os.listdir("./voices/")]
    return {"choices": ["无", *voice_names], "__type__": "update"}
def change_choices():
    """Re-scan ``./参考音频/`` and build an update payload for the reference-audio dropdown.

    The placeholder entry uses the same text as the list built at start-up
    and as the dropdown's initial value, so the currently selected placeholder
    is still one of the choices after a refresh.

    Returns:
        A dict with ``"choices"`` set to the placeholder followed by every
        directory entry, and ``"__type__"`` set to ``"update"``.
    """
    placeholder = "请选择参考音频或者自己上传"
    return {
        "choices": [placeholder, *os.listdir("./参考音频/")],
        "__type__": "update",
    }
def change_wav(audio_path):
    """Map a reference-audio dropdown selection to a file path and prompt text.

    Args:
        audio_path: File name chosen in the dropdown, relative to
            ``./参考音频/``. May be ``None``, empty, or the placeholder entry.

    Returns:
        ``(path, text)``, where ``path`` is ``./参考音频/<audio_path>`` and
        ``text`` is the file name with ``.wav`` / ``.mp3`` / ``.WAV`` removed.
        When the selection does not name an existing file (nothing selected,
        or the placeholder), ``(None, "")`` is returned so that no
        non-existent path is handed to the audio component.
    """
    if not audio_path:
        return None, ""

    full_path = f"./参考音频/{audio_path}"
    # The placeholder entry is not a real file; the same guard also covers
    # files removed since the list was last refreshed.
    if not os.path.isfile(full_path):
        return None, ""

    text = audio_path.replace(".wav", "").replace(".mp3", "").replace(".WAV", "")
    return full_path, text
def save_name(name):
    """Copy ``./output.pt`` to ``./voices/<name>.pt`` under a user-chosen name.

    Args:
        name: Voice name typed into the UI; used as the target file stem.

    Returns:
        ``False`` when nothing was saved (empty or invalid name, or no
        ``./output.pt`` to copy); ``None`` after a successful copy.
    """
    if not name:
        gr.Info("音色名称不能为空")
        return False

    # The name comes straight from a web form; refuse path separators so it
    # cannot point outside the voices directory.
    if "/" in name or "\\" in name:
        gr.Warning("音色名称不能包含路径分隔符")
        return False

    source_path = "./output.pt"
    # Report a missing source file in the UI instead of failing inside copyfile.
    if not os.path.isfile(source_path):
        gr.Warning("未找到output.pt, 请先完成一次zero-shot推理后再保存音色")
        return False

    os.makedirs("./voices", exist_ok=True)
    shutil.copyfile(source_path, f"./voices/{name}.pt")
    gr.Info("音色保存成功,存放位置为voices目录")
def generate_seed():
    """Draw a random seed and wrap it in an update payload.

    Returns:
        A dict with ``"__type__"`` set to ``"update"`` and ``"value"`` set to
        an integer drawn from ``random.randint(1, 100000000)``.
    """
    new_seed = random.randint(1, 100000000)
    return dict(__type__="update", value=new_seed)
def set_all_random_seed(seed):
    """Seed Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices).

    Args:
        seed: Seed value. It is coerced with ``int()`` first, because a float
            such as ``42.0`` may arrive from the numeric UI field and
            ``np.random.seed`` rejects floats.
    """
    seed = int(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
# Peak-amplitude ceiling enforced by postprocess().
max_val = 0.8


def postprocess(speech, top_db=60, hop_length=220, win_length=440):
    """Trim, peak-limit, and pad a prompt waveform.

    The waveform is passed through ``librosa.effects.trim``, scaled down so
    its absolute peak equals ``max_val`` when it exceeds that ceiling, and
    extended along dim 1 with ``int(target_sr * 0.2)`` zero samples.

    Args:
        speech: Waveform tensor; it is concatenated with a ``(1, n)`` zero
            tensor along dim 1.
        top_db: Forwarded to ``librosa.effects.trim`` as ``top_db``.
        hop_length: Forwarded to ``librosa.effects.trim`` as ``hop_length``.
        win_length: Forwarded to ``librosa.effects.trim`` as ``frame_length``.

    Returns:
        The processed waveform tensor.
    """
    trimmed, _ = librosa.effects.trim(
        speech,
        top_db=top_db,
        frame_length=win_length,
        hop_length=hop_length,
    )
    peak = trimmed.abs().max()
    if peak > max_val:
        trimmed = trimmed / peak * max_val
    tail = torch.zeros(1, int(target_sr * 0.2))
    return torch.concat([trimmed, tail], dim=1)
# Inference modes offered in the UI, in display order.
inference_mode_list = ['预训练音色', '3s极速复刻', '跨语种复刻', '自然语言控制']

# Step-by-step usage hints, listed in the same order as inference_mode_list.
_instruction_texts = (
    '1. 选择预训练音色\n2.点击生成音频按钮',
    '1. 选择prompt音频文件,或录入prompt音频,若同时提供,优先选择prompt音频文件\n2. 输入prompt文本\n3.点击生成音频按钮',
    '1. 选择prompt音频文件,或录入prompt音频,若同时提供,优先选择prompt音频文件\n2.点击生成音频按钮',
    '1. 输入instruct文本\n2.点击生成音频按钮',
)

# Mode name -> usage hint.
instruct_dict = dict(zip(inference_mode_list, _instruction_texts))


def change_instruction(mode_checkbox_group):
    """Return the usage hint for the selected inference mode.

    Args:
        mode_checkbox_group: One of the entries of ``inference_mode_list``.

    Returns:
        The matching text from ``instruct_dict``.

    Raises:
        KeyError: If the mode is not a key of ``instruct_dict``.
    """
    hint = instruct_dict[mode_checkbox_group]
    return hint
def generate_audio(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor, new_dropdown):
    """Validate the request, run one CosyVoice inference, and return the audio.

    Args:
        tts_text: Text to synthesise.
        mode_checkbox_group: Selected inference mode (an entry of
            ``inference_mode_list``).
        sft_dropdown: Pre-trained speaker id.
        prompt_text: Transcript of the prompt audio (zero-shot mode).
        prompt_wav_upload: Path of an uploaded prompt audio file, or ``None``.
        prompt_wav_record: Path of a recorded prompt audio file, or ``None``.
        instruct_text: Instruction text (instruct mode).
        seed: Random seed applied right before inference.
        speed_factor: Tempo multiplier; ``1.0`` leaves the audio untouched.
        new_dropdown: Name of a saved custom voice, forwarded to the sft and
            instruct inference calls.

    Returns:
        ``(target_sr, samples)``. On a validation failure ``samples`` is the
        module-level ``default_data`` placeholder. If the speed change fails,
        the audio is returned at its original speed.
    """
    # An uploaded prompt file takes priority over a microphone recording.
    if prompt_wav_upload is not None:
        prompt_wav = prompt_wav_upload
    elif prompt_wav_record is not None:
        prompt_wav = prompt_wav_record
    else:
        prompt_wav = None

    # Instruct mode needs the Instruct model and an instruction text.
    if mode_checkbox_group == '自然语言控制':
        if cosyvoice.frontend.instruct is False:
            gr.Warning('您正在使用自然语言控制模式, {}模型不支持此模式, 请使用speech_tts/CosyVoice-300M-Instruct模型'.format(args.model_dir))
            return (target_sr, default_data)
        if instruct_text == '':
            gr.Warning('您正在使用自然语言控制模式, 请输入instruct文本')
            return (target_sr, default_data)
        if prompt_wav is not None or prompt_text != '':
            gr.Info('您正在使用自然语言控制模式, prompt音频/prompt文本会被忽略')

    # Cross-lingual mode needs the base model and a prompt audio; the text to
    # synthesise and the prompt are expected to be in different languages.
    if mode_checkbox_group == '跨语种复刻':
        if cosyvoice.frontend.instruct is True:
            gr.Warning('您正在使用跨语种复刻模式, {}模型不支持此模式, 请使用speech_tts/CosyVoice-300M模型'.format(args.model_dir))
            return (target_sr, default_data)
        if instruct_text != '':
            gr.Info('您正在使用跨语种复刻模式, instruct文本会被忽略')
        if prompt_wav is None:
            gr.Warning('您正在使用跨语种复刻模式, 请提供prompt音频')
            return (target_sr, default_data)
        gr.Info('您正在使用跨语种复刻模式, 请确保合成文本和prompt文本为不同语言')

    # Zero-shot and cross-lingual both need a prompt audio with a high enough
    # sample rate.
    if mode_checkbox_group in ['3s极速复刻', '跨语种复刻']:
        if prompt_wav is None:
            gr.Warning('prompt音频为空,您是否忘记输入prompt音频?')
            return (target_sr, default_data)
        # Probe the file once and reuse the value for the check and the message.
        prompt_sample_rate = torchaudio.info(prompt_wav).sample_rate
        if prompt_sample_rate < prompt_sr:
            gr.Warning('prompt音频采样率{}低于{}'.format(prompt_sample_rate, prompt_sr))
            return (target_sr, default_data)

    # Sft mode only uses the speaker dropdowns.
    if mode_checkbox_group == '预训练音色':
        if instruct_text != '' or prompt_wav is not None or prompt_text != '':
            gr.Info('您正在使用预训练音色模式,prompt文本/prompt音频/instruct文本会被忽略!')

    # Zero-shot mode only uses the prompt audio and its transcript.
    if mode_checkbox_group == '3s极速复刻':
        if prompt_text == '':
            gr.Warning('prompt文本为空,您是否忘记输入prompt文本?')
            return (target_sr, default_data)
        if instruct_text != '':
            gr.Info('您正在使用3s极速复刻模式,预训练音色/instruct文本会被忽略!')

    # Dispatch to the inference call that matches the mode.
    if mode_checkbox_group == '预训练音色':
        logging.info('get sft inference request')
        set_all_random_seed(seed)
        output = cosyvoice.inference_sft(tts_text, sft_dropdown, new_dropdown)
    elif mode_checkbox_group == '3s极速复刻':
        logging.info('get zero_shot inference request')
        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
        set_all_random_seed(seed)
        output = cosyvoice.inference_zero_shot(tts_text, prompt_text, prompt_speech_16k)
    elif mode_checkbox_group == '跨语种复刻':
        logging.info('get cross_lingual inference request')
        prompt_speech_16k = postprocess(load_wav(prompt_wav, prompt_sr))
        set_all_random_seed(seed)
        output = cosyvoice.inference_cross_lingual(tts_text, prompt_speech_16k)
    else:
        logging.info('get instruct inference request')
        set_all_random_seed(seed)
        output = cosyvoice.inference_instruct(tts_text, sft_dropdown, instruct_text, new_dropdown)

    # Start from the unmodified audio so there is always something to return,
    # even when the optional speed change below fails.
    audio_data = output['tts_speech'].numpy().flatten()
    if speed_factor != 1.0:
        try:
            # Clip before the int16 cast: a sample at +1.0 scales to 32768,
            # which would wrap around to -32768.
            pcm = np.clip(audio_data * 32768, -32768, 32767).astype(np.int16)
            audio_data = speed_change(pcm, speed=speed_factor, sr=int(target_sr))
        except Exception:
            # Best effort: keep the original-speed audio and tell the user.
            logging.exception('Failed to change speed of audio')
            gr.Warning('语速调节失败, 将返回原速音频')

    return (target_sr, audio_data)
def generate_audio_stream(tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed, speed_factor, new_dropdown):
    """Synthesise text segment by segment, yielding audio for each segment.

    Only the pre-trained-voice mode is supported. The text is split by
    ``cosyvoice.frontend.text_normalize_stream`` and one chunk of audio is
    yielded per segment.

    The parameters mirror ``generate_audio``. ``prompt_text``,
    ``prompt_wav_upload``, ``prompt_wav_record``, ``instruct_text`` and
    ``seed`` are accepted but not used in this function.

    Yields:
        ``(target_sr, samples)`` for every segment. For an unsupported mode a
        single ``(target_sr, default_data)`` placeholder is yielded. If the
        speed change fails, the segment is yielded at its original speed.
    """
    if mode_checkbox_group != '预训练音色':
        gr.Warning('流式推理只支持预训练音色推理')
        # This function is a generator: a value passed to `return` is never
        # delivered to the consumer, so the placeholder has to be yielded.
        yield (target_sr, default_data)
        return

    use_custom_voice = new_dropdown != "无"
    # With a custom voice, "中文女" is used as the base speaker id and its
    # embeddings/prompts are overwritten below.
    spk_id = "中文女" if use_custom_voice else sft_dropdown

    # Entries copied from the saved voice file into the model input.
    voice_keys = (
        "flow_embedding",
        "llm_embedding",
        "llm_prompt_speech_token",
        "llm_prompt_speech_token_len",
        "flow_prompt_speech_token",
        "flow_prompt_speech_token_len",
        "prompt_speech_feat_len",
        "prompt_speech_feat",
        "prompt_text",
        "prompt_text_len",
    )

    newspk = None
    if use_custom_voice:
        # Load the saved voice once instead of once per segment.
        print(new_dropdown)
        print("读取pt")
        # NOTE(security): torch.load unpickles the file; only load voice
        # files from a trusted ./voices directory.
        newspk = torch.load(f'./voices/{new_dropdown}.pt')

    joblist = cosyvoice.frontend.text_normalize_stream(tts_text, split=True)
    for i in joblist:
        print(i)
        model_input = cosyvoice.frontend.frontend_sft(i, spk_id)
        if newspk is not None:
            for key in voice_keys:
                model_input[key] = newspk[key]

        model_output = next(cosyvoice.model.inference_stream(**model_input))

        # Start from the unmodified audio so every segment yields something,
        # even when the optional speed change fails.
        audio_data = model_output['tts_speech'].numpy().flatten()
        if speed_factor != 1.0:
            try:
                # Clip before the int16 cast to avoid wrap-around at full scale.
                pcm = np.clip(audio_data * 32768, -32768, 32767).astype(np.int16)
                audio_data = speed_change(pcm, speed=speed_factor, sr=int(target_sr))
            except Exception:
                # Best effort: keep the original-speed audio and tell the user.
                logging.exception('Failed to change speed of audio')
                gr.Warning('语速调节失败, 将返回原速音频')

        yield (target_sr, audio_data)
# Build the Gradio Blocks UI, wire the callbacks to the components, then
# enable the request queue and launch the server on args.port.
# Relies on module-level names assigned in the __main__ block (args, sft_spk)
# and on the choice lists built at import time (spk_new, reference_wavs).
# NOTE(review): leading indentation is missing in this view, so the extent of
# each `with` container below cannot be confirmed from here.
def main():
with gr.Blocks() as demo:
# Header: links to the code base and the pre-trained models, then a usage hint.
gr.Markdown("### 代码库 [CosyVoice](https://github.com/FunAudioLLM/CosyVoice) 预训练模型 [CosyVoice-300M](https://www.modelscope.cn/models/speech_tts/CosyVoice-300M) [CosyVoice-300M-Instruct](https://www.modelscope.cn/models/speech_tts/CosyVoice-300M-Instruct) [CosyVoice-300M-SFT](https://www.modelscope.cn/models/speech_tts/CosyVoice-300M-SFT)")
gr.Markdown("#### 请输入需要合成的文本,选择推理模式,并按照提示步骤进行操作")
# Text to synthesise and the speed slider (0.25x to 4x, default 1.0).
tts_text = gr.Textbox(label="输入合成文本", lines=1, value="我是通义实验室语音团队全新推出的生成式语音大模型,提供舒适自然的语音合成能力。")
speed_factor = gr.Slider(minimum=0.25,maximum=4,step=0.05,label="语速调节",value=1.0,interactive=True)
# Mode selector, usage hint, speaker dropdowns, and seed controls.
# NOTE(review): `scale` is given as a float on several components - confirm
# the installed Gradio version accepts non-integer values.
with gr.Row():
mode_checkbox_group = gr.Radio(choices=inference_mode_list, label='选择推理模式', value=inference_mode_list[0])
instruction_text = gr.Text(label="操作步骤", value=instruct_dict[inference_mode_list[0]], scale=0.5)
sft_dropdown = gr.Dropdown(choices=sft_spk, label='选择预训练音色', value=sft_spk[0], scale=0.25)
new_dropdown = gr.Dropdown(choices=spk_new, label='选择新增音色', value=spk_new[0],interactive=True)
# Re-scan the saved custom voices and update the dropdown choices.
refresh_new_button = gr.Button("刷新新增音色")
refresh_new_button.click(fn=refresh_choices, inputs=[], outputs=[new_dropdown])
with gr.Column(scale=0.25):
# Button labelled with a dice emoji; its click handler is attached further down.
seed_button = gr.Button(value="\U0001F3B2")
seed = gr.Number(value=0, label="随机推理种子")
# Reference-audio picker plus upload / microphone prompt inputs.
with gr.Row():
wavs_dropdown = gr.Dropdown(label="参考音频列表",choices=reference_wavs,value="请选择参考音频或者自己上传",interactive=True)
refresh_button = gr.Button("刷新参考音频")
refresh_button.click(fn=change_choices, inputs=[], outputs=[wavs_dropdown])
prompt_wav_upload = gr.Audio(sources='upload', type='filepath', label='选择prompt音频文件,注意采样率不低于16khz')
prompt_wav_record = gr.Audio(sources='microphone', type='filepath', label='录制prompt音频文件')
# Prompt transcript, instruct text, and the name used when saving a voice.
prompt_text = gr.Textbox(label="输入prompt文本", lines=1, placeholder="请输入prompt文本,需与prompt音频内容一致,暂时不支持自动识别...", value='')
instruct_text = gr.Textbox(label="输入instruct文本", lines=1, placeholder="请输入instruct文本.", value='')
new_name = gr.Textbox(label="输入新的音色名称", lines=1, placeholder="输入新的音色名称.", value='')
save_button = gr.Button("保存刚刚推理的zero-shot音色")
# save_name has no outputs bound, so its return value is not routed to any component.
save_button.click(save_name, inputs=[new_name])
# Choosing a reference audio fills both the upload component and the prompt text.
wavs_dropdown.change(change_wav,[wavs_dropdown],[prompt_wav_upload,prompt_text])
generate_button = gr.Button("生成音频")
generate_button_stream = gr.Button("流式生成")
# audio_output = gr.Audio(label="合成音频")
# Output player shared by the one-shot and streaming handlers.
# NOTE(review): autoplay is enabled although the inline comment talks about
# disabling it on Windows - confirm which is intended.
audio_output = gr.Audio(label="合成音频",value=None,
streaming=True,
autoplay=True, # disable auto play for Windows, due to https://developer.chrome.com/blog/autoplay#webaudio
interactive=False,
show_label=True,show_download_button=True)
# result2 = gr.Textbox(label="翻译结果(会在项目目录生成two.srt/two.srt is generated in the current directory)")
# Event wiring: both generate buttons pass the same ten inputs, in the order
# of the handlers' parameters.
seed_button.click(generate_seed, inputs=[], outputs=seed)
generate_button.click(generate_audio,
inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed,speed_factor,new_dropdown],
outputs=[audio_output])
generate_button_stream.click(generate_audio_stream,
inputs=[tts_text, mode_checkbox_group, sft_dropdown, prompt_text, prompt_wav_upload, prompt_wav_record, instruct_text, seed,speed_factor,new_dropdown],
outputs=[audio_output])
# Keep the usage hint in sync with the selected mode.
mode_checkbox_group.change(fn=change_instruction, inputs=[mode_checkbox_group], outputs=[instruction_text])
# Configure the queue (max_size=4, default_concurrency_limit=2), then start
# the server and open a browser tab.
demo.queue(max_size=4, default_concurrency_limit=2)
demo.launch(server_port=args.port,inbrowser=True)
if __name__ == '__main__':
    # Command-line options: the port to serve on and the model to load.
    parser = argparse.ArgumentParser()
    parser.add_argument('--port', type=int, default=8000)
    parser.add_argument(
        '--model_dir',
        type=str,
        default='speech_tts/CosyVoice-300M',
        help='local path or modelscope repo id',
    )
    args = parser.parse_args()

    # Module-level state read by the UI callbacks and by main().
    cosyvoice = CosyVoice(args.model_dir)
    sft_spk = cosyvoice.list_avaliable_spks()

    # Prompt audio sample rate and synthesis output sample rate, in Hz.
    prompt_sr = 16000
    target_sr = 22050

    # Placeholder returned by the handlers when a request is rejected:
    # target_sr zero samples.
    default_data = np.zeros(target_sr)

    main()