In [1]:
# Check GPU configuration: nvidia-smi reports the driver/CUDA versions and the
# current memory usage and utilization of each visible GPU.
!nvidia-smi

Tue Jan 23 22:33:23 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 546.65                 Driver Version: 546.65       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 4070 Ti   WDDM  | 00000000:01:00.0  On |                  N/A |
|  0%   34C    P8              13W / 285W |   3020MiB / 12282MiB |     14%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

##(2023/07/23) 这个笔记本参考[HWcomss](https://github.com/HWcomss)的版本修改而成，现已可以正常工作。
##(23/07/2023) This notebook is a slightly modified version of [HWcomss](https://github.com/HWcomss)'s notebook; it is working fine now. Many thanks!


In [None]:
#@title STEP 1 复制代码库并安装运行环境
#@markdown #STEP 1 (6 min)
#@markdown ##复制代码库并安装运行环境
#@markdown ##Clone repository & Build environment

# Fetch the project sources into ./VITS-fast-fine-tuning.
!git clone https://github.com/Plachtaa/VITS-fast-fine-tuning.git
# Packages installed before the project requirements. Note that imageio is
# pinned to 2.4.1 while the others take whatever version pip resolves.
!python -m pip install --upgrade --force-reinstall regex
!python -m pip install --force-reinstall soundfile
!python -m pip install --force-reinstall gradio
!python -m pip install imageio==2.4.1
!python -m pip install --upgrade youtube-dl
!python -m pip install moviepy
# Work from the repository root from here on; the later cells use paths
# relative to it (./pretrained_models, ./configs, scripts/...).
%cd VITS-fast-fine-tuning

# Project requirements, installed with pip's build isolation disabled.
!python -m pip install --no-build-isolation -r requirements.txt
# These run after requirements.txt, so they override whatever versions it selected.
!python -m pip install --upgrade numpy
!python -m pip install --upgrade --force-reinstall numba
!python -m pip install --upgrade Cython

!python -m pip install --upgrade pyzmq
!python -m pip install pydantic==1.10.4
!python -m pip install ruamel.yaml
# Whisper is installed straight from its GitHub repository rather than from PyPI.
!python -m pip install git+https://github.com/openai/whisper.git
# gdown is the Google Drive downloader used by the pretrained-model cell.
!python -m pip install gdown

# build monotonic align
%cd monotonic_align/
!mkdir monotonic_align
!python setup.py build_ext --inplace
%cd ..
!mkdir pretrained_models
# download data for fine-tuning
!wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/sampled_audio4ft_v2.zip
!unzip sampled_audio4ft_v2.zip
# create necessary directories
!mkdir video_data
!mkdir raw_audio
!mkdir denoised_audio
!mkdir custom_character_voice
!mkdir segmented_character_voice

!export LC_ALL="en_US.UTF-8"
!export LD_LIBRARY_PATH="/usr/lib64-nvidia"
!export LIBRARY_PATH="/usr/local/cuda/lib64/stubs"
!ldconfig /usr/lib64-nvidia

In [None]:
#@title STEP 1.5 选择预训练模型
#@markdown ###STEP 1.5 选择预训练模型
#@markdown ###Choose pretrained model to start
#@markdown CJE为中日英三语模型，CJ为中日双语模型，C为纯中文模型

#@markdown CJE for Chinese, Japanese & English model，CJ for Chinese & Japanese model
PRETRAINED_MODEL = "CJKE" # @param ["CJE", "CJ", "C", "CJKE"]
if PRETRAINED_MODEL == "CJ":
  !wget https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/model/D_0-p.pth -O ./pretrained_models/D_0.pth
  !wget https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/model/G_0-p.pth -O ./pretrained_models/G_0.pth
  !wget https://huggingface.co/spaces/sayashi/vits-uma-genshin-honkai/resolve/main/model/config.json -O ./configs/finetune_speaker.json
elif PRETRAINED_MODEL == "CJE":
  !wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/D_trilingual.pth -O ./pretrained_models/D_0.pth
  !wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/G_trilingual.pth -O ./pretrained_models/G_0.pth
  !wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/configs/uma_trilingual.json -O ./configs/finetune_speaker.json
elif PRETRAINED_MODEL == "C":
  !wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/VITS-Chinese/D_0.pth -O ./pretrained_models/D_0.pth
  !wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/VITS-Chinese/G_0.pth -O ./pretrained_models/G_0.pth
  !wget https://huggingface.co/datasets/Plachta/sampled_audio4ft/resolve/main/VITS-Chinese/config.json -O ./configs/finetune_speaker.json
elif PRETRAINED_MODEL == "CJKE":
  !wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/D_trilingual.pth -O ./pretrained_models/D_0.pth
  !wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/pretrained_models/G_trilingual.pth -O ./pretrained_models/G_0.pth
  !wget https://huggingface.co/spaces/Plachta/VITS-Umamusume-voice-synthesizer/resolve/main/configs/uma_trilingual.json -O ./configs/finetune_speaker.json
  !gdown --fuzzy https://drive.google.com/file/d/1nwbG7fM5kztBFD-BG2wOcRUF0FUrEFQ3/view?usp=drive_link -O /content/VITS-fast-fine-tuning/scripts/short_audio_transcribe.py
  !gdown --fuzzy https://drive.google.com/file/d/15TUws0mxZqovwHjp3ZjFnD5PTXOMl2sG/view?usp=drive_link -O /content/VITS-fast-fine-tuning/scripts/long_audio_transcribe.py
  !gdown --fuzzy https://drive.google.com/file/d/1_ohqDjFbVJi1upxyJHMUg5kvveHo6oRe/view?usp=drive_link -O /content/VITS-fast-fine-tuning/preprocess_v2.py
  !gdown --fuzzy https://drive.google.com/file/d/19BWvspGUMpw0wIzBLRkJCwKxh_1fvDYo/view?usp=sharing -O /content/VITS-fast-fine-tuning/VC_inference.py




In [None]:
#@title （可选）加载Google云端硬盘 / Mount Google drive
#@title (optional)

#@markdown 加载Google云端硬盘（更快地上传数据集文件）

#@markdown Mount Google drive for faster data upload
# Requires the google.colab package. Drive is mounted at /content/drive, which
# is the location the Drive-based upload/download cells refer to.
from google.colab import drive
drive.mount('/content/drive')

## STEP 2 上传您的角色音频数据
## Upload your character voices
见[数据集上传选项](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA.MD)
See [data upload options](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/DATA_EN.MD)


### STEP 2.1 上传短音频
### Short audio upload

In [None]:
#@markdown 上传选项1：运行该代码块会出现一个文件上传的入口

#@markdown Upload option 1: Running this code block will prompt you to upload a file.
# Run the upload helper in zip mode, then unpack the archive inside
# ./custom_character_voice/. The archive path below is where the helper is
# expected to store the upload — confirm against scripts/voice_upload.py.
%run scripts/voice_upload.py --type zip
!unzip ./custom_character_voice/custom_character_voice.zip -d ./custom_character_voice/

In [None]:
#@markdown 上传选项2：若您装载了Google云端硬盘，可以直接从Google云端硬盘加载文件。将`.zip`文件上传云端硬盘后，在下面填写文件路径：

#@markdown Upload option 2: If you have mounted Google drive, you can load your files from Google drive directly. After uploading your `.zip` file to Google drive, fill in the path to your file below:
ZIP_PATH = "../drive/MyDrive/samples.zip"  #@param {type:"string"}
!cp {ZIP_PATH} ./custom_character_voice/custom_character_voice.zip
!unzip ./custom_character_voice/custom_character_voice.zip -d ./custom_character_voice/

### STEP 2.2 上传长音频 （单个不应长于20分钟）
### Long audio upload (each file should be no longer than 20 minutes)

In [None]:
#@markdown 上传选项1：运行该代码块会出现一个文件上传的入口

#@markdown Upload option 1: Running this code block will prompt you to upload a file.
# Run the upload helper in audio mode. Where the files end up is decided by
# scripts/voice_upload.py (presumably ./raw_audio/ — confirm).
%run scripts/voice_upload.py --type audio

In [None]:
#@markdown 上传选项2：若您装载了Google云端硬盘，可以直接从Google云端硬盘加载文件。将所有长音频文件上传至云端硬盘的同一个文件夹下，在下面填写文件夹路径：

#@markdown Upload option 2: If you have mounted Google drive, you can load your files from Google drive directly. Put all the long audios under one folder, and fill in the path to your folder below:
AUDIO_FOLDER_PATH = "../drive/MyDrive/ren/"  #@param {type:"string"}
!cp {AUDIO_FOLDER_PATH}/* ./raw_audio/

### STEP 2.3 上传视频（单个不应长于20分钟）
### Video upload (each file should be no longer than 20 minutes)

In [None]:
#@markdown 上传选项1：运行该代码块会出现一个文件上传的入口

#@markdown Upload option 1: Running this code block will prompt you to upload a file.
# Run the upload helper in video mode. Where the files end up is decided by
# scripts/voice_upload.py (presumably ./video_data/ — confirm).
%run scripts/voice_upload.py --type video

In [None]:
#@markdown 上传选项2：若您装载了Google云端硬盘，可以直接从Google云端硬盘加载文件。将所有视频文件上传至云端硬盘的同一个文件夹下，在下面填写文件夹路径:

#@markdown Upload option 2: If you have mounted Google drive, you can load your files from Google drive directly. Put all the videos under one folder, and fill in the path to your folder below:
VIDEO_FOLDER_PATH = "../drive/MyDrive/kane/"  #@param {type:"string"}
!cp {VIDEO_FOLDER_PATH}/* ./video_data/

### STEP 2.4 上传视频链接（单个不应长于20分钟）
### Video link upload (each video should be no longer than 20 minutes)

In [None]:
#@markdown 运行该代码块会出现一个文件上传的入口，上传单个`.txt`文件。若格式正确的话，视频会自动下载并将下载后的文件名打印在下方。

#@markdown Running this code block will prompt you to upload a file.
#@markdown Please upload a single `.txt` file. If you have put the links in the correct format,
#@markdown the videos will be automatically downloaded and displayed below.
# Run the link-download helper, then list ./video_data/ so the downloaded
# files are visible in the cell output.
%run scripts/download_video.py
!ls ./video_data/

## STEP 3 自动处理所有上传的数据

In [None]:
#@markdown 运行该单元格会对所有上传的数据进行自动去背景音&标注。
#@markdown 由于需要调用Whisper和Demucs，运行时间可能较长。

#@markdown Running this codeblock will perform automatic vocal separation & annotation.
#@markdown Since this step uses Whisper & Demucs, it may take a while to complete.
# Extract audio from every video (uploaded or downloaded; must be .mp4).
%run scripts/video2audio.py
# Denoise every audio file (uploaded or extracted from video; must be .wav).
!python scripts/denoise_audio.py
# Segment and transcribe long audio. PRETRAINED_MODEL (chosen in STEP 1.5) is
# passed as the language set.
!python scripts/long_audio_transcribe.py --languages "{PRETRAINED_MODEL}" --whisper_size large-v2
# Transcribe short audio.
!python scripts/short_audio_transcribe.py --languages "{PRETRAINED_MODEL}" --whisper_size large-v2
# The base model's sample rate may differ from the auxiliary data's, so resample.
!python scripts/resample.py

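#！！！训练质量相关：实验发现目前使用CJ模型+勾选ADD_AUXILIARY，对于中/日均能训练出最好的效果，第一次训练建议默认使用该组合！！！

#!!! Training-quality note: experiments show that the CJ model with ADD_AUXILIARY checked currently gives the best results for both Chinese and Japanese; this combination is the recommended default for a first training run !!!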

In [None]:
#@markdown ##STEP 3.5
#@markdown 运行该单元格会生成划分好训练/测试集的最终标注，以及配置文件

#@markdown Running this block will generate final annotations for training & validation, as well as config file.

#@markdown 选择是否加入辅助训练数据：/ Choose whether to add auxiliary data:
ADD_AUXILIARY = True #@param {type:"boolean"}
#@markdown 辅助训练数据是从预训练的大数据集抽样得到的，作用在于防止模型在标注不准确的数据上形成错误映射。

#@markdown Auxiliary data is to prevent overfitting when the audio samples are small or with low quality.

#@markdown 以下情况请勾选：

#@markdown 总样本少于100条/样本质量一般或较差/样本来自爬取的视频

#@markdown 以下情况可以不勾选：

#@markdown 总样本量很大/样本质量很高/希望加速训练/只有二次元角色

# assert(not (ADD_AUXILIARY and PRETRAINED_MODEL != "CJE")), "add auxiliary data is available only available for CJE model!"
if ADD_AUXILIARY:
  %run preprocess_v2.py --add_auxiliary_data True --languages "{PRETRAINED_MODEL}"
else:
  %run preprocess_v2.py --languages "{PRETRAINED_MODEL}"

## STEP 4 开始训练

In [None]:
#@markdown #STEP 4 (>=20 min)
#@markdown 开始微调模型。
#@markdown 训练时长取决于你录入/上传的音频总数。

#@markdown 根据声线和样本质量的不同，所需的训练epochs数也不同。

#@markdown 你也可以在Tensorboard中预览合成效果，若效果满意可提前停止。

#@markdown Model fine-tuning
#@markdown Total time cost depends on the number of voices you recorded/uploaded.

#@markdown Best epoch number varies depending on different uploaded voices / sample quality.

#@markdown You can also preview synthezied audio in Tensorboard, it's OK to shut down training manually if you find the quality is satisfying.
import os
os.environ['TENSORBOARD_BINARY'] = '/usr/local/bin/tensorboard'

if os.path.exists("/content/drive/MyDrive/"):
  !python scripts/rearrange_speaker.py
  !cp ./finetune_speaker.json ../drive/MyDrive/finetune_speaker.json
  !cp ./moegoe_config.json ../drive/MyDrive/moegoe_config.json

%reload_ext tensorboard
%tensorboard --logdir "./OUTPUT_MODEL"
Maximum_epochs = "200" #@param {type:"string"}
#@markdown 继续之前的模型训练/Continue training from previous checkpoint
CONTINUE = False #@param {type:"boolean"}
if CONTINUE:
  !python finetune_speaker_v2.py -m "./OUTPUT_MODEL" --max_epochs "{Maximum_epochs}" --drop_speaker_embed False --cont True
else:
  !python finetune_speaker_v2.py -m "./OUTPUT_MODEL" --max_epochs "{Maximum_epochs}" --drop_speaker_embed True

In [None]:
!pip install gradio

In [None]:
!pip uninstall gradio --yes
!pip install gradio==3.50.2
#@markdown ### 微调完成后，在这里尝试效果。
#@markdown ### 运行后会输出一个public URL, 点击进入网页版UI以使用模型
#@markdown ### Try out TTS & VC quality here after fine-tuning is finished.
!cp ./configs/modified_finetune_speaker.json ./finetune_speaker.json
!python VC_inference.py --model_dir ./OUTPUT_MODEL/G_latest.pth --share True

In [None]:
!pip uninstall gradio
!pip install gradio==3.50.2

# STEP 5 下载模型
## 本地部署方法请见[README](https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/README_ZH.md)

In [None]:
#@markdown ### 下载选项1：运行该单元格，浏览器会自动下载模型和配置文件
#@markdown ### Download option 1: Running this codeblock will download model & config files by your browser.
# rearrange_speaker.py runs before the download helper; presumably it prepares
# the files that download_model.py sends — confirm against the scripts.
!python scripts/rearrange_speaker.py
%run scripts/download_model.py

In [None]:
#@markdown ### 下载选项2：运行该单元格会将模型和配置文件保存到Google云端硬盘
#@markdown ### Download option 2: Running this codeblock will save the model & config files to your Google drive.
# NOTE(review): MODEL12312 is a hard-coded destination folder name; change it
# on every line below if a different folder is wanted.
# Plain `mkdir` (no -p) fails when Drive is not mounted rather than creating a
# local ../drive tree.
!mkdir ../drive/MyDrive/MODEL12312
# presumably produces the model/config files copied below — confirm against the script
!python scripts/rearrange_speaker.py
# Copy the generator checkpoint and both config files into the Drive folder.
!cp ./G_latest.pth ../drive/MyDrive/MODEL12312/G_latest.pth
!cp ./finetune_speaker.json ../drive/MyDrive/MODEL12312/finetune_speaker.json
!cp ./moegoe_config.json ../drive/MyDrive/MODEL12312/moegoe_config.json

In [None]:
# Create a MODEL1 folder on the mounted Google Drive.
# NOTE(review): no other cell visible in this notebook reads from or writes to
# MODEL1 — confirm whether this cell is still needed.
!mkdir /content/drive/MyDrive/MODEL1

In [None]:
#@markdown ### 运行该单元格会清空所有已上传的样本，需要时可使用
#@markdown ### Running this codeblock will delete all voice samples you have uploaded. Use it if you need.
!rm -rf ./custom_character_voice/*
!rm -rf ./video_data/*
!rm -rf ./raw_audio/*
!rm -rf ./denoised_audio/*
!rm -rf ./segmented_character_voice/*
!rm -rf long_character_anno.txt
!rm -rf short_character_anno.txt

In [None]:
#@markdown ### 运行该单元格会将切片和标注复制到谷歌云端硬盘根目录下名为`voice_data`的文件夹下以用作其它用途
#@markdown ### Running this codeblock will copy all processed voices & annotations to a folder named `voice_data` under the root of Google Drive for other purpose of usage
!mkdir ../drive/MyDrive/voice_data/
!cp -rf ./custom_character_voice/ ../drive/MyDrive/voice_data/custom_character_voice/
!cp -rf ./segmented_character_voice/ ../drive/MyDrive/voice_data/segmented_character_voice/
!cp long_character_anno.txt ../drive/MyDrive/voice_data/long_character_anno.txt
!cp short_character_anno.txt ../drive/MyDrive/voice_data/short_character_anno.txt