Merge pull request #34 from yt605155624/yt_base
Add dataloader for LibriLight 6k
yangdongchao committed Aug 11, 2023
2 parents f4158e3 + 5db1ffa commit 71d6a3c
Showing 25 changed files with 897 additions and 200 deletions.
3 changes: 2 additions & 1 deletion egs_s2/LJSpeech/conf/default.yaml
@@ -73,7 +73,8 @@ solver:

dataloader:
max_token_one_batch: 96000 # affects per-GPU memory usage, 96k for 80G GPU (A100) (LJSpeech)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configures, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
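Note: the num_workers / prefetch_factor bump above recurs in several configs in this commit. Assuming the repo builds a standard PyTorch DataLoader (not confirmed by the diff itself), the two knobs multiply: each worker keeps prefetch_factor batches in flight, so 4 x 50 = 200 batches are buffered in host RAM to hide data-loading latency. A minimal sketch under that assumption:

# Sketch only (assumes a standard torch.utils.data.DataLoader underneath).
from torch.utils.data import DataLoader, Dataset

class ToyPairs(Dataset):
    """Stand-in for the semantic/acoustic token dataset."""
    def __len__(self):
        return 1000
    def __getitem__(self, idx):
        return idx

loader = DataLoader(
    ToyPairs(),
    batch_size=8,
    num_workers=4,       # raised from 2 in this commit
    prefetch_factor=50,  # batches buffered per worker (needs num_workers > 0)
)
for batch in loader:
    pass  # a training step would consume `batch` here
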
40 changes: 25 additions & 15 deletions egs_s2/LibriLight/conf/default.yaml
@@ -1,3 +1,4 @@
# config for LibriLight 6k (small + medium)
# 30k_basex1_hubert_L7km300
model:
target: soundstorm.s2.models.dalle_wav.dalle_wav.DALLE
@@ -27,6 +28,10 @@ model:
timestep_type: 'adalayernorm' # adainsnorm or adalayernorm and abs
mlp_hidden_times: 4
semantic_token_nums: 300
prompt_semantic_emb_len: 10 # should > max_prompt_sec in dataset
target_semantic_emb_len: 30 # should > max_target_sec in dataset
prompt_acoustic_emb_len: 10 # can be same with prompt_semantic
target_acoustic_emb_len: 30 # can be same with target_semantic
content_emb_config:
target: soundstorm.s2.models.dalle_wav.mask_embedding.DalleMaskImageEmbedding
params:
@@ -37,11 +42,11 @@ model:
pos_emb_type: embedding

solver:
base_lr: 0.3e-05 # 3.0e-6 x 8 cause max_token_one_batch is x 8 (the old is 10k)
adjust_lr: none # not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360) 9.2k epoch for LJspeech
save_epochs: 1
dev_epochs: 1
base_lr: 0.3e-05 # 3.0e-6 x 8 cause max_token_one_batch is x 8 (the old is 10k)
adjust_lr: none # not adjust lr according to total batch_size
max_iters: 550000 # 55w iter for 8 GPU for small + medium, ~70 epochs, ~7.8k iter/epoch, training 2.5h/epoch
save_iters: 1500 # 1.5k, ~ cost 0.3h to save a ckpt
dev_iters: 1500 # num of iter for each gpu, for 8 gpu here, should x2 when use 4 gpus to make model see same number of samples
ema:
decay: 0.99
update_interval: 25
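
Note: the hunk above switches the solver from epoch-based to iteration-based control (max_iters / save_iters / dev_iters replace max_epochs / save_epochs / dev_epochs). An illustrative loop with those semantics, not the repo's actual solver code:

# Illustrative only; names and structure are assumptions, not the real solver.
def train_by_iter(model, loader, save_ckpt, evaluate,
                  max_iters=550_000, save_iters=1_500, dev_iters=1_500):
    it = 0
    while it < max_iters:              # epochs no longer bound training
        for batch in loader:
            model.train_step(batch)    # forward/backward/optimizer step
            it += 1
            if it % save_iters == 0:
                save_ckpt(model, it)   # e.g. yields "33000iter.pth" below
            if it % dev_iters == 0:
                evaluate(model)        # dev pass every dev_iters steps
            if it >= max_iters:
                return
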
@@ -69,25 +74,30 @@ solver:
threshold: 1.0e-1
threshold_mode: rel
warmup_lr: 0.45e-3 # the lr to be touched after warmup
warmup: 800 # ~ 2 epoch
warmup: 800 # num of iter to warmup

dataloader:
max_token_one_batch: 30000 # affects per-GPU memory usage, 81k for 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configures, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
- target: soundstorm.s2.data.semantic_dataset_librilight_6k.SemanticDataset
params:
codec_name: hificodec
num_quant: 4 # not work when != 4 for hificodec, and can be 3 for soundstream and encodec
semantic_token_nums: 300 # 1000 for mhubert 500 for en_hubert
semantic_path: dump/train/semantic_token.tsv
acoustic_path: dump/train/acoustic_token/hificodec.pth
semantic_token_nums: 300 # same with num of kmeans bins
max_prompt_sec: 3 # be same with LibriTTS
max_target_sec: 20 # LibriTTS is 10, use 20 here for longer TTS
semantic_dirs: ['dump/small/train/']
acoustic_dirs: ['dump/small/train/acoustic_token/']

dev_datasets:
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
- target: soundstorm.s2.data.semantic_dataset_librilight_6k.SemanticDataset
params:
codec_name: hificodec
num_quant: 4 # not work when != 4 for hificodec, and can be 3 for soundstream and encodec
semantic_token_nums: 300 # 1000 for mhubert 500 for en_hubert
semantic_path: dump/dev/semantic_token.tsv
acoustic_path: dump/dev/acoustic_token/hificodec.pth
semantic_token_nums: 300 # same with num of kmeans bins
max_prompt_sec: 3 # be same with LibriTTS
max_target_sec: 20 # LibriTTS is 10, use 20 here for longer TTS
semantic_dirs: ['dump/small/dev/']
acoustic_dirs: ['dump/small/dev/acoustic_token/']
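
Note: the dataset params change from single-file paths (semantic_path / acoustic_path) to lists of dump directories (semantic_dirs / acoustic_dirs), so the small and medium splits can be combined. A sketch of directory-based shard collection; the shard names (semantic_token_*.tsv, hificodec_*.pth) are inferred from local/test.sh below, and the real semantic_dataset_librilight_6k.SemanticDataset may differ:

# Sketch under the naming assumption above; not the repo's actual loader.
from pathlib import Path
import torch

def collect_shards(semantic_dirs, acoustic_dirs):
    sem, ac = [], []
    for d in semantic_dirs:
        sem += sorted(Path(d).glob("semantic_token_*.tsv"))
    for d in acoustic_dirs:
        ac += sorted(Path(d).glob("hificodec_*.pth"))
    return sem, ac

sem_files, ac_files = collect_shards(
    ["dump/small/train/"], ["dump/small/train/acoustic_token/"])

acoustic = {}  # assumed: each .pth maps utterance id -> acoustic token tensor
for f in ac_files:
    acoustic.update(torch.load(f, map_location="cpu"))
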
93 changes: 0 additions & 93 deletions egs_s2/LibriLight/conf/small_medium_iter.yaml

This file was deleted.

1 change: 0 additions & 1 deletion egs_s2/LibriLight/local/test.sh

This file was deleted.

17 changes: 17 additions & 0 deletions egs_s2/LibriLight/local/test.sh
@@ -0,0 +1,17 @@
#!/bin/bash
# test with test set

config_path=$1
train_output_path=$2
ckpt_name=$3
root_dir=$4
dump_dir=$5

python3 ${BIN_DIR}/test.py \
--config_file=${config_path} \
--ckpt_path=${root_dir}/${train_output_path}/checkpoint/${ckpt_name} \
--test_semantic_path=${root_dir}/${dump_dir}/small/test/semantic_token_0_3.tsv \
--test_acoustic_path=${root_dir}/${dump_dir}/small/test/acoustic_token/hificodec_0_3.pth \
--output_dir=${root_dir}/${train_output_path}/test_output \
--hificodec_model_path=pretrained_model/hificodec/HiFi-Codec-16k-320d-large-universal \
--hificodec_config_path=pretrained_model/hificodec/config_16k_320d.json
16 changes: 10 additions & 6 deletions egs_s2/LibriLight/local/train.sh
@@ -7,14 +7,18 @@ log_frequency=$4
dist_url=$5
dump_dir=$6

python3 ${BIN_DIR}/train_large.py \
omp_num=8

# NOTE: the *_dirs arguments must not be followed by '=' (pass values space-separated)
OMP_NUM_THREADS=${omp_num} python3 ${BIN_DIR}/train_librilight_6k.py \
--config_file=${config_path} \
--train_semantic_path=${root_dir}/${dump_dir}/train/semantic_token.tsv \
--train_acoustic_path=${root_dir}/${dump_dir}/train/acoustic_token/hificodec.pth \
--dev_semantic_path=${root_dir}/${dump_dir}/dev/semantic_token.tsv \
--dev_acoustic_path=${root_dir}/${dump_dir}/dev/acoustic_token/hificodec.pth \
--train_semantic_dirs ''${root_dir}'/'${dump_dir}'/small/train/' ''${root_dir}'/'${dump_dir}'/medium/train/' \
--train_acoustic_dirs ''${root_dir}'/'${dump_dir}'/small/train/acoustic_token/' ''${root_dir}'/'${dump_dir}'/medium/train/acoustic_token/' \
--dev_semantic_dirs ''${root_dir}'/'${dump_dir}'/small/dev/' ''${root_dir}'/'${dump_dir}'/medium/dev/' \
--dev_acoustic_dirs ''${root_dir}'/'${dump_dir}'/small/dev/acoustic_token/' ''${root_dir}'/'${dump_dir}'/medium/dev/acoustic_token/' \
--output=${root_dir}/${train_output_path} \
--log_frequency=${log_frequency} \
--dist_url=${dist_url} \
--hificodec_model_path=pretrained_model/hificodec/HiFi-Codec-16k-320d-large-universal \
--hificodec_config_path=pretrained_model/hificodec/config_16k_320d.json
--hificodec_config_path=pretrained_model/hificodec/config_16k_320d.json \
--train_with_iter=True
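
Note: the '=' restriction in the comment above is an argparse detail: the *_dirs flags presumably use nargs='+', which only collects multiple values when they follow the flag space-separated; --train_semantic_dirs=dir_a dir_b binds dir_a through '=' and leaves dir_b unrecognized. A sketch (the parser construction is an assumption; the flag name is from the script):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--train_semantic_dirs", nargs="+", type=str)

args = parser.parse_args(
    ["--train_semantic_dirs", "dump/small/train/", "dump/medium/train/"])
print(args.train_semantic_dirs)  # ['dump/small/train/', 'dump/medium/train/']

# The '=' form fails with nargs='+':
# parser.parse_args(["--train_semantic_dirs=dump/small/train/",
#                    "dump/medium/train/"])
# -> error: unrecognized arguments: dump/medium/train/
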
@@ -1,23 +1,24 @@
#!/bin/bash
# run_base_L7_km300
# train LibriLight 6k (small + medium) by default
set -e

source path.sh

gpus=0,1,2,3
stage=0
stop_stage=100
train_output_path='exp_librilight/small_medium'
train_output_path='exp_librilight/default'
# dir to set part/all of dump dataset and experiment result
root_dir='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm'
# there should be *.wav, */*.wav or */*/*.wav in data_dir
data_dir='~/datasets/LibriLight'
config_path='conf/small_medium_iter.yaml'
config_path='conf/default.yaml'
log_frequency=1
# 'tcp://%s:%s' % (MASTER_ADDR, MASTER_PORT)
dist_url='tcp://127.0.0.1:29505'
# use which checkpoint file to test
ckpt_name='000301e_471119iter.pth'
ckpt_name='33000iter.pth'
# should be same with ${layer} in hubert_kms.sh
layer=7
# should be same with ${hubert_path} in hubert_kms.sh
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx1_L7km300.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.3e-05 # 3.0e-6 x 8 cause max_token_one_batch is x 8 (the old is 10k)
adjust_lr: none # not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360) 9.2k epoch for LJspeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360) 9.2k epoch for LJspeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects per-GPU memory usage, 81k for 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configures, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx1_L9km500.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.3e-05 # 3.0e-6 x 8 cause max_token_one_batch is x 8 (the old is 10k)
adjust_lr: none # not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360) 9.2k epoch for LJspeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360) 9.2k epoch for LJspeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects per-GPU memory usage, 81k for 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configures, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx2_L10km1024.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.6e-05 # 3.0e-6 x 8 cause max_token_one_batch is x 8 (the old is 10k)
adjust_lr: none # not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360) 9.2k epoch for LJspeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360) 9.2k epoch for LJspeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects per-GPU memory usage, 81k for 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configures, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx2_L9km500.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.6e-05 # 3.0e-6 x 8 cause max_token_one_batch is x 8 (the old is 10k)
adjust_lr: none # not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360) 9.2k epoch for LJspeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360) 9.2k epoch for LJspeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects per-GPU memory usage, 81k for 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configures, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
4 changes: 3 additions & 1 deletion egs_s2/LibriTTS/local/train.sh
@@ -7,7 +7,9 @@ log_frequency=$4
dist_url=$5
dump_dir=$6

python3 ${BIN_DIR}/train.py \
omp_num=8

OMP_NUM_THREADS=${omp_num} python3 ${BIN_DIR}/train.py \
--config_file=${config_path} \
--train_semantic_path=${root_dir}/${dump_dir}/train/semantic_token.tsv \
--train_acoustic_path=${root_dir}/${dump_dir}/train/acoustic_token/hificodec.pth \
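Note: exporting OMP_NUM_THREADS=8 in the training scripts caps OpenMP/MKL threads per process, so multi-process training does not oversubscribe CPU cores. The in-process equivalent for PyTorch, as a sketch (thread counts must be set before numeric work starts):

import os
os.environ.setdefault("OMP_NUM_THREADS", "8")  # before torch spins up its pools

import torch
torch.set_num_threads(8)  # intra-op (OpenMP/MKL) threads per process
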
4 changes: 3 additions & 1 deletion egs_s2/LibriTTS/local/train_iter.sh
@@ -7,7 +7,9 @@ log_frequency=$4
dist_url=$5
dump_dir=$6

python3 ${BIN_DIR}/train.py \
omp_num=8

OMP_NUM_THREADS=${omp_num} python3 ${BIN_DIR}/train.py \
--config_file=${config_path} \
--train_semantic_path=${root_dir}/${dump_dir}/train/semantic_token.tsv \
--train_acoustic_path=${root_dir}/${dump_dir}/train/acoustic_token/hificodec.pth \
2 changes: 1 addition & 1 deletion egs_s2/LibriTTS/run.sh
@@ -61,5 +61,5 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh \
${config_path} ${train_output_path} ${ckpt_name} ${root_dir} \
${hubert_path} ${quantizer_path} ${prompt_wav_path} \
${S1_config_file} ${S1_ckpt_path} ${sil_token}|| exit -1
${S1_config_file} ${S1_ckpt_path} ${sil_token} || exit -1
fi
5 changes: 2 additions & 3 deletions egs_s2/LibriTTS/run_base_L7_km300.sh
@@ -24,8 +24,7 @@ hubert_path=pretrained_model/hubert/hubert_base_ls960.pt
quantizer_path=pretrained_model/hubert/train-clean-360_hubert_base_ls960_L7_km300.bin
dump_dir=dump_libritts_universal_hificodec
# for synthesize_e2e.sh
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_l
ibritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_libritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
S1_config_file='../../egs_s1/AR/LibriTTS/conf/base_L7bin300.yaml'
S1_ckpt_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/ar_s1/SoundStorm/exp/base_L7_km300/ckpt/epoch=99-step=49000.ckpt'
# 4 for 300 bins; modify this according to your own dump data
@@ -61,5 +60,5 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh \
${config_path} ${train_output_path} ${ckpt_name} ${root_dir} \
${hubert_path} ${quantizer_path} ${prompt_wav_path} \
${S1_config_file} ${S1_ckpt_path} ${sil_token}|| exit -1
${S1_config_file} ${S1_ckpt_path} ${sil_token} || exit -1
fi
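
Note: quantizer_path above points at a 300-cluster k-means model over HuBERT layer-7 features (L7_km300), matching semantic_token_nums: 300 in the configs. A sketch of semantic-token extraction, assuming a joblib-serialized scikit-learn KMeans as in the common fairseq HuBERT discretization recipe (the repo's loader may differ):

import joblib
import numpy as np

km = joblib.load("pretrained_model/hubert/"
                 "train-clean-360_hubert_base_ls960_L7_km300.bin")

feats = np.random.randn(250, 768).astype(np.float32)  # stand-in HuBERT L7 features
semantic_tokens = km.predict(feats)  # frame-level ids in [0, 300)
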
5 changes: 2 additions & 3 deletions egs_s2/LibriTTS/run_base_L7_km300_iter.sh
@@ -24,8 +24,7 @@ hubert_path=pretrained_model/hubert/hubert_base_ls960.pt
quantizer_path=pretrained_model/hubert/train-clean-360_hubert_base_ls960_L7_km300.bin
dump_dir=dump_libritts_universal_hificodec
# for synthesize_e2e.sh
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_l
ibritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_libritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
S1_config_file='../../egs_s1/AR/LibriTTS/conf/base_L7bin300.yaml'
S1_ckpt_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/ar_s1/SoundStorm/exp/base_L7_km300/ckpt/epoch=99-step=49000.ckpt'
# 4 for 300 bins; modify this according to your own dump data
@@ -61,5 +60,5 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh \
${config_path} ${train_output_path} ${ckpt_name} ${root_dir} \
${hubert_path} ${quantizer_path} ${prompt_wav_path} \
${S1_config_file} ${S1_ckpt_path} ${sil_token}|| exit -1
${S1_config_file} ${S1_ckpt_path} ${sil_token} || exit -1
fi