Add dataloader for LibriLight 6k #34

Merged · 8 commits · Aug 11, 2023
3 changes: 2 additions & 1 deletion egs_s2/LJSpeech/conf/default.yaml
@@ -73,7 +73,8 @@ solver:

dataloader:
max_token_one_batch: 96000 # affects single-GPU memory usage; 96k for an 80G GPU (A100) (LJSpeech)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
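For context on the `num_workers` / `prefetch_factor` changes: both map directly onto `torch.utils.data.DataLoader` arguments, and `prefetch_factor` counts batches preloaded per worker, so 4 workers × 50 keeps up to 200 batches buffered. A minimal self-contained sketch with a toy dataset (not the repo's `SemanticDataset`):

```python
import torch
from torch.utils.data import DataLoader, Dataset


class ToyTokenDataset(Dataset):
    """Stand-in for the repo's SemanticDataset; yields fake token rows."""

    def __len__(self):
        return 1024

    def __getitem__(self, idx):
        # 100 fake semantic tokens drawn from a 300-entry codebook
        return torch.randint(0, 300, (100,))


if __name__ == "__main__":
    loader = DataLoader(
        ToyTokenDataset(),
        batch_size=8,
        num_workers=4,       # dataloader.num_workers in the config
        prefetch_factor=50,  # batches preloaded per worker: 4 x 50 = up to 200
    )
    for batch in loader:     # each batch: LongTensor of shape (8, 100)
        pass                 # a training step would consume `batch` here
```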
40 changes: 25 additions & 15 deletions egs_s2/LibriLight/conf/default.yaml
@@ -1,3 +1,4 @@
# config for LibriLight 6k (small + medium)
# 30k_basex1_hubert_L7km300
model:
target: soundstorm.s2.models.dalle_wav.dalle_wav.DALLE
@@ -27,6 +28,10 @@ model:
timestep_type: 'adalayernorm' # adainsnorm or adalayernorm and abs
mlp_hidden_times: 4
semantic_token_nums: 300
prompt_semantic_emb_len: 10 # should be > max_prompt_sec in the dataset
target_semantic_emb_len: 30 # should be > max_target_sec in the dataset
prompt_acoustic_emb_len: 10 # can be the same as prompt_semantic
target_acoustic_emb_len: 30 # can be the same as target_semantic
content_emb_config:
target: soundstorm.s2.models.dalle_wav.mask_embedding.DalleMaskImageEmbedding
params:
@@ -37,11 +42,11 @@ model:
pos_emb_type: embedding

solver:
base_lr: 0.3e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
save_epochs: 1
dev_epochs: 1
base_lr: 0.3e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_iters: 550000 # 550k iters on 8 GPUs for small + medium, ~70 epochs at ~7.8k iters/epoch, ~2.5 h of training per epoch
save_iters: 1500 # 1.5k; saving a ckpt costs ~0.3 h
dev_iters: 1500 # number of iters per GPU, assuming 8 GPUs here; double it when using 4 GPUs so the model sees the same number of samples
ema:
decay: 0.99
update_interval: 25
@@ -69,25 +74,30 @@ solver:
threshold: 1.0e-1
threshold_mode: rel
warmup_lr: 0.45e-3 # the lr reached after warmup
warmup: 800 # ~ 2 epochs
warmup: 800 # number of iters to warm up

dataloader:
max_token_one_batch: 30000 # affects single-GPU memory usage; 81k for an 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
- target: soundstorm.s2.data.semantic_dataset_librilight_6k.SemanticDataset
params:
codec_name: hificodec
num_quant: 4 # has no effect when != 4 for hificodec; can be 3 for soundstream and encodec
semantic_token_nums: 300 # 1000 for mhubert, 500 for en_hubert
semantic_path: dump/train/semantic_token.tsv
acoustic_path: dump/train/acoustic_token/hificodec.pth
semantic_token_nums: 300 # same as the number of kmeans bins
max_prompt_sec: 3 # same as LibriTTS
max_target_sec: 20 # LibriTTS uses 10; use 20 here for longer TTS
semantic_dirs: ['dump/small/train/']
acoustic_dirs: ['dump/small/train/acoustic_token/']

dev_datasets:
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
- target: soundstorm.s2.data.semantic_dataset_librilight_6k.SemanticDataset
params:
codec_name: hificodec
num_quant: 4 # has no effect when != 4 for hificodec; can be 3 for soundstream and encodec
semantic_token_nums: 300 # 1000 for mhubert, 500 for en_hubert
semantic_path: dump/dev/semantic_token.tsv
acoustic_path: dump/dev/acoustic_token/hificodec.pth
semantic_token_nums: 300 # same as the number of kmeans bins
max_prompt_sec: 3 # same as LibriTTS
max_target_sec: 20 # LibriTTS uses 10; use 20 here for longer TTS
semantic_dirs: ['dump/small/dev/']
acoustic_dirs: ['dump/small/dev/acoustic_token/']
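On `max_token_one_batch`: the batching code is not part of this diff, but the setting suggests a token-budget batcher that packs utterances until the padded batch would exceed the budget. A hypothetical sketch of that idea (function name and padded-cost model are assumptions, not the repo's implementation):

```python
# Greedily pack sample indices into batches so that the padded token count
# (longest sample x batch size) never exceeds max_tokens.
from typing import List


def pack_by_token_budget(lengths: List[int], max_tokens: int) -> List[List[int]]:
    """Group sample indices so each batch's padded size stays under max_tokens."""
    batches, current, max_len = [], [], 0
    for idx, length in enumerate(lengths):
        new_max = max(max_len, length)
        # padded cost = longest sample in the batch x number of samples
        if current and new_max * (len(current) + 1) > max_tokens:
            batches.append(current)
            current, new_max = [], length
        current.append(idx)
        max_len = new_max
    if current:
        batches.append(current)
    return batches


# With a 30k budget, short utterances share a batch while long ones
# end up nearly alone:
print(pack_by_token_budget([500, 800, 12000, 700], max_tokens=30000))
# -> [[0, 1], [2, 3]]
```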
93 changes: 0 additions & 93 deletions egs_s2/LibriLight/conf/small_medium_iter.yaml

This file was deleted.

1 change: 0 additions & 1 deletion egs_s2/LibriLight/local/test.sh

This file was deleted.

17 changes: 17 additions & 0 deletions egs_s2/LibriLight/local/test.sh
@@ -0,0 +1,17 @@
#!/bin/bash
# test with the test set

config_path=$1
train_output_path=$2
ckpt_name=$3
root_dir=$4
dump_dir=$5

python3 ${BIN_DIR}/test.py \
--config_file=${config_path} \
--ckpt_path=${root_dir}/${train_output_path}/checkpoint/${ckpt_name} \
--test_semantic_path=${root_dir}/${dump_dir}/small/test/semantic_token_0_3.tsv \
--test_acoustic_path=${root_dir}/${dump_dir}/small/test/acoustic_token/hificodec_0_3.pth \
--output_dir=${root_dir}/${train_output_path}/test_output \
--hificodec_model_path=pretrained_model/hificodec/HiFi-Codec-16k-320d-large-universal \
--hificodec_config_path=pretrained_model/hificodec/config_16k_320d.json
16 changes: 10 additions & 6 deletions egs_s2/LibriLight/local/train.sh
@@ -7,14 +7,18 @@ log_frequency=$4
dist_url=$5
dump_dir=$6

python3 ${BIN_DIR}/train_large.py \
omp_num=8

# Note: the *_dirs arguments must not be followed by '='
OMP_NUM_THREADS=${omp_num} python3 ${BIN_DIR}/train_librilight_6k.py \
--config_file=${config_path} \
--train_semantic_path=${root_dir}/${dump_dir}/train/semantic_token.tsv \
--train_acoustic_path=${root_dir}/${dump_dir}/train/acoustic_token/hificodec.pth \
--dev_semantic_path=${root_dir}/${dump_dir}/dev/semantic_token.tsv \
--dev_acoustic_path=${root_dir}/${dump_dir}/dev/acoustic_token/hificodec.pth \
--train_semantic_dirs ''${root_dir}'/'${dump_dir}'/small/train/' ''${root_dir}'/'${dump_dir}'/medium/train/' \
--train_acoustic_dirs ''${root_dir}'/'${dump_dir}'/small/train/acoustic_token/' ''${root_dir}'/'${dump_dir}'/medium/train/acoustic_token/' \
--dev_semantic_dirs ''${root_dir}'/'${dump_dir}'/small/dev/' ''${root_dir}'/'${dump_dir}'/medium/dev/' \
--dev_acoustic_dirs ''${root_dir}'/'${dump_dir}'/small/dev/acoustic_token/' ''${root_dir}'/'${dump_dir}'/medium/dev/acoustic_token/' \
--output=${root_dir}/${train_output_path} \
--log_frequency=${log_frequency} \
--dist_url=${dist_url} \
--hificodec_model_path=pretrained_model/hificodec/HiFi-Codec-16k-320d-large-universal \
--hificodec_config_path=pretrained_model/hificodec/config_16k_320d.json
--hificodec_config_path=pretrained_model/hificodec/config_16k_320d.json \
--train_with_iter=True
@@ -1,23 +1,24 @@
#!/bin/bash
# run_base_L7_km300
# train LibriLight 6k (small + medium) by default
set -e

source path.sh

gpus=0,1,2,3
stage=0
stop_stage=100
train_output_path='exp_librilight/small_medium'
train_output_path='exp_librilight/default'
# dir that holds part/all of the dumped dataset and experiment results
root_dir='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm'
# there should be *.wav, */*.wav or */*/*.wav in data_dir
data_dir='~/datasets/LibriLight'
config_path='conf/small_medium_iter.yaml'
config_path='conf/default.yaml'
log_frequency=1
# 'tcp://%s:%s' % (MASTER_ADDR, MASTER_PORT)
dist_url='tcp://127.0.0.1:29505'
# which checkpoint file to use for testing
ckpt_name='000301e_471119iter.pth'
ckpt_name='33000iter.pth'
# should be the same as ${layer} in hubert_kms.sh
layer=7
# should be the same as ${hubert_path} in hubert_kms.sh
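The switch to `--train_with_iter=True`, together with `max_iters`/`save_iters`/`dev_iters` and checkpoint names like `33000iter.pth`, implies an iteration-driven loop rather than an epoch-driven one. A hedged sketch of that control flow (the actual trainer in `train_librilight_6k.py` is not shown in this diff and may differ):

```python
def train_loop(loader, train_step, save_ckpt, run_dev,
               max_iters=550_000, save_iters=1_500, dev_iters=1_500):
    it = 0
    while it < max_iters:                    # stop on iterations, not epochs
        for batch in loader:                 # the loader is re-entered each epoch
            it += 1
            train_step(batch)                # one optimizer step
            if it % save_iters == 0:
                save_ckpt(f"{it}iter.pth")   # e.g. 33000iter.pth as in run.sh
            if it % dev_iters == 0:
                run_dev()                    # dev pass every dev_iters steps
            if it >= max_iters:
                return


if __name__ == "__main__":
    # tiny demo with scaled-down limits
    train_loop(range(8), lambda b: None,
               lambda name: print("saved", name), lambda: None,
               max_iters=20, save_iters=10, dev_iters=10)
```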
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx1_L7km300.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.3e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects single-GPU memory usage; 81k for an 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx1_L9km500.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.3e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects single-GPU memory usage; 81k for an 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx2_L10km1024.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.6e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects single-GPU memory usage; 81k for an 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx2_L9km500.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.6e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects single-GPU memory usage; 81k for an 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
4 changes: 3 additions & 1 deletion egs_s2/LibriTTS/local/train.sh
@@ -7,7 +7,9 @@ log_frequency=$4
dist_url=$5
dump_dir=$6

python3 ${BIN_DIR}/train.py \
omp_num=8

OMP_NUM_THREADS=${omp_num} python3 ${BIN_DIR}/train.py \
--config_file=${config_path} \
--train_semantic_path=${root_dir}/${dump_dir}/train/semantic_token.tsv \
--train_acoustic_path=${root_dir}/${dump_dir}/train/acoustic_token/hificodec.pth \
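Why `OMP_NUM_THREADS` is now pinned in the launch scripts: with `num_workers: 4`, each worker process inherits the intra-op thread setting, so leaving it at the PyTorch default (all cores) would oversubscribe the CPU. A minimal sketch of the equivalent in-process control:

```python
import os

import torch

# torch.set_num_threads controls intra-op parallelism directly; exporting
# OMP_NUM_THREADS in train.sh additionally covers spawned worker processes.
n = int(os.environ.get("OMP_NUM_THREADS", "8"))
torch.set_num_threads(n)
print(torch.get_num_threads())  # -> 8 unless the environment overrides it
```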
4 changes: 3 additions & 1 deletion egs_s2/LibriTTS/local/train_iter.sh
@@ -7,7 +7,9 @@ log_frequency=$4
dist_url=$5
dump_dir=$6

python3 ${BIN_DIR}/train.py \
omp_num=8

OMP_NUM_THREADS=${omp_num} python3 ${BIN_DIR}/train.py \
--config_file=${config_path} \
--train_semantic_path=${root_dir}/${dump_dir}/train/semantic_token.tsv \
--train_acoustic_path=${root_dir}/${dump_dir}/train/acoustic_token/hificodec.pth \
2 changes: 1 addition & 1 deletion egs_s2/LibriTTS/run.sh
@@ -61,5 +61,5 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh \
${config_path} ${train_output_path} ${ckpt_name} ${root_dir} \
${hubert_path} ${quantizer_path} ${prompt_wav_path} \
${S1_config_file} ${S1_ckpt_path} ${sil_token}|| exit -1
${S1_config_file} ${S1_ckpt_path} ${sil_token} || exit -1
fi
5 changes: 2 additions & 3 deletions egs_s2/LibriTTS/run_base_L7_km300.sh
@@ -24,8 +24,7 @@ hubert_path=pretrained_model/hubert/hubert_base_ls960.pt
quantizer_path=pretrained_model/hubert/train-clean-360_hubert_base_ls960_L7_km300.bin
dump_dir=dump_libritts_universal_hificodec
# for synthesize_e2e.sh
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_l
ibritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_libritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
S1_config_file='../../egs_s1/AR/LibriTTS/conf/base_L7bin300.yaml'
S1_ckpt_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/ar_s1/SoundStorm/exp/base_L7_km300/ckpt/epoch=99-step=49000.ckpt'
# 4 for 300 bins; you should modify this according to your own dump data
@@ -61,5 +60,5 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh \
${config_path} ${train_output_path} ${ckpt_name} ${root_dir} \
${hubert_path} ${quantizer_path} ${prompt_wav_path} \
${S1_config_file} ${S1_ckpt_path} ${sil_token}|| exit -1
${S1_config_file} ${S1_ckpt_path} ${sil_token} || exit -1
fi
5 changes: 2 additions & 3 deletions egs_s2/LibriTTS/run_base_L7_km300_iter.sh
@@ -24,8 +24,7 @@ hubert_path=pretrained_model/hubert/hubert_base_ls960.pt
quantizer_path=pretrained_model/hubert/train-clean-360_hubert_base_ls960_L7_km300.bin
dump_dir=dump_libritts_universal_hificodec
# for synthesize_e2e.sh
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_l
ibritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_libritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
S1_config_file='../../egs_s1/AR/LibriTTS/conf/base_L7bin300.yaml'
S1_ckpt_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/ar_s1/SoundStorm/exp/base_L7_km300/ckpt/epoch=99-step=49000.ckpt'
# 4 for 300 bins; you should modify this according to your own dump data
@@ -61,5 +60,5 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh \
${config_path} ${train_output_path} ${ckpt_name} ${root_dir} \
${hubert_path} ${quantizer_path} ${prompt_wav_path} \
${S1_config_file} ${S1_ckpt_path} ${sil_token}|| exit -1
${S1_config_file} ${S1_ckpt_path} ${sil_token} || exit -1
fi