Add dataloader for LibriLight 6k #34

Merged · 8 commits · Aug 11, 2023
3 changes: 2 additions & 1 deletion egs_s2/LJSpeech/conf/default.yaml
@@ -73,7 +73,8 @@ solver:

dataloader:
max_token_one_batch: 96000 # affects single-GPU memory usage; 96k for an 80G GPU (A100) (LJSpeech)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
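For context on the `num_workers` / `prefetch_factor` changes: both map directly onto `torch.utils.data.DataLoader` arguments, and `prefetch_factor` counts batches preloaded per worker, so 4 workers × 50 keeps up to 200 batches buffered. A minimal self-contained sketch with a toy dataset (not the repo's `SemanticDataset`):

```python
import torch
from torch.utils.data import DataLoader, Dataset


class ToyTokenDataset(Dataset):
    """Stand-in for the repo's SemanticDataset; yields fake token rows."""

    def __len__(self):
        return 1024

    def __getitem__(self, idx):
        # 100 fake semantic tokens drawn from a 300-entry codebook
        return torch.randint(0, 300, (100,))


if __name__ == "__main__":
    loader = DataLoader(
        ToyTokenDataset(),
        batch_size=8,
        num_workers=4,       # dataloader.num_workers in the config
        prefetch_factor=50,  # batches preloaded per worker: 4 x 50 = up to 200
    )
    for batch in loader:     # each batch: LongTensor of shape (8, 100)
        pass                 # a training step would consume `batch` here
```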
40 changes: 25 additions & 15 deletions egs_s2/LibriLight/conf/default.yaml
@@ -1,3 +1,4 @@
# config for LibriLight 6k (small + medium)
# 30k_basex1_hubert_L7km300
model:
target: soundstorm.s2.models.dalle_wav.dalle_wav.DALLE
@@ -27,6 +28,10 @@ model:
timestep_type: 'adalayernorm' # adainsnorm or adalayernorm and abs
mlp_hidden_times: 4
semantic_token_nums: 300
prompt_semantic_emb_len: 10 # should be > max_prompt_sec in the dataset
target_semantic_emb_len: 30 # should be > max_target_sec in the dataset
prompt_acoustic_emb_len: 10 # can be the same as prompt_semantic
target_acoustic_emb_len: 30 # can be the same as target_semantic
content_emb_config:
target: soundstorm.s2.models.dalle_wav.mask_embedding.DalleMaskImageEmbedding
params:
@@ -37,11 +42,11 @@ model:
pos_emb_type: embedding

solver:
base_lr: 0.3e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
save_epochs: 1
dev_epochs: 1
base_lr: 0.3e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_iters: 550000 # 550k iters on 8 GPUs for small + medium, ~70 epochs at ~7.8k iters/epoch, ~2.5 h of training per epoch
save_iters: 1500 # 1.5k; saving a ckpt costs ~0.3 h
dev_iters: 1500 # number of iters per GPU, assuming 8 GPUs here; double it when using 4 GPUs so the model sees the same number of samples
ema:
decay: 0.99
update_interval: 25
@@ -69,25 +74,30 @@ solver:
threshold: 1.0e-1
threshold_mode: rel
warmup_lr: 0.45e-3 # the lr reached after warmup
warmup: 800 # ~ 2 epochs
warmup: 800 # number of iters to warm up

dataloader:
max_token_one_batch: 30000 # affects single-GPU memory usage; 81k for an 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
- target: soundstorm.s2.data.semantic_dataset_librilight_6k.SemanticDataset
params:
codec_name: hificodec
num_quant: 4 # has no effect when != 4 for hificodec; can be 3 for soundstream and encodec
semantic_token_nums: 300 # 1000 for mhubert, 500 for en_hubert
semantic_path: dump/train/semantic_token.tsv
acoustic_path: dump/train/acoustic_token/hificodec.pth
semantic_token_nums: 300 # same as the number of kmeans bins
max_prompt_sec: 3 # same as LibriTTS
max_target_sec: 20 # LibriTTS uses 10; use 20 here for longer TTS
semantic_dirs: ['dump/small/train/']
acoustic_dirs: ['dump/small/train/acoustic_token/']

dev_datasets:
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
- target: soundstorm.s2.data.semantic_dataset_librilight_6k.SemanticDataset
params:
codec_name: hificodec
num_quant: 4 # has no effect when != 4 for hificodec; can be 3 for soundstream and encodec
semantic_token_nums: 300 # 1000 for mhubert, 500 for en_hubert
semantic_path: dump/dev/semantic_token.tsv
acoustic_path: dump/dev/acoustic_token/hificodec.pth
semantic_token_nums: 300 # same as the number of kmeans bins
max_prompt_sec: 3 # same as LibriTTS
max_target_sec: 20 # LibriTTS uses 10; use 20 here for longer TTS
semantic_dirs: ['dump/small/dev/']
acoustic_dirs: ['dump/small/dev/acoustic_token/']
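On `max_token_one_batch`: the batching code is not part of this diff, but the setting suggests a token-budget batcher that packs utterances until the padded batch would exceed the budget. A hypothetical sketch of that idea (function name and padded-cost model are assumptions, not the repo's implementation):

```python
# Greedily pack sample indices into batches so that the padded token count
# (longest sample x batch size) never exceeds max_tokens.
from typing import List


def pack_by_token_budget(lengths: List[int], max_tokens: int) -> List[List[int]]:
    """Group sample indices so each batch's padded size stays under max_tokens."""
    batches, current, max_len = [], [], 0
    for idx, length in enumerate(lengths):
        new_max = max(max_len, length)
        # padded cost = longest sample in the batch x number of samples
        if current and new_max * (len(current) + 1) > max_tokens:
            batches.append(current)
            current, new_max = [], length
        current.append(idx)
        max_len = new_max
    if current:
        batches.append(current)
    return batches


# With a 30k budget, short utterances share a batch while long ones
# end up nearly alone:
print(pack_by_token_budget([500, 800, 12000, 700], max_tokens=30000))
# -> [[0, 1], [2, 3]]
```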
93 changes: 0 additions & 93 deletions egs_s2/LibriLight/conf/small_medium_iter.yaml

This file was deleted.

1 change: 0 additions & 1 deletion egs_s2/LibriLight/local/test.sh

This file was deleted.

17 changes: 17 additions & 0 deletions egs_s2/LibriLight/local/test.sh
@@ -0,0 +1,17 @@
#!/bin/bash
# test with the test set

config_path=$1
train_output_path=$2
ckpt_name=$3
root_dir=$4
dump_dir=$5

python3 ${BIN_DIR}/test.py \
--config_file=${config_path} \
--ckpt_path=${root_dir}/${train_output_path}/checkpoint/${ckpt_name} \
--test_semantic_path=${root_dir}/${dump_dir}/small/test/semantic_token_0_3.tsv \
--test_acoustic_path=${root_dir}/${dump_dir}/small/test/acoustic_token/hificodec_0_3.pth \
--output_dir=${root_dir}/${train_output_path}/test_output \
--hificodec_model_path=pretrained_model/hificodec/HiFi-Codec-16k-320d-large-universal \
--hificodec_config_path=pretrained_model/hificodec/config_16k_320d.json
16 changes: 10 additions & 6 deletions egs_s2/LibriLight/local/train.sh
@@ -7,14 +7,18 @@ log_frequency=$4
dist_url=$5
dump_dir=$6

python3 ${BIN_DIR}/train_large.py \
omp_num=8

# Note: the *_dirs arguments must not be followed by '='
OMP_NUM_THREADS=${omp_num} python3 ${BIN_DIR}/train_librilight_6k.py \
--config_file=${config_path} \
--train_semantic_path=${root_dir}/${dump_dir}/train/semantic_token.tsv \
--train_acoustic_path=${root_dir}/${dump_dir}/train/acoustic_token/hificodec.pth \
--dev_semantic_path=${root_dir}/${dump_dir}/dev/semantic_token.tsv \
--dev_acoustic_path=${root_dir}/${dump_dir}/dev/acoustic_token/hificodec.pth \
--train_semantic_dirs ''${root_dir}'/'${dump_dir}'/small/train/' ''${root_dir}'/'${dump_dir}'/medium/train/' \
--train_acoustic_dirs ''${root_dir}'/'${dump_dir}'/small/train/acoustic_token/' ''${root_dir}'/'${dump_dir}'/medium/train/acoustic_token/' \
--dev_semantic_dirs ''${root_dir}'/'${dump_dir}'/small/dev/' ''${root_dir}'/'${dump_dir}'/medium/dev/' \
--dev_acoustic_dirs ''${root_dir}'/'${dump_dir}'/small/dev/acoustic_token/' ''${root_dir}'/'${dump_dir}'/medium/dev/acoustic_token/' \
--output=${root_dir}/${train_output_path} \
--log_frequency=${log_frequency} \
--dist_url=${dist_url} \
--hificodec_model_path=pretrained_model/hificodec/HiFi-Codec-16k-320d-large-universal \
--hificodec_config_path=pretrained_model/hificodec/config_16k_320d.json
--hificodec_config_path=pretrained_model/hificodec/config_16k_320d.json \
--train_with_iter=True
@@ -1,23 +1,24 @@
#!/bin/bash
# run_base_L7_km300
# train LibriLight 6k (small + medium) by default
set -e

source path.sh

gpus=0,1,2,3
stage=0
stop_stage=100
train_output_path='exp_librilight/small_medium'
train_output_path='exp_librilight/default'
# dir that holds part/all of the dumped dataset and experiment results
root_dir='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm'
# there should be *.wav, */*.wav or */*/*.wav in data_dir
data_dir='~/datasets/LibriLight'
config_path='conf/small_medium_iter.yaml'
config_path='conf/default.yaml'
log_frequency=1
# 'tcp://%s:%s' % (MASTER_ADDR, MASTER_PORT)
dist_url='tcp://127.0.0.1:29505'
# which checkpoint file to use for testing
ckpt_name='000301e_471119iter.pth'
ckpt_name='33000iter.pth'
# should be the same as ${layer} in hubert_kms.sh
layer=7
# should be the same as ${hubert_path} in hubert_kms.sh
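The switch to `--train_with_iter=True`, together with `max_iters`/`save_iters`/`dev_iters` and checkpoint names like `33000iter.pth`, implies an iteration-driven loop rather than an epoch-driven one. A hedged sketch of that control flow (the actual trainer in `train_librilight_6k.py` is not shown in this diff and may differ):

```python
def train_loop(loader, train_step, save_ckpt, run_dev,
               max_iters=550_000, save_iters=1_500, dev_iters=1_500):
    it = 0
    while it < max_iters:                    # stop on iterations, not epochs
        for batch in loader:                 # the loader is re-entered each epoch
            it += 1
            train_step(batch)                # one optimizer step
            if it % save_iters == 0:
                save_ckpt(f"{it}iter.pth")   # e.g. 33000iter.pth as in run.sh
            if it % dev_iters == 0:
                run_dev()                    # dev pass every dev_iters steps
            if it >= max_iters:
                return


if __name__ == "__main__":
    # tiny demo with scaled-down limits
    train_loop(range(8), lambda b: None,
               lambda name: print("saved", name), lambda: None,
               max_iters=20, save_iters=10, dev_iters=10)
```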
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx1_L7km300.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.3e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects single-GPU memory usage; 81k for an 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx1_L9km500.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.3e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects single-GPU memory usage; 81k for an 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx2_L10km1024.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.6e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects single-GPU memory usage; 81k for an 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
5 changes: 3 additions & 2 deletions egs_s2/LibriTTS/conf/30k_lrx2_L9km500.yaml
@@ -40,7 +40,7 @@ model:
solver:
base_lr: 0.6e-05 # 3.0e-6 x 8, because max_token_one_batch is x8 (the old value was 10k)
adjust_lr: none # do not adjust lr according to total batch_size
max_epochs: 500 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
max_epochs: 400 # 400 for LibriTTS (train-clean-100 + train-clean-360), 9.2k epochs for LJSpeech
save_epochs: 1
dev_epochs: 1
ema:
@@ -74,7 +74,8 @@ solver:

dataloader:
max_token_one_batch: 30000 # affects single-GPU memory usage; 81k for an 80G GPU (A100) (LibriTTS)
num_workers: 2
num_workers: 4
prefetch_factor: 50
train_datasets: # a list of configurations, so we can combine several schedulers
- target: soundstorm.s2.data.semantic_dataset.SemanticDataset
params:
4 changes: 3 additions & 1 deletion egs_s2/LibriTTS/local/train.sh
@@ -7,7 +7,9 @@ log_frequency=$4
dist_url=$5
dump_dir=$6

python3 ${BIN_DIR}/train.py \
omp_num=8

OMP_NUM_THREADS=${omp_num} python3 ${BIN_DIR}/train.py \
--config_file=${config_path} \
--train_semantic_path=${root_dir}/${dump_dir}/train/semantic_token.tsv \
--train_acoustic_path=${root_dir}/${dump_dir}/train/acoustic_token/hificodec.pth \
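Why `OMP_NUM_THREADS` is now pinned in the launch scripts: with `num_workers: 4`, each worker process inherits the intra-op thread setting, so leaving it at the PyTorch default (all cores) would oversubscribe the CPU. A minimal sketch of the equivalent in-process control:

```python
import os

import torch

# torch.set_num_threads controls intra-op parallelism directly; exporting
# OMP_NUM_THREADS in train.sh additionally covers spawned worker processes.
n = int(os.environ.get("OMP_NUM_THREADS", "8"))
torch.set_num_threads(n)
print(torch.get_num_threads())  # -> 8 unless the environment overrides it
```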
4 changes: 3 additions & 1 deletion egs_s2/LibriTTS/local/train_iter.sh
@@ -7,7 +7,9 @@ log_frequency=$4
dist_url=$5
dump_dir=$6

python3 ${BIN_DIR}/train.py \
omp_num=8

OMP_NUM_THREADS=${omp_num} python3 ${BIN_DIR}/train.py \
--config_file=${config_path} \
--train_semantic_path=${root_dir}/${dump_dir}/train/semantic_token.tsv \
--train_acoustic_path=${root_dir}/${dump_dir}/train/acoustic_token/hificodec.pth \
2 changes: 1 addition & 1 deletion egs_s2/LibriTTS/run.sh
@@ -61,5 +61,5 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh \
${config_path} ${train_output_path} ${ckpt_name} ${root_dir} \
${hubert_path} ${quantizer_path} ${prompt_wav_path} \
${S1_config_file} ${S1_ckpt_path} ${sil_token}|| exit -1
${S1_config_file} ${S1_ckpt_path} ${sil_token} || exit -1
fi
5 changes: 2 additions & 3 deletions egs_s2/LibriTTS/run_base_L7_km300.sh
@@ -24,8 +24,7 @@ hubert_path=pretrained_model/hubert/hubert_base_ls960.pt
quantizer_path=pretrained_model/hubert/train-clean-360_hubert_base_ls960_L7_km300.bin
dump_dir=dump_libritts_universal_hificodec
# for synthesize_e2e.sh
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_l
ibritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_libritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
S1_config_file='../../egs_s1/AR/LibriTTS/conf/base_L7bin300.yaml'
S1_ckpt_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/ar_s1/SoundStorm/exp/base_L7_km300/ckpt/epoch=99-step=49000.ckpt'
# 4 for 300 bins; you should modify this according to your own dump data
@@ -61,5 +60,5 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh \
${config_path} ${train_output_path} ${ckpt_name} ${root_dir} \
${hubert_path} ${quantizer_path} ${prompt_wav_path} \
${S1_config_file} ${S1_ckpt_path} ${sil_token}|| exit -1
${S1_config_file} ${S1_ckpt_path} ${sil_token} || exit -1
fi
5 changes: 2 additions & 3 deletions egs_s2/LibriTTS/run_base_L7_km300_iter.sh
@@ -24,8 +24,7 @@ hubert_path=pretrained_model/hubert/hubert_base_ls960.pt
quantizer_path=pretrained_model/hubert/train-clean-360_hubert_base_ls960_L7_km300.bin
dump_dir=dump_libritts_universal_hificodec
# for synthesize_e2e.sh
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_l
ibritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
prompt_wav_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/SoundStorm/dump_libritts_base_L9_km500/test/synthesize_input/1006_135212_000060_000004.wav'
S1_config_file='../../egs_s1/AR/LibriTTS/conf/base_L7bin300.yaml'
S1_ckpt_path='/nfs-speech-cpfs/dev/yuantian04/Vivid_TTS/SoundStorm/SoundStorm/ar_s1/SoundStorm/exp/base_L7_km300/ckpt/epoch=99-step=49000.ckpt'
# 4 for 300 bins; you should modify this according to your own dump data
@@ -61,5 +60,5 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh \
${config_path} ${train_output_path} ${ckpt_name} ${root_dir} \
${hubert_path} ${quantizer_path} ${prompt_wav_path} \
${S1_config_file} ${S1_ckpt_path} ${sil_token}|| exit -1
${S1_config_file} ${S1_ckpt_path} ${sil_token} || exit -1
fi