<a href="https://colab.research.google.com/github/usamireko/DiffSinger_colab_notebook_MLo7/blob/Stable/PC_NSF_NSF_hifigan_finetuning_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Setup**

In [None]:
#@title # Mount Google Drive and Setup with Python 3.10
# Setup cell: mounts Google Drive, makes `python3` resolve to Python 3.10,
# installs uv and the Python dependencies, clones the DiffSinger and
# SingingVocoders repositories, and unpacks the pretrained vocoder selected by
# the `pc_nsf` checkbox into /content/SingingVocoders/pretrained.
%cd /content
# Form switch: True downloads the pc_nsf_hifigan (2025.02) release, False the
# nsf_hifigan (2024.02) release. Later cells read this variable as well.
pc_nsf = True  # @param {"type":"boolean"}

from IPython.display import clear_output
import os
import shutil
import yaml
from google.colab import drive

# Mount Drive
drive.mount("/content/drive")

# Install Python 3.10
# update-alternatives registers python3.10 and selects it for the `python3`
# command used by the `!python3 ...` shell lines in this notebook.
# NOTE(review): presumably the notebook kernel keeps its own interpreter — confirm.
!sudo apt update -y
!sudo apt install -y python3.10 python3.10-distutils python3.10-venv
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 1
!sudo update-alternatives --set python3 /usr/bin/python3.10
!curl -sS https://bootstrap.pypa.io/get-pip.py | python3

# Install UV
!python3 -m pip install -U uv

# Add UV to PATH
# NOTE(review): uv is installed through pip above; /root/.cargo/bin is presumably
# where a standalone uv installer would place the binary — confirm it is still needed.
os.environ["PATH"] += ":/root/.cargo/bin"

# Verify versions
!python3 --version
!uv --version

# Clone Repos
# The cell runs from /content, so both repositories are cloned into /content.
!rm -rf /content/sample_data
!git clone https://github.com/openvpi/DiffSinger.git
!git clone https://github.com/openvpi/SingingVocoders

# Install dependencies
!uv pip install torch torchaudio torchvision --index-url https://download.pytorch.org/whl/cu121
!uv pip install click einops h5py librosa lightning matplotlib mido numpy praat-parselmouth preprocessing pyworld PyYAML torchmetrics tqdm tensorboard tensorboardX

# Download pretrained models
# Each branch downloads one release archive into the current directory
# (/content), extracts it into SingingVocoders/pretrained and deletes the zip.
!apt-get install -y aria2
if pc_nsf:
  !aria2c https://github.com/openvpi/SingingVocoders/releases/download/v1.0.0/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip
  !mkdir -p /content/SingingVocoders/pretrained
  !7z x "/content/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip" -o/content/SingingVocoders/pretrained
  !rm "/content/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.zip"
else:
  !aria2c https://github.com/openvpi/SingingVocoders/releases/download/v0.0.2/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip
  !mkdir -p /content/SingingVocoders/pretrained
  !7z x "/content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip" -o/content/SingingVocoders/pretrained
  !rm "/content/nsf_hifigan_44.1k_hop512_128bin_2024.02.zip"

# Disabled RMVPE download, kept in case it is needed in the future
#!aria2c https://github.com/openvpi/DiffSinger/releases/download/v2.1.0/rmvpe.zip
#!7z x /content/rmvpe.zip -o/content/SingingVocoders/pretrained
#!rm /content/rmvpe.zip
# Clears everything printed above (including any install errors) before
# reporting completion.
clear_output()
print("✔️ Setup complete")


# **Preprocess data for training**

In [None]:
import re
import soundfile as sf
import librosa
import os
import numpy as np
import concurrent.futures

# Path to zip file containing your audio data
data_zip_path = ""  # @param {type:"string"}

# Segment interval in seconds
segment_interval = 10  # @param {type:"slider", min:2, max:60, step:1}

train_path = "/content/audio_data/input"
npz_path = "/content/audio_data/output"

# Clean up and create directories if needed
!rm -rf /content/audio_data >/dev/null 2>&1

if not os.path.exists(train_path):
    os.makedirs(train_path)
    os.makedirs(npz_path)

# Extract WAV files from the zip archive
!7z e "$data_zip_path" -o{train_path} "*.wav" -r

# Function to resample and segment audio
def resample_and_convert_audio(audio_path, sample_rate=44100, keep_original=False):
    """Resample one audio file and, if it is long, split it into segments.

    The file is loaded at its native rate and resampled to ``sample_rate`` when
    the rates differ. Files no longer than the module-level ``segment_interval``
    (seconds) are overwritten in place with the resampled audio. Longer files
    are written out as ``<stem>_segment_<i>.wav`` files in the same directory.

    Args:
        audio_path: Path to the source audio file.
        sample_rate: Target sample rate in Hz for every file written.
        keep_original: When False (default) the source file is deleted after it
            has been segmented, so the folder does not end up holding both the
            un-resampled full recording and its segments. Pass True to leave
            the source file untouched.
    """
    audio, sr = librosa.load(audio_path, sr=None)
    duration = librosa.get_duration(y=audio, sr=sr)

    if sr != sample_rate:
        audio = librosa.resample(y=audio, orig_sr=sr, target_sr=sample_rate)

    file_name = os.path.basename(audio_path)

    # Short file: just replace it with the resampled version.
    if duration <= segment_interval:
        sf.write(audio_path, audio, sample_rate)
        print(f"Resampled {file_name} to {sample_rate} Hz.")
        return

    samples_per_segment = int(segment_interval * sample_rate)
    total_segments = int(np.ceil(duration / segment_interval))
    stem = os.path.splitext(file_name)[0]
    out_dir = os.path.dirname(audio_path)

    for segment in range(total_segments):
        start_sample = samples_per_segment * segment
        # Slicing clips at the end of the array, so the last segment is simply
        # shorter; no manual clamp is needed.
        segment_audio = audio[start_sample:start_sample + samples_per_segment]

        segment_filename = f"{stem}_segment_{segment}.wav"
        segment_path = os.path.join(out_dir, segment_filename)
        sf.write(segment_path, segment_audio, sample_rate)

    # The segments now carry all of the audio at the target rate; drop the
    # source only after every segment has been written successfully.
    if not keep_original:
        os.remove(audio_path)

    print(f"Resampled {file_name} to {sample_rate} Hz.")
    print(f"Segmented {file_name} into {total_segments} parts.")


def main():
    """Resample/segment every .wav under ``train_path`` in parallel.

    The file list is collected in full before any work starts, so segment files
    written during processing are not picked up again. Each file is handled by
    ``resample_and_convert_audio`` in a worker process; a failure on one file is
    reported and does not stop the remaining files.
    """
    audio_files = [
        os.path.join(root, file)
        for root, _dirs, files in os.walk(train_path)
        for file in files
        if file.endswith(".wav")
    ]

    if not audio_files:
        print(f"No .wav files found in {train_path}.")
        return

    failed = []
    with concurrent.futures.ProcessPoolExecutor() as executor:
        # submit() + result() is used instead of an unconsumed executor.map():
        # exceptions raised in a worker only surface when the result is read.
        futures = {
            executor.submit(resample_and_convert_audio, path): path
            for path in audio_files
        }
        for future in concurrent.futures.as_completed(futures):
            path = futures[future]
            try:
                future.result()
            except Exception as exc:
                failed.append(path)
                print(f"Failed to process {os.path.basename(path)}: {exc}")

    if failed:
        print(f"{len(failed)} of {len(audio_files)} file(s) could not be processed.")

if __name__ == '__main__':
    main()

In [None]:
#@title # Edit Config
#@markdown ___

# Rewrites /content/SingingVocoders/configs/ft_hifigan.yaml in place with the
# values chosen in the form below. `pc_nsf` is read from the setup cell.
import yaml
import os
import torch
#@markdown Model's name and save path
exp_name = "" # @param {type:"string"}
save_path = "" # @param {type:"string"}


#@markdown Pitch extractor algorithm
f0_ext = "parselmouth" # @param ["parselmouth", "harvest"]
f0_min = 40 # @param {type:"slider", min:0, max:250, step:2}
f0_max = 1200 # @param {type:"slider", min:800, max:4180, step:20}


#@markdown Precision option
precision = "16-mixed" # @param ["32-true", "bf16-mixed", "16-mixed"]

#@markdown Data aug option
data_aug = True # @param {type:"boolean"}
data_aug_probability = 0.5 # @param {type:"slider", min:0.1, max:3, step:0.1}

#@markdown Amount of validation files you want to use (can't exceed the amount of train files)
val_amount = 6 # @param {type:"slider", min:1, max:18, step:1}

#@markdown Path to the base model for fine tuning | leave blank to use the default ckpt
finetune_ckpt_path = "" # @param {type:"string"}



# A user-supplied checkpoint wins; otherwise use the pretrained checkpoint that
# the setup cell unpacked for the selected vocoder type.
if finetune_ckpt_path:
    finetune_ckpt = finetune_ckpt_path
elif pc_nsf:
    finetune_ckpt = "/content/SingingVocoders/pretrained/pc_nsf_hifigan_44.1k_hop512_128bin_2025.02.ckpt"
else:
    finetune_ckpt = "/content/SingingVocoders/pretrained/nsf_hifigan_44.1k_hop512_128bin_2024.02.ckpt"



#@markdown Learning rate of the discriminator and generator models
learning_rate = 0.00001 # @param {type:"slider", min:0.00001, max:0.0005, step:0.00001}

with open("/content/SingingVocoders/configs/ft_hifigan.yaml", "r") as config:
    ew = yaml.safe_load(config)
ew["data_input_path"] = ["/content/audio_data/input"]
ew["data_out_path"] = [save_path + "/data"]
ew["val_num"] = val_amount
ew["pe"] = f0_ext
ew["f0_min"] = f0_min
ew["f0_max"] = f0_max
ew["DataIndexPath"] = save_path
ew["finetune_ckpt_path"] = finetune_ckpt
ew["discriminate_optimizer_args"]["lr"] = learning_rate
ew["generater_optimizer_args"]["lr"] = learning_rate
ew["mel_base"] = "e" #for the diffsinger thingy ig
if pc_nsf:
    if torch.cuda.is_available():
        device = torch.cuda.current_device()
        gpu_name = torch.cuda.get_device_name(device)
        # Test the detected name itself. The previous `if 'A100' or 'Tesla L4':`
        # checked a non-empty string literal, which is always true, so the T4
        # values below were applied on every GPU.
        if "A100" in gpu_name or "L4" in gpu_name:
            ew["crop_mel_frames"] = 48
            ew["batch_size"] = 10
            ew["pc_aug_rate"] = 0.5
        else:
            # T4, and any GPU that is not recognised, gets the smaller crop.
            ew["crop_mel_frames"] = 32
            ew["batch_size"] = 10
            ew["pc_aug_rate"] = 0.4
else:
    ew["pc_aug"] = False

# Always write the switch: this file is rewritten in place, so skipping the
# assignment when the box is unchecked would leave a previously written `true`.
ew["key_aug"] = data_aug
if data_aug:
    ew["key_aug_prob"] = data_aug_probability
ew["pl_trainer_accelerator"] = "gpu"
ew["pl_trainer_precision"] = precision
with open("/content/SingingVocoders/configs/ft_hifigan.yaml", "w") as config:
    yaml.dump(ew, config)

print("\n")
print("Configs applied!")

In [None]:
#@title # Preprocess
#@markdown ___

# Run the SingingVocoders preprocessing script from inside its repository,
# using the ft_hifigan.yaml config that the "Edit Config" cell rewrote.
%cd /content/SingingVocoders
# NOTE(review): the meaning of `--strx 1` is defined by process.py, which is not
# visible here — confirm before changing it.
!python3 process.py --config /content/SingingVocoders/configs/ft_hifigan.yaml --strx 1
# Return to /content afterwards.
%cd /content

# **Training**

In [None]:
import re
import os
import yaml
os.environ['MPLBACKEND'] = 'Agg'

%cd /content/SingingVocoders
#@title # Training
#@markdown ___
#@markdown Change config_path to path of the config.yaml for resuming | leave blank for training from scratch
config_path = "" # @param {type:"string"}
resume_training = False # @param {type:"boolean"}

#@markdown Model save interval
save_interval = 500 # @param {type:"slider", min:100, max:10000, step:100}
save_interval = int(save_interval / 2)

if config_path:
    config_path = config_path
else:
    config_path = "/content/SingingVocoders/configs/ft_hifigan.yaml"

training_utils_path = "/content/SingingVocoders/utils/training_utils.py"
with open(training_utils_path, "r") as f:
    edit_relative_path = f.read()
new_relative = "relative_path = filepath.relative_to(Path('/content').resolve())"
pattern = r"relative_path\s*=\s*.*"
edit_relative_path = re.sub(pattern, new_relative, edit_relative_path)
with open(training_utils_path, "w") as f:
    f.write(edit_relative_path)

with open(config_path, "r") as config:
    bitch = yaml.safe_load(config)
bitch["val_check_interval"] = save_interval #questionable
with open(config_path, "w") as config:
    yaml.dump(bitch, config)

if resume_training:
    exp_name = os.path.basename(os.path.dirname(config_path))
    save_path = os.path.dirname(os.path.dirname(config_path))
    log = save_path + "/" + exp_name
else:
    log = save_path + "/" + exp_name

logdir = log
%reload_ext tensorboard
%tensorboard --logdir {logdir}
!python3 train.py --config {config_path} --exp_name {exp_name} --work_dir {save_path}

# **ONNX export for OpenUtau (OU) usage**

In [None]:
import os
%cd /content/SingingVocoders
# Paths
ckpt_path = "" # @param {type:"string"}
ckpt_folder = os.path.dirname(ckpt_path)
ckpt_config = ckpt_folder + "/config.yaml"
name = "" # @param {type:"string"}
export_path = "" # @param {type:"string"}
save_path =  export_path + "/model.ckpt"

!uv pip install -r /content/DiffSinger/requirements-onnx.txt
clear_output()
# Export!
!python3 export_ckpt.py --ckpt_path {ckpt_path} --save_path {save_path}

!python3 /content/DiffSinger/scripts/export.py nsf-hifigan \
    --config "{ckpt_config}" \
    --name "{name}" \
    --ckpt "{save_path}" \
    --out "{ckpt_folder}"
