<a href="https://colab.research.google.com/github/thisismyracle/py-sovits-song-cover/blob/main/SOVITS_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title 0. Install dependencies

!nvidia-smi

!pip install yt_dlp
!pip install ffmpeg
!python3 -m pip install -U demucs
!python -m pip install -U pip wheel
!pip install pydub
%pip install -U so-vits-svc-fork
%pip install -U ipython

In [None]:
#@title 1. Mount Google Drive

# Mount the user's Google Drive at /content/drive; later cells keep the
# so-vits-svc-fork model folder under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
#@title 2. Initialize

def str_escape_space(text: str):
    """Return ``text`` with every run of whitespace replaced by one underscore.

    Leading and trailing whitespace is discarded, so padding never turns into
    an underscore at either end of the result.
    """
    words = text.split()
    return '_'.join(words)

import os

# Folder names shared by the rest of the notebook. YOUTUBE_AUDIO_PATH is relative,
# so it is resolved against the current working directory.
BASE_PATH = 'content'
YOUTUBE_AUDIO_PATH = 'audio'
SOVITS_MODEL_PATH = '/content/drive/MyDrive/so-vits-svc-fork'

# Create the folders with os.makedirs instead of `!mkdir`: a failing shell command
# does not raise a Python exception, so wrapping `!mkdir` in try/except can never
# report a problem and 'OK' would be printed regardless of the outcome.
try:
    for folder in (YOUTUBE_AUDIO_PATH, SOVITS_MODEL_PATH, 'drive/MyDrive/so-vits-svc-fork'):
        # exist_ok=True mirrors `mkdir -p`: parents are created, existing folders are fine.
        os.makedirs(folder, exist_ok=True)
    print('OK')
except OSError as e:
    print(e)


In [None]:
#@title 3a. Download YouTube audio as WAV (voice lines)

from __future__ import unicode_literals
import yt_dlp
import ffmpeg
import sys

voice_name = 'fischl' #@param {type: 'string'}
# Normalize the name typed into the form (whitespace -> underscores); it is used in
# file and folder names below. The form value lives in `voice_name`.
voice_name = str_escape_space(voice_name)
url = 'https://www.youtube.com/watch?v=XPJ9Dt7mV5w&t=12s&ab_channel=Trashy-Kun' #@param {type:'string'}

# Download the best available audio stream and have the FFmpegExtractAudio
# post-processor convert it to WAV. The output template has no extension.
ydl_options = {
    'format': 'bestaudio/best',
    'postprocessors': [
        {
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav'
        }
    ],
    'outtmpl': f'{YOUTUBE_AUDIO_PATH}/{voice_name}'
}

# Stays empty when the download fails, so later cells can tell nothing was fetched.
voice_path = ''

try:
    # The context manager lets yt-dlp release its resources when the download ends.
    with yt_dlp.YoutubeDL(ydl_options) as ydl:
        ydl.download([url])
    voice_path = f'{YOUTUBE_AUDIO_PATH}/{voice_name}.wav'

    print('OK')
except Exception as e:
    print(e)


In [None]:
#@title 3b. Locate your own voice recording

# Alternative to step 3a: point the notebook at a recording that is already on disk.
voice_name = 'myracle' #@param {type: 'string'}
# Replace whitespace in the name with underscores (see str_escape_space).
voice_name = str_escape_space(voice_name)
# Path of the recording; running this cell overrides any voice_path set by step 3a.
voice_path  = '/content/audio/myracle.wav' #@param {type: 'string'}

In [None]:
#@title 4. Remove background noise from the voice

import subprocess

# Run Demucs in two-stem mode (vocals vs. everything else) on the selected recording.
# The arguments are given as a list so a voice_path containing spaces remains a
# single argument instead of being torn apart by str.split().
command = ['demucs', '--two-stems=vocals', voice_path]
result = subprocess.run(
    command,
    stdout=subprocess.PIPE,
    # Merge stderr into the captured output; otherwise error messages never reach
    # the notebook.
    stderr=subprocess.STDOUT,
)

print(result.stdout.decode(errors='replace'))

# Stop here on failure: the next step needs the separated vocals file.
if result.returncode != 0:
    raise RuntimeError(f'demucs exited with status {result.returncode}; see the output above.')

In [None]:
#@title 5. Split the voice into pieces

# Per-voice folder under dataset_raw/ (a relative path, so it is created beneath the
# current working directory).
dataset_path = f'dataset_raw/{voice_name}'

# -p: create missing parent folders and do not complain if the folder already exists.
!mkdir -p {dataset_path}

from datetime import datetime, timedelta
from scipy.io import wavfile
from tqdm import tqdm
import argparse
import json
import numpy as np
import os

def get_time(sec):
    """Format a number of seconds as an ``HH:MM:SS.001`` timestamp string.

    Only the whole hours, minutes and seconds of ``sec`` are used; the
    millisecond field is always the literal ``001``. A negative ``sec`` returns
    the integer ``0`` rather than a string.
    """
    if sec < 0:
        return 0

    # Offsetting a fixed midnight lets datetime split the duration into h/m/s.
    moment = datetime(1, 1, 1) + timedelta(seconds=float(sec))
    return f'{moment.hour:02d}:{moment.minute:02d}:{moment.second:02d}.001'

def get_total_time(sec):
    """Format a number of seconds as ``H:M:S`` without zero padding.

    Fractions of a second are not shown.
    """
    moment = datetime(1, 1, 1) + timedelta(seconds=float(sec))
    parts = (moment.hour, moment.minute, moment.second)
    return ':'.join(str(part) for part in parts)

def get_windows(signal, window_size, step_size):
    """Yield successive fixed-size windows (slices) of ``signal``.

    Windows start at offsets 0, step_size, 2 * step_size, ... and each one
    holds exactly ``window_size`` items. Iteration stops at the first window
    that would extend past the end of ``signal``; a window that ends exactly on
    the last item is still yielded.

    Args:
        signal: Any sliceable sequence with a length (list, NumPy array, ...).
        window_size: Number of items per window.
        step_size: Distance between the starts of consecutive windows.

    Yields:
        ``signal[start:start + window_size]`` for each window start.
    """
    signal_len = len(signal)
    for i_start in range(0, signal_len, step_size):
        i_end = i_start + window_size

        # Strictly greater: when i_end == signal_len the slice is still a full
        # window, so it must not be dropped.
        if i_end > signal_len:
            break

        yield signal[i_start:i_end]

def get_energy(samples):
    """Return the sum of squared values in ``samples`` divided by ``len(samples)``.

    The sum runs over every element, while the divisor is the length of the
    first axis only.
    """
    squared = np.power(samples, 2.)
    count = float(len(samples))
    return np.sum(squared) / count

def get_rising_edges(binary_signal):
    """Yield each index at which ``binary_signal`` turns from falsy to truthy.

    The value before the first item is treated as falsy, so a signal that
    starts truthy yields index 0.
    """
    previous = 0
    for position, current in enumerate(binary_signal):
        if current and not previous:
            yield position
        previous = current



# Demucs names its output folder after the input file's stem, so derive the folder
# from voice_path instead of assuming the recording is called <voice_name>.wav.
track_name = os.path.splitext(os.path.basename(voice_path))[0]
input_path = f'/content/separated/htdemucs/{track_name}/vocals.wav'

# Write the clips into the per-voice folder created at the top of this cell.
# The previous target (/content/dataset/<voice_name>) was never created, so every
# write into it failed.
output_dir = dataset_path
os.makedirs(output_dir, exist_ok=True)

min_silence_len = 0.6            # seconds; also used as the analysis window length
silence_threshold = 1e-4         # fraction of the maximum possible energy
step_duration = 0.003            # seconds between the starts of consecutive windows
window_duration = min_silence_len
output_path_prefix = os.path.splitext(os.path.basename(input_path))[0]
# Set to True to compute the cut points and the JSON index without writing any clip.
is_dry_run = False

print('Splitting {} where energy is below {}% for {}s long.'.format(
    input_path,
    silence_threshold * 100.,
    window_duration
))



# mmap=True avoids loading the whole file into memory at once.
sample_rate, samples = wavfile.read(input_path, mmap=True)

# Integer PCM has a dtype-defined peak; floating-point WAV data is nominally in
# [-1.0, 1.0], and np.iinfo would raise on it.
if np.issubdtype(samples.dtype, np.integer):
    max_amplitude = np.iinfo(samples.dtype).max
else:
    max_amplitude = 1.0
max_energy = get_energy([max_amplitude])
print(f'Max amplitude: {max_amplitude} Hz')
print(f'Max energy   : {max_energy * 100.}%')


window_size = int(window_duration * sample_rate)
step_size = int(step_duration * sample_rate)
signal_windows = get_windows(samples, window_size, step_size)

samples_len = len(samples)
# The total is only an estimate used for the progress bar.
tqdm_signal_windows = tqdm(signal_windows, total=int(samples_len / float(step_size)))
window_energy = (get_energy(w) / max_energy for w in tqdm_signal_windows)

# Despite its name, this flag is True for windows whose energy is ABOVE the
# threshold; a rising edge therefore marks the point where sound resumes.
window_silence = (e > silence_threshold for e in window_energy)
rising_edges = get_rising_edges(window_silence)
cut_times = (r * step_duration for r in rising_edges)



print('Finding silences...')
# Everything above is lazy; building this list is what actually scans the audio.
cut_samples = [int(t * sample_rate) for t in cut_times]
# Close the last segment at the true end of the audio. A -1 sentinel would drop the
# final sample and turn the last end time in the JSON into '0'.
cut_samples.append(samples_len)

cut_samples_len = len(cut_samples)
cut_ranges = [(i, cut_samples[i], cut_samples[i+1]) for i in range(cut_samples_len - 1)]

# Segment index -> [start, end] timestamps, saved as JSON next to the clips.
video_sub = {str(i): [str(get_time(cut_samples[i] / sample_rate)),
                      str(get_time(cut_samples[i+1] / sample_rate))]
             for i in range(cut_samples_len - 1)}

tqdm_cut_ranges = tqdm(cut_ranges)
for i, start, stop in tqdm_cut_ranges:
    output_path = '{}_{:03d}.wav'.format(os.path.join(output_dir, output_path_prefix), i)

    if not is_dry_run:
        print(f'Writing file {output_path}')
        wavfile.write(output_path, sample_rate, samples[start:stop])

with open(f'{output_dir}/{output_path_prefix}.json', 'w') as output:
    json.dump(video_sub, output)

print('OK')

In [None]:
#@title 6. Training preparation

import os

SOVITS_MODEL_PATH = '/content/drive/MyDrive/so-vits-svc-fork'
NEW_SOVITS_MODEL_PATH = f'{SOVITS_MODEL_PATH}/{voice_name}'

if not os.path.exists(SOVITS_MODEL_PATH):
    !mkdir {SOVITS_MODEL_PATH}

if not os.path.exists(NEW_SOVITS_MODEL_PATH):
    !mkdir {NEW_SOVITS_MODEL_PATH}

f0_method = 'dio' #@param ['crepe', 'crepe-tiny', 'dio', 'harvest', 'parselmouth']

!svc pre-resample
!svc pre-config
!cp configs/44k/config.json {NEW_SOVITS_MODEL_PATH}
!svc pre-hubert -fm {f0_method}

In [None]:
#@title 7. Training

# Load the TensorBoard notebook extension and open it on this voice's log folder.
%load_ext tensorboard
%tensorboard --logdir {NEW_SOVITS_MODEL_PATH}/logs/44k
# Start training; --model-path is the same folder TensorBoard is pointed at above.
!svc train --model-path {NEW_SOVITS_MODEL_PATH}/logs/44k