<a href="https://colab.research.google.com/github/thisismyracle/py-sovits-song-cover/blob/main/SOVITS_Inference.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#@title 0. Install dependencies

!nvidia-smi

!pip install yt_dlp
!pip install ffmpeg
!python3 -m pip install -U demucs
!python -m pip install -U pip wheel
!pip install pydub
%pip install -U so-vits-svc-fork
%pip install -U ipython

In [None]:
#@title 1. Mount to Google Drive

from google.colab import drive

# Mount Google Drive at the location the later cells expect
# (cell 5b reads model files from below /content/drive).
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
#@title 2. Initialize

def str_escape_space(text: str) -> str:
    """Join the whitespace-separated words of ``text`` with underscores.

    Runs of whitespace collapse into a single underscore, and leading or
    trailing whitespace is dropped, so the result contains no whitespace.

    Args:
        text: Arbitrary string, e.g. a song title typed into a form field.

    Returns:
        The words of ``text`` joined by ``'_'``; an empty string if ``text``
        is empty or whitespace only.
    """
    words = text.split()
    return '_'.join(words)

import os

# Shared configuration read by the later cells.
# BASE_PATH has no leading slash; the cells that use it prepend '/' themselves.
BASE_PATH = 'content'
# Folder (relative to the working directory) for audio downloaded from YouTube.
YOUTUBE_AUDIO_PATH = 'audio'
# Counter used to number the generated cover files; cell 6 increments it.
COVER_ID = 0
# Folder (relative to the working directory) for downloaded so-vits models.
TEMP_SOVITS_MODEL_PATH = 'sovits_model'

# Create the working folders with os.makedirs rather than `!mkdir`: a shell
# escape never raises a Python exception, so a try/except around it could not
# detect a failure and 'OK' would be printed unconditionally.
try:
    for _work_dir in (YOUTUBE_AUDIO_PATH, TEMP_SOVITS_MODEL_PATH):
        os.makedirs(_work_dir, exist_ok=True)
    print('OK')
except OSError as e:
    print(e)


In [None]:
#@title 3. Download Youtube WAV

from __future__ import unicode_literals
import yt_dlp
import ffmpeg
import sys

# The title is normalised with str_escape_space so the file name (and every
# path built from it in later cells) contains no whitespace.
song_title = 'haiiro to ao' #@param {type: 'string'}
song_title = str_escape_space(song_title)
url = 'https://www.youtube.com/watch?v=9nCkUgTO5x4&ab_channel=kobasolo' #@param {type:'string'}

ydl_options = {
    # Prefer an audio-only stream; fall back to the best combined stream.
    'format': 'bestaudio/best',
    # Convert the downloaded stream to WAV after the download finishes.
    'postprocessors': [
        {
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav'
        }
    ],
    # '%' introduces a field in yt-dlp output templates, so a literal percent
    # sign in the title has to be doubled to end up in the file name as-is.
    'outtmpl': f'{YOUTUBE_AUDIO_PATH}/{song_title}'.replace('%', '%%')
}

try:
    # The context manager closes the downloader even when the download fails.
    with yt_dlp.YoutubeDL(ydl_options) as ydl:
        ydl.download([url])
    print('OK')
except Exception as e:
    # Best effort: report the problem instead of aborting the notebook run.
    print(e)


In [None]:
#@title 4. Separate vocal and instrument

import subprocess

# WAV file produced by the download cell.
audio_path = f'/{BASE_PATH}/{YOUTUBE_AUDIO_PATH}/{song_title}.wav'

# Pass the arguments as a list instead of splitting a command string, so the
# path stays a single argument even if it ever contains whitespace.
command = ['demucs', '--two-stems=vocals', audio_path]
result = subprocess.run(command, stdout=subprocess.PIPE)

print(result.stdout.decode())

# Only stdout is captured above; make a failed separation visible so the
# later cells are not run against missing stem files without any hint.
if result.returncode != 0:
    print(f'demucs exited with status {result.returncode}')

In [None]:
#@title 5a. Inference with pretrained model (download)

import os
from IPython.display import Audio

voice_name = 'alice' #@param {type: 'string'}
voice_name = str_escape_space(voice_name)
model_pth_url = 'https://huggingface.co/spaces/zomehwh/sovits-models/resolve/main/models/alice/alice.pth' #@param{type: 'string'}
model_config_url = 'https://huggingface.co/spaces/zomehwh/sovits-models/resolve/main/models/alice/config.json' #@param{type: 'string'}
model_name = model_pth_url.split('/')[-1].split('.')[0]

if not (os.path.isfile(f'/{BASE_PATH}/{TEMP_SOVITS_MODEL_PATH}/alice/alice.pth') or os.path.isfile(f'/{BASE_PATH}/{TEMP_SOVITS_MODEL_PATH}/alice/alice.json')):
    !wget -N 'https://huggingface.co/spaces/zomehwh/sovits-models/resolve/main/models/alice/alice.pth' -P /{BASE_PATH}/{SOVITS_MODEL_PATH}/{model_name}/
    !wget -N 'https://huggingface.co/spaces/zomehwh/sovits-models/resolve/main/models/alice/config.json' -P /{BASE_PATH}/{SOVITS_MODEL_PATH}/{model_name}/

audio_path = f'/content/separated/htdemucs/{song_title}/vocals'
model_path = f'/{BASE_PATH}/{TEMP_SOVITS_MODEL_PATH}/{model_name}/alice.pth'
config_path = f'/{BASE_PATH}/{TEMP_SOVITS_MODEL_PATH}/{model_name}/config.json'
pitch = 0 #@param {type: 'integer'}

!svc infer {audio_path}.wav -c {config_path} -m {model_path} -na -t {pitch}

#@markdown Change according your voice tone and the original singer's tone.

#@markdown if your tone are SAME with the original, pitch = 0,

#@markdown if your voice are HIGHER than the original, pitch = -12

#@markdown if your voice are LOWER than the original, pitch = 12


In [None]:
#@title 5b. Inference with your own model (local)

import os
from IPython.display import Audio

voice_name = 'fischl' #@param {type: 'string'}
voice_name = str_escape_space(voice_name)
audio_path = f'/content/separated/htdemucs/{song_title}/vocals'
model_path = '/content/drive/MyDrive/so-vits-svc-fork/fischl/logs/44k/G_100.pth' #@param {type:'string'}
config_path = '/content/drive/MyDrive/so-vits-svc-fork/fischl/logs/44k/config.json' #@param {type:'string'}
pitch = 0 #@param {type: 'integer'}

!svc infer {audio_path}.wav -c {config_path} -m {model_path} -na -t {pitch}

#@markdown Change according your voice tone and the original singer's tone.

#@markdown if your voice are SAME with the original, pitch = 0,

#@markdown if your voice are LOWER than the original, pitch = -12

#@markdown if your voice are HIGHER than the original, pitch = 12


In [None]:
#@title 6. Combine vocal AI with instrument

from pydub import AudioSegment

# Each run of this cell gets its own number, so earlier covers are kept.
COVER_ID += 1

# Folder holding the stems of the current song.
stem_dir = f'/content/separated/htdemucs/{song_title}'
vocal_path = f'{stem_dir}/vocals.out.wav'
instrument_path = f'{stem_dir}/no_vocals.wav'

# Output path without extension; the mp3 conversion cell reuses it.
cover_filename = f'/{BASE_PATH}/{voice_name}-{song_title}-{COVER_ID}'

vocal_track = AudioSegment.from_file(vocal_path)
instrument_track = AudioSegment.from_file(instrument_path)

# Mix the instrumental onto the converted vocals and write the result as WAV.
combined = vocal_track.overlay(instrument_track)
combined.export(f'{cover_filename}.wav', format='wav')

In [None]:
#@title 7. Convert wav to mp3 (optional)

# Source and destination share the base name chosen in the combine cell.
cover_path = f'{cover_filename}.wav'
cover_path_mp3 = f'{cover_filename}.mp3'

# Load the finished WAV cover, then re-encode it as MP3 next to it.
cover_audio = AudioSegment.from_wav(cover_path)
cover_audio.export(cover_path_mp3, format='mp3')