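This Colab notebook runs singing-voice conversion inference with a [so-vits-svc-fork-4.0](https://github.com/34j/so-vits-svc-fork) model: it downloads a song, separates the vocals from the accompaniment, converts the vocals, and mixes the result back with the accompaniment.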

# Preparations

In [None]:
#@title Check GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

In [None]:
#@title Install dependencies
#@markdown pip may fail to resolve dependencies and raise ERROR, but it can be ignored.
# Upgrade the packaging tools first, then install the command-line tools that the
# later cells invoke (svc, demucs, yt-dlp).
!python -m pip install -U pip wheel
# %pip install -U ipython
%pip install -U so-vits-svc-fork
%pip install -U demucs
# !apt install ffmpeg
%pip install yt-dlp
# subprocess is used by the YouTube download cell to query the downloaded file name.
import subprocess
i = 1 # counter used to number the saved copies of each processed song (song1, song2, ...); re-running this cell resets it to 1
# Audio/display are used by the playback cell at the end of the notebook.
from IPython.display import Audio, display

In [None]:
#@title Download pretrained model
# Model of Chen Zhuoxuan is available at https://huggingface.co/melicat/so-vits-svc-4.0/tree/main/ChenZhuoxuan
# Fetch the model checkpoint and its config into the current working directory;
# the Inference cell reads them from /content/G_30400.pth and /content/config.json.
!wget -N 'https://huggingface.co/melicat/so-vits-svc-4.0/resolve/main/ChenZhuoxuan/G_30400.pth'
!wget -N 'https://huggingface.co/melicat/so-vits-svc-4.0/resolve/main/ChenZhuoxuan/config.json'

In [None]:
# If the song is already on your computer, upload it to the session storage, or uncomment the lines below to mount Google Drive.
# from google.colab import drive
# drive.mount('/content/drive')

# Infer a song

In [None]:
#@title Download input song from YouTube
YouTubeLink = 'https://youtu.be/FzZSADo_uxA' #@param {'type': 'string'}
#@markdown Singing clips with very simple accompaniment (e.g. only one guitar or piano) are preferred.
!yt-dlp {YouTubeLink}
videoName = subprocess.getoutput(f'yt-dlp --print filename {YouTubeLink}')
!ffmpeg -y -i "{videoName}" -ar 44100 song.mp3

In [None]:
#@title Separation into vocals/accompaniment
#@markdown Here, we use [demucs](https://github.com/facebookresearch/demucs) to isolate the vocals. For GUI and potentially better separation, use [UVR](https://ultimatevocalremover.com) (Extract vocals with 3_HP-Vocal-UVR and remove backing vocals with 5_HP-Karaoke-UVR).
# # spleeter is an alternative to demucs, but may cause dependency conflicts with so-vits-svc-fork
# %pip install spleeter
# !spleeter separate -o /content song.mp3

# The selected model name is also part of the output path that the later cells read from.
demucs_model = 'htdemucs_ft' #@param ['htdemucs', 'htdemucs_ft', 'htdemucs_6s', 'htdemucs_mmi', 'mdx_extra']
# Split song.mp3 into a vocals stem and a no-vocals stem; the following cells expect
# vocals.wav and no_vocals.wav under /content/separated/<demucs_model>/song/.
!demucs -n {demucs_model} --two-stems=vocals song.mp3

In [None]:
#@title Inference
# Vocals stem produced by the separation cell for the currently selected demucs model.
inputVocals = f'/content/separated/{demucs_model}/song/vocals.wav'
# Pitch (F0) extraction method passed to svc via -fm.
F0_METHOD = 'crepe' #@param ['crepe', 'crepe-tiny', 'parselmouth', 'dio', 'harvest']
# Value passed to svc via -t; the combine cell uses the same value (as semitones) to pitch-shift the accompaniment.
transpose = 0 #@param {'type': 'integer'}
#@markdown Sometimes the song needs to be up-transposed to fit AI Zhuoxuan's vocal range, with the accompaniment pitch-shifted accordingly.

# Convert the vocals with the downloaded checkpoint and config. The combine cell reads the
# result from vocals.out.wav next to the input file.
# NOTE(review): 'czx' is presumably the speaker name defined in config.json - confirm.
!svc infer {inputVocals} -fm {F0_METHOD} -t {transpose} -na --speaker 'czx' -m /content/G_30400.pth -c /content/config.json
# Change -na to -a to enable auto-predict-f0 (but the result may become out of tune)

In [None]:
#@title Combine output vocals with accompaniment
outputVocals = f'/content/separated/{demucs_model}/song/vocals.out.wav'
accompaniment_0 = f'/content/separated/{demucs_model}/song/no_vocals.wav'

if transpose==0:
  accompaniment = accompaniment_0
else:
  accompaniment = f'/content/separated/{demucs_model}/song/no_vocals_trans.wav'
  r = pow(2,transpose/12)
  !ffmpeg -y -i {accompaniment_0} -af asetrate=44100*{r},aresample=44100,atempo=1/{r} {accompaniment}

outputSong = 'outputSong.mp3'
!ffmpeg -y -i {outputVocals} -i {accompaniment} -filter_complex amerge=inputs=2 -ac 2 {outputSong}

# Numbered copies of the files are saved before the next inference to avoid inadvertent overwriting
rename = f'song{i}'
!cp outputSong.mp3 'output{rename}.mp3'
!cp song.mp3 '{rename}.mp3'
!cp -R /content/separated/{demucs_model}/song /content/separated/{demucs_model}/{rename}
i+=1

# Play output

In [None]:
# Play the numbered copy written by the most recent run of the combine cell.
latestOutput = f'output{rename}.mp3'
print('Playing ' + latestOutput)
display(Audio(latestOutput, autoplay=True))

In [None]:
# # play one of the files
# !ffmpeg -y -i /content/separated/htdemucs_ft/song2/vocals.wav temp.mp3
# display(Audio('temp.mp3', autoplay=True))