# SoftVC Acoustic Model
[![Generic badge](https://img.shields.io/badge/GitHub-softVCam-9cf.svg)][github]
[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)][notebook]

Author: [tarepan]

[github]:https://github.com/tarepan/softVC_AM
[notebook]:https://colab.research.google.com/github/tarepan/softVC_AM/blob/main/softVC_AM.ipynb
[tarepan]:https://github.com/tarepan

## Colab Check
Check
- Google Colaboratory running time
- GPU type
- Python version
- CUDA version

In [None]:
# Runtime uptime: take the first field of /proc/uptime (seconds) and print it as days and as seconds
!cat /proc/uptime | awk '{print $1 /60 /60 /24 "days (" $1 "sec)"}'
# GPU type: first line of the NVIDIA driver's per-GPU information file
!head -n 1 /proc/driver/nvidia/gpus/**/information
# Python version
!python --version
# `sed '2!d'` deletes every line except the 2nd one (presumably the "Version:" field of `pip show`)
!pip show torch | sed '2!d'
# Keep only the 4th line of the nvcc banner (presumably the CUDA release line)
!/usr/local/cuda/bin/nvcc --version | sed '4!d'

## Setup

Install the package from the public `tarepan/softVC_AM` repository.

In [None]:
# GoogleDrive
# Mounted at /content/gdrive; later cells read the corpus from and write training results to paths under it
from google.colab import drive
drive.mount('/content/gdrive')

# Dedicated dependencies install
# !pip install "torch==1.10.0" -q      # Based on your PyTorch environment
# !pip install "torchaudio==0.10.0" -q # Based on your PyTorch environment

# Clone the repository and move into it: later cells call `mels.py`, `train.py` and `generate.py` by relative path
# NOTE(review): re-running this cell clones/cds again from inside the checkout — run it once per runtime
!git clone https://github.com/tarepan/softVC_AM
%cd softVC_AM
# repository install
# !pip uninstall softvcam -y -q
# !pip install git+https://github.com/tarepan/softVC_AM -q

# Companion packages installed straight from GitHub (later cells import `hubert.encode` and `speechcorpusy`)
!pip install git+https://github.com/tarepan/softVC_hubert
!pip install git+https://github.com/tarepan/speechcorpusy.git
!pip install git+https://github.com/tarepan/extorch

## Training

### Step 0: Data

In [None]:
import speechcorpusy

# Load the "LJ" preset corpus, with the Google Drive directory as the corpus root
corpus = speechcorpusy.load_preset("LJ", root="/content/gdrive/MyDrive/ML_data")
# presumably fetches/extracts the corpus contents so that later cells can read the wavs — TODO confirm with speechcorpusy
corpus.get_contents()


### Step 1: Preprocessing

In [None]:
!mkdir data_softVC
!mkdir data_softVC/train

#### Data paths
Write lists of wav file paths, relative to the data root, under `./data_softVC/train`.

In [None]:
from pathlib import Path
import speechcorpusy

corpus_id, corpus_name = "LJ", "LJSpeech"
# Data root: every listed path is written relative to this directory
in_wav_dir = Path(f"tmp/corpuses/{corpus_name}")
corpus = speechcorpusy.load_preset(corpus_id, root="/content/gdrive/MyDrive/ML_data")
all_utterances = corpus.get_identities()
# Hold out the last 20 utterances for validation; everything before them is used for training
uttrs_train, uttrs_val = all_utterances[:-20], all_utterances[-20:]


def _write_path_list(list_path, utterances):
    """Write the wav path of each utterance, relative to `in_wav_dir`, one per line into `list_path`."""
    # "w" instead of "a": re-running the cell must regenerate the list, not append duplicated entries
    with open(list_path, "w", encoding="utf-8") as f:
        f.writelines(
            str(corpus.get_item_path(item).relative_to(in_wav_dir)) + "\n"
            for item in utterances
        )


_write_path_list("data_softVC/train/train.txt", uttrs_train)
_write_path_list("data_softVC/train/validation.txt", uttrs_val)


#### Wave-to-Unit

In [None]:
# For only CPC
# !pip install joblib
# !wget https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/cpc_big_ll6kh_top_ctc.pt
# !wget https://dl.fbaipublicfiles.com/textless_nlp/gslm/cpc/km100/km.bin
# !pip uninstall "scikit-learn" -q -y
# !pip install "scikit-learn==0.24.2"

In [None]:
# For only vq-wav2vec
# !pip install git+https://github.com/tarepan/fairseq.git
# !pip install git+https://github.com/tarepan/s3prl.git

In [None]:
from pathlib import Path
from hubert.encode import encode_dataset
# NOTE(review): the two imports below are used only by the commented-out alternatives at the bottom of this cell;
# they may need the optional CPC / vq-wav2vec installs from the cells above — confirm
from units.encode import encode_cpc_dataset
from units.vqwav2vec import encode_vqwav2vec_dataset


#### Change here for your data ###########################
soft_disc = "soft"
# soft_disc = "discrete"
corpus_name = "LJSpeech"
in_wav_dir = Path(f"tmp/corpuses/{corpus_name}")    # Directory containing .wav input files
##########################################################

out_unit_dir = Path(f"./data_softVC/train/{soft_disc}") # Directory in which new unit series will be saved
# Create the output directory up front (no-op when it already exists), as the inference-side encoding cell does
out_unit_dir.mkdir(parents=True, exist_ok=True)
# Encode every ".wav" under `in_wav_dir` into unit series of the selected kind (`soft_disc`)
encode_dataset(soft_disc, in_wav_dir, out_unit_dir, ".wav")
# encode_cpc_dataset(soft_disc, in_wav_dir, out_unit_dir, ".wav") # CPC discrete
# encode_vqwav2vec_dataset(soft_disc, in_wav_dir, out_unit_dir, ".wav") # VQ-wav2vec discrete

#### Wave-to-Mel

In [None]:
# Wave-to-Mel: run the repository's `mels.py` on the corpus wav directory, writing under ./data_softVC/train/mels
# (argument roles inferred from the paths — confirm with `mels.py`)
# NOTE(review): the input path is hard-coded; keep it in sync with `in_wav_dir` of the Wave-to-Unit cell
!python mels.py "tmp/corpuses/LJSpeech" "./data_softVC/train/mels"

In [None]:
# Launch TensorBoard
# The log directory is the same path that is passed to `train.py` as its second argument below
%load_ext tensorboard
%tensorboard --logdir /content/gdrive/MyDrive/ML_results/softVC_AM

# Default run: `train.py <data dir> <result dir>` (argument roles inferred from the paths — confirm with `train.py`)
!python train.py data_softVC/train /content/gdrive/MyDrive/ML_results/softVC_AM
# Alternative runs: use one of the lines below instead of the default run above
# !python train.py data_softVC/train /content/gdrive/MyDrive/ML_results/softVC_AM --discrete
# !python train.py data_softVC/train /content/gdrive/MyDrive/ML_results/softVC_AM --causal
# !python train.py data_softVC/train /content/gdrive/MyDrive/ML_results/softVC_AM --discrete --no-upsampling # For CPC
# !python train.py data_softVC/train /content/gdrive/MyDrive/ML_results/softVC_AM --resume=/content/gdrive/MyDrive/ML_results/softVC_AM/... # Resume


## Inference

### Unit-to-Mel

#### Step 0: Wave-to-Unit
If you do NOT have unit `.npy` files yet, generate them first.

In [None]:
# Same package as installed in the Setup section; repeated here presumably so that Inference can be run on its own
!pip install git+https://github.com/tarepan/softVC_hubert

In [None]:
from pathlib import Path
from hubert.encode import encode_dataset
# Used only by the commented-out CPC alternative at the bottom of this cell
from units.encode import encode_cpc_dataset

#### Change here for your data ###########################
soft_disc = "soft"
# soft_disc = "discrete"
# soft_disc = "cpc_discrete"
in_wav_dir = Path("./data_softVC/wavs")    # Directory containing .wav input files
out_unit_dir = Path("./data_softVC/units") # Directory in which new unit series will be saved
##########################################################

# Make sure the output directory exists before encoding (no-op when it already exists)
out_unit_dir.mkdir(parents=True, exist_ok=True)
# Encode every ".wav" under `in_wav_dir` into unit series of the selected kind (`soft_disc`)
encode_dataset(soft_disc, in_wav_dir, out_unit_dir, ".wav")
# encode_cpc_dataset(soft_disc, in_wav_dir, out_unit_dir, ".wav") # CPC discrete

#### Step 1: Unit-to-Mel

In [None]:
# Unit-to-Mel with the repository's `generate.py`: <unit kind> <in-unit-dir> <out-mel-dir>
# NOTE: `<in-unit-dir>` and `<out-mel-dir>` are placeholders — replace them with real paths before running
# (left as-is, the shell treats `<` and `>` as redirections and the command fails)
!python ./generate.py soft     <in-unit-dir> <out-mel-dir>
# !python ./generate.py discrete <in-unit-dir> <out-mel-dir>
# !python ./generate.py soft "./data_softVC/units" "./data_softVC/mspcs" # For example

### From Checkpoint
1. Make unit series from input waveform
2. Apply A2O VC with trained model in the checkpoint
3. Make waveform from the mels with pre-trained (LJSpeech-optimized) vocoder

In [None]:
import soundfile as sf
import torch
import torchaudio
import resampy

from acoustic.model import hubert_soft, hubert_discrete


# 'Hubert soft' setup
soft_disc = "soft"
# soft_disc = "discrete"
path_wav_origin = "<your_audio_data>.wav"
path_am_ckpt = "<your_AM_checkpoint>.pt"

# Sampling rate [Hz] the source audio is resampled to, also used as the playback rate at the end of the cell
sr_target = 16000

# Fail early with a clear message on an unsupported unit kind, instead of binding a non-model placeholder
if soft_disc not in ("soft", "discrete"):
    raise ValueError(f'soft_disc must be "soft" or "discrete", got {soft_disc!r}')

# Pre-trained models
hubert = torch.hub.load("bshall/hubert:main", f"hubert_{soft_disc}").cuda()
hifigan = torch.hub.load("bshall/hifigan:main", f"hifigan_hubert_{soft_disc}").cuda()

# Trained AM from the checkpoint
# `pretrained=False`: the weights come from the checkpoint loaded below
acoustic = hubert_soft(pretrained=False) if soft_disc == "soft" else hubert_discrete(pretrained=False)
ckpt = torch.load(path_am_ckpt, map_location={"cuda:0": "cuda:0"})
acoustic.load_state_dict(ckpt["acoustic-model"])
acoustic = acoustic.cuda()

# Load the source audio
i_wave_tmp, sr_source = sf.read(path_wav_origin)
if i_wave_tmp.ndim > 1:
    # Multi-channel audio is read as (frames, channels). Downmix to mono first: `resampy.resample`
    # works on the last axis by default, so it would otherwise resample across channels, not time.
    i_wave_tmp = i_wave_tmp.mean(axis=1)
# Resample via a temporary wav file, then reload it as a torch tensor
path_wav_resampled = "./resampled.wav"
sf.write(path_wav_resampled, resampy.resample(i_wave_tmp, sr_source, sr_target), sr_target)
i_wave, sr = torchaudio.load(path_wav_resampled)
assert sr == sr_target
# Add a leading batch axis and move to the GPU
i_wave = i_wave.unsqueeze(0).cuda()

# Generation
with torch.inference_mode():
    # Wave-to-Unit
    unit_series = hubert.units(i_wave)

    # Unit-to-Mel
    if soft_disc == "discrete":
        ## (T, ) -> (1, T) for discrete
        unit_series = unit_series.unsqueeze(0)
    # Swap the last two axes of the generated mel series before handing it to the vocoder
    mspc_series = acoustic.generate(unit_series).transpose(1, 2)

    # Mel-to-Wave
    o_wave = hifigan(mspc_series)

# Display
from IPython.display import Audio, display

print("==========\nOrigin")
display(Audio(i_wave.squeeze().cpu(), rate=sr_target))
print("==========\nVC")
display(Audio(o_wave.squeeze().cpu(), rate=sr_target))