# 실습: Voice Conversion 모델 동작을 위한 함수 구현

본 실습의 목표는 Voice Conversion을 동작시키기 위해서 필요한 함수들을 구현하여 VC 모델을 동작시키는 것입니다. 구현이 완료된 이후에는 다양한 소스/타겟 음성을 입력하여 음성 변조 결과를 확인할 수 있습니다.



In [1]:
# Mount Google Drive at /content/drive; later cells change into a folder under it.
from google.colab import drive
drive.mount('/content/drive')

KeyboardInterrupt: ignored

In [2]:
# Show the current working directory.
!pwd

/content


In [3]:
# Switch to the exercise folder on the mounted Drive.
%cd /content/drive/MyDrive/Colab Notebooks/sub2

/content/drive/MyDrive/Colab Notebooks/sub2


In [4]:
# Confirm the working directory and list its contents.
!pwd
!ls

/content/drive/MyDrive/Colab Notebooks/sub2
AudioPreprocessing.ipynb  jupyter		 speakers.json
best_model.pth.tar	  language_ids.json	 TTS
config.json		  requirements.txt	 VoiceConversion.ipynb
config_se.json		  SE_checkpoint.pth.tar


In [5]:
# Print the interpreter version and list the working directory.
!python --version
!ls

Python 3.9.16
AudioPreprocessing.ipynb  jupyter		 speakers.json
best_model.pth.tar	  language_ids.json	 TTS
config.json		  requirements.txt	 VoiceConversion.ipynb
config_se.json		  SE_checkpoint.pth.tar


In [6]:
# Install the packages listed in requirements.txt.
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### 모델 다운로드
먼저 미리 학습되어 있는 VC 모델과, 목소리 정보를 추출할 수 있는 모델을 다운로드합니다.

In [7]:
# Local file names for the speaker-encoder config and checkpoint.
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
# Download the speaker-encoder config.
! gdown https://drive.google.com/uc?id=19cDrhZZ0PfKf2Zhr_ebB-QASRw844Tn1 -O $CONFIG_SE_PATH
# Download the voice-conversion model checkpoint (loaded later as best_model.pth.tar).
# NOTE(review): the recorded output of this cell shows this download being
# refused with "Access denied"; the file then has to be present already.
! gdown https://drive.google.com/uc?id=1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR  -O best_model.pth.tar
# Download the speaker-encoder checkpoint.
! gdown https://drive.google.com/uc?id=17JsW6h6TIh7-LkU2EvB_gnNrPcdBxt7X -O $CHECKPOINT_SE_PATH

rm: cannot remove '/root/.cache/gdown/cookies.json': No such file or directory
Downloading...
From: https://drive.google.com/uc?id=19cDrhZZ0PfKf2Zhr_ebB-QASRw844Tn1
To: /content/drive/MyDrive/Colab Notebooks/sub2/config_se.json
100% 3.49k/3.49k [00:00<00:00, 6.44MB/s]
Access denied with the following error:

 	Cannot retrieve the public link of the file. You may need to change
	the permission to 'Anyone with the link', or have had many accesses. 

You may still be able to access the file from the browser:

	 https://drive.google.com/uc?id=1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR 

Downloading...
From: https://drive.google.com/uc?id=17JsW6h6TIh7-LkU2EvB_gnNrPcdBxt7X
To: /content/drive/MyDrive/Colab Notebooks/sub2/SE_checkpoint.pth.tar
100% 44.6M/44.6M [00:00<00:00, 123MB/s]


### 라이브러리 import
필요한 라이브러리들을 import합니다.

In [28]:
import sys
TTS_PATH = "/home/sung/samsung/TTS"

# add libraries into environment
sys.path.append(TTS_PATH) # set this if TTS is not installed globally
print(sys.path)
import os
import string
import time
import argparse
import json

import numpy as np
import IPython
from IPython.display import Audio


import torch

from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
try:
  from TTS.utils.audio import AudioProcessor
except:
  from TTS.utils.audio import AudioProcessor


from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *

%load_ext autoreload
%autoreload 2

%reload_ext autoreload
from TTS.tts.utils.speakers import SpeakerManager

from pydub import AudioSegment
import librosa

['/content', '/env/python', '/usr/lib/python39.zip', '/usr/lib/python3.9', '/usr/lib/python3.9/lib-dynload', '', '/usr/local/lib/python3.9/dist-packages', '/usr/lib/python3/dist-packages', '/usr/local/lib/python3.9/dist-packages/IPython/extensions', '/root/.ipython', '/home/sung/samsung/TTS', '/home/sung/samsung/TTS']
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Voice Conversion 모델 세팅
미리 학습된 Voice Conversion 모델을 동작하기 위한 기본적인 세팅을 진행합니다.

In [29]:
# File names and constants for the pre-trained voice-conversion model.
MODEL_PATH = 'best_model.pth.tar'
CONFIG_PATH = 'config.json'
TTS_LANGUAGES = "language_ids.json"
TTS_SPEAKERS = "speakers.json"
SAMPLING_RATE = 16000
USE_CUDA = torch.cuda.is_available()

# Read the model config, then build the audio processor from its audio section.
C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

speaker_embedding = None

# Point the model at the speakers file and switch off the speaker-encoder loss.
C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)
print(model.language_manager.num_languages, model.embedded_language_dim)
print(model.emb_l)

# Load the checkpoint onto the CPU first.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
cp = torch.load(MODEL_PATH, map_location=torch.device('cpu'))

# Work on a copy of the weights and drop every speaker-encoder entry from it.
model_weights = cp['model'].copy()
for key in [k for k in model_weights if "speaker_encoder" in k]:
    del model_weights[key]

# strict=False tolerates the entries that were removed above.
model.load_state_dict(model_weights, strict=False)
model.eval()

if USE_CUDA:
    model = model.cuda()

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:False
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Using model: vits
 > Speaker manager is loaded with 6 speakers: female-en-5, female-en-5
, female-pt-4
, male-en-2, male-en-2
, male-pt-3

3 4
Embedding(3, 4)


### Speaker Encoder 모델 세팅
미리 학습된 Speaker Encoder 모델을 동작하기 위한 기본적인 세팅을 진행합니다.

In [30]:
# Build the speaker manager around the pre-trained speaker-encoder checkpoint.
SE_speaker_manager = SpeakerManager(
    encoder_model_path=CHECKPOINT_SE_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    use_cuda=USE_CUDA,
)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400


# 실습 진행하기

## Req. 2-2: Spectrogram을 생성하는 compute_spec() 함수 구현

In [31]:
import librosa

def compute_spec(ref_file, *, sr=16000, n_fft=1024, hop_length=256, win_length=1024):
    """Compute the linear magnitude spectrogram of an audio file.

    The defaults are the values this function used to hard-code, so existing
    calls such as ``compute_spec(path)`` behave exactly as before.

    Args:
        ref_file: Path of the audio file; passed to ``librosa.load``.
        sr: Sampling rate the audio is resampled to while loading.
        n_fft: FFT size of the short-time Fourier transform.
        hop_length: Number of samples between the starts of successive frames.
        win_length: Number of samples in each (Hann) analysis window.

    Returns:
        A ``torch.FloatTensor`` holding the absolute value of the STFT with a
        leading dimension of size 1 added in front.
    """
    ############################################################################
    # TODO: implement compute_spec(), which generates the spectrogram          #
    ############################################################################
    sig, _ = librosa.load(ref_file, sr=sr)

    # Short-time Fourier transform of the waveform.
    stft = librosa.stft(
        sig,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        window='hann',
        pad_mode='reflect',
    )

    # Keep only the magnitude of the complex STFT values.
    magnitude = np.abs(stft)

    # Convert to a float tensor and add a leading dimension of size 1.
    spec = torch.FloatTensor(magnitude)
    spec = spec.unsqueeze(0)
    ############################################################################
    # TODO: implement compute_spec(), which generates the spectrogram          #
    ############################################################################

    return spec

### Req. 2-2의 구현을 완료한 뒤 테스트 합니다.

In [32]:
#### For testing Req. 2-2 ####
# Run compute_spec() on a sample file and print the result's shape and value range.
test_audio = "./jupyter/source/test.wav"
test_spec = compute_spec(test_audio)
print("shape of the test spectrogram: ", test_spec.shape)
print("max value of the test spectrogram: ", test_spec.max())
print("min value of the test spectrogram: ", test_spec.min())
#### For testing Req. 2-2 ####

shape of the test spectrogram:  torch.Size([1, 513, 376])
max value of the test spectrogram:  tensor(157.3934)
min value of the test spectrogram:  tensor(7.8661e-07)


### Req. 2-3의 구현을 완료한 뒤 테스트 합니다.

In [33]:
#### For testing Req. 2-3 ####
# Compute the speaker embedding of a sample file and print its shape and value range.
test_audio = "./jupyter/source/test.wav"
test_emb = SE_speaker_manager.compute_speaker_embedding(test_audio)

print("shape of the test embedding: ", test_emb.shape)
print("max value of the test embedding: ", test_emb.max())
print("min value of the test embedding: ", test_emb.min())
#### For testing Req. 2-3 ####

torch.Size([96161])
torch.Size([1, 96161])
shape of the test embedding:  torch.Size([1, 512])
max value of the test embedding:  tensor(0.2351, device='cuda:0')
min value of the test embedding:  tensor(-0.2167, device='cuda:0')


### Req. 2-4의 구현을 완료한 뒤 테스트 합니다.

In [34]:
#### For testing Req. 2-4 ####
# Compute a d-vector from two clips, turn it into a float tensor with a leading
# dimension of size 1, and print its value range.
test_audios = ["./jupyter/source/test.wav", "./jupyter/source/test2.wav"]
test_emb = torch.FloatTensor(
    SE_speaker_manager.compute_d_vector_from_clip(test_audios)
).unsqueeze(0)
print("max value of the test embedding: ", test_emb.max())
print("min value of the test embedding: ", test_emb.min())
#### For testing Req. 2-4 ####

torch.Size([96161])
torch.Size([1, 96161])
torch.Size([122561])
torch.Size([1, 122561])
max value of the test embedding:  tensor(0.1433)
min value of the test embedding:  tensor(-0.2144)


In [35]:
#### For testing Req. 2-4 ####
# Same check as with two clips, this time with a single clip in the list.
test_audios = ["./jupyter/source/test2.wav"]
test_emb = torch.FloatTensor(
    SE_speaker_manager.compute_d_vector_from_clip(test_audios)
).unsqueeze(0)
print("max value of the test embedding: ", test_emb.max())
print("min value of the test embedding: ", test_emb.min())
#### For testing Req. 2-4 ####

torch.Size([122561])
torch.Size([1, 122561])
max value of the test embedding:  tensor(0.1345)
min value of the test embedding:  tensor(-0.2572)


# Voice Conversion 모델을 동작시킵니다.

In [36]:
print("Select target speaker reference audios files:")
# Reference clips of the target speaker, kept as a list of paths.
target_files = ["./jupyter/source/tar.wav"]

Select target speaker reference audios files:


In [37]:
print("Select driving audio file:")
# Source (driving) clip, kept as a one-element list of paths.
driving_file = ["./jupyter/source/src.wav"]

Select driving audio file:


## Req. 2-5: 소스 음성과 타겟 음성의 embedding을 추출하는 함수 구현

In [39]:
################################################################################
# TODO: implement extraction of the source and target speech embeddings        #
################################################################################
# Target speaker: d-vector from the reference clips, as a float tensor with a
# leading dimension of size 1.
target_emb = torch.FloatTensor(
    SE_speaker_manager.compute_d_vector_from_clip(target_files)
).unsqueeze(0)

# Source (driving) speaker: same procedure on the driving clip.
driving_emb = torch.FloatTensor(
    SE_speaker_manager.compute_d_vector_from_clip(driving_file)
).unsqueeze(0)
################################################################################
# TODO: implement extraction of the source and target speech embeddings        #
################################################################################

torch.Size([65195])
torch.Size([1, 65195])
torch.Size([54144])
torch.Size([1, 54144])


In [40]:
# `driving_file` was wrapped in a list for the embedding step. Unwrap it only
# while it is still a list, so that re-running this cell does not index into
# the path string itself.
if isinstance(driving_file, (list, tuple)):
    driving_file = driving_file[0]

driving_spec = compute_spec(driving_file)
# Size of the spectrogram's last axis, as a one-element tensor.
y_lengths = torch.tensor([driving_spec.size(-1)])

# Move all inputs to the GPU together when CUDA is used.
vc_inputs = (driving_spec, y_lengths, driving_emb, target_emb)
if USE_CUDA:
    vc_inputs = tuple(tensor.cuda() for tensor in vc_inputs)

# Inference only: no gradients are needed for the conversion.
with torch.no_grad():
    ref_wav_voc, _, _ = model.voice_conversion(*vc_inputs)
# .cpu() is a no-op when the result already lives on the CPU.
ref_wav_voc = ref_wav_voc.squeeze().cpu().detach().numpy()


print("Target Speaker reference Audio")
IPython.display.display(Audio(target_files[0], rate=ap.sample_rate))

print("Source speaker reference Audio")
IPython.display.display(Audio(driving_file, rate=ap.sample_rate))

print("Play the converted audio:")
IPython.display.display(Audio(ref_wav_voc, rate=SAMPLING_RATE))

Target Speaker reference Audio


Source speaker reference Audio


Play the converted audio:
