# 실습: Voice Conversion 모델 동작을 위한 함수 구현

본 실습의 목표는 Voice Conversion을 동작시키기 위해서 필요한 함수들을 구현하여 VC 모델을 동작시키는 것입니다. 구현이 완료된 이후에는 다양한 소스/타겟 음성을 입력하여 음성 변조 결과를 확인할 수 있습니다.



### 모델 다운로드
먼저 미리 학습되어 있는 VC 모델과 목소리 정보를 추출할 수 있는 모델을 다운로드합니다.

In [2]:
# Local file names of the speaker-encoder (SE) config and checkpoint.
CONFIG_SE_PATH = "config_se.json"
CHECKPOINT_SE_PATH = "SE_checkpoint.pth.tar"
# The gdown commands below are commented out; uncomment them to fetch the
# files when they are not available locally.
# # download the speaker-encoder config
# ! gdown --id  19cDrhZZ0PfKf2Zhr_ebB-QASRw844Tn1 -O $CONFIG_SE_PATH
# # download the speaker-encoder checkpoint
# ! gdown --id   17JsW6h6TIh7-LkU2EvB_gnNrPcdBxt7X -O $CHECKPOINT_SE_PATH
# # download the voice-conversion model checkpoint
# ! gdown --id 1sgEjHt0lbPSEw9-FSbC_mBoOPwNi87YR -O best_model.pth.tar

### 라이브러리 import
필요한 라이브러리들을 import합니다.

In [3]:
import sys
# Absolute path of the local TTS source checkout.
# NOTE(review): this path is specific to one Windows machine; adjust it
# before running the notebook anywhere else.
TTS_PATH = "C:\\Users\\SSAFY\\Desktop\\SSAFY\\projects\\2special\\sources\\ai-speech-skeleton\\sub2\\SubPJT2_Voice_Conversion\\TTS"

# add libraries into environment
sys.path.append(TTS_PATH) # set this if TTS is not installed globally
# Print the search path so the appended entry can be checked in the cell output.
print(sys.path)
import os
import string
import time
import argparse
import json

import numpy as np
import IPython
from IPython.display import Audio


import torch

from TTS.tts.utils.synthesis import synthesis
from TTS.tts.utils.text.symbols import make_symbols, phonemes, symbols
# NOTE(review): both branches run the exact same import, so the fallback is
# effectively a blind retry, and the bare `except` swallows whatever the first
# attempt raised. Confirm whether the retry is still needed; if it is not, a
# single import statement is enough.
try:
  from TTS.utils.audio import AudioProcessor
except:
  from TTS.utils.audio import AudioProcessor


from TTS.tts.models import setup_model
from TTS.config import load_config
from TTS.tts.models.vits import *

from TTS.tts.utils.speakers import SpeakerManager
from pydub import AudioSegment
import librosa

['c:\\Users\\SSAFY\\Desktop\\SSAFY\\projects\\2special\\sources\\ai-speech-skeleton\\sub2\\SubPJT2_Voice_Conversion', 'c:\\Users\\SSAFY\\AppData\\Local\\Programs\\Python\\Python39\\python39.zip', 'c:\\Users\\SSAFY\\AppData\\Local\\Programs\\Python\\Python39\\DLLs', 'c:\\Users\\SSAFY\\AppData\\Local\\Programs\\Python\\Python39\\lib', 'c:\\Users\\SSAFY\\AppData\\Local\\Programs\\Python\\Python39', '', 'C:\\Users\\SSAFY\\AppData\\Roaming\\Python\\Python39\\site-packages', 'C:\\Users\\SSAFY\\AppData\\Roaming\\Python\\Python39\\site-packages\\win32', 'C:\\Users\\SSAFY\\AppData\\Roaming\\Python\\Python39\\site-packages\\win32\\lib', 'C:\\Users\\SSAFY\\AppData\\Roaming\\Python\\Python39\\site-packages\\Pythonwin', 'c:\\Users\\SSAFY\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages', 'C:\\Users\\SSAFY\\Desktop\\SSAFY\\projects\\2special\\sources\\ai-speech-skeleton\\sub2\\SubPJT2_Voice_Conversion\\TTS']


### Voice Conversion 모델 세팅
미리 학습된 Voice Conversion 모델을 동작시키기 위한 기본적인 세팅을 진행합니다.

In [4]:
# Files of the pretrained voice-conversion model and its companion metadata.
MODEL_PATH = 'best_model.pth.tar'
CONFIG_PATH = 'config.json'
TTS_LANGUAGES = "language_ids.json"
TTS_SPEAKERS = "speakers.json"
SAMPLING_RATE = 16000
USE_CUDA = torch.cuda.is_available()

# Read the model config and build the audio processor from its audio section.
C = load_config(CONFIG_PATH)
ap = AudioProcessor(**C.audio)

speaker_embedding = None

# Point the model at the speaker file and switch off the speaker-encoder loss
# before the model is constructed.
C.model_args['d_vector_file'] = TTS_SPEAKERS
C.model_args['use_speaker_encoder_as_loss'] = False

model = setup_model(C)
model.language_manager.set_language_ids_from_file(TTS_LANGUAGES)

# Load the checkpoint on CPU and keep every weight except those belonging to
# the speaker encoder.
cp = torch.load(MODEL_PATH, map_location="cpu")
model_weights = {
    name: weight
    for name, weight in cp['model'].items()
    if "speaker_encoder" not in name
}

model.load_state_dict(model_weights)
model.eval()

# Move the model to the GPU only when one is available.
if USE_CUDA:
    model = model.cuda()

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:True
 | > mel_fmin:0
 | > mel_fmax:None
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:45
 | > do_sound_norm:False
 | > do_amp_to_db_linear:False
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:2.718281828459045
 | > hop_length:256
 | > win_length:1024
 > Using model: vits
 > Speaker manager is loaded with 6 speakers: female-en-5, female-en-5
, female-pt-4
, male-en-2, male-en-2
, male-pt-3



### Speaker Encoder 모델 세팅
미리 학습된 Speaker Encoder 모델을 동작시키기 위한 기본적인 세팅을 진행합니다.

In [5]:
# Speaker manager backed by the pretrained speaker encoder; it is used below
# to compute speaker embeddings from audio files.
SE_speaker_manager = SpeakerManager(
    encoder_model_path=CHECKPOINT_SE_PATH,
    encoder_config_path=CONFIG_SE_PATH,
    use_cuda=USE_CUDA,
)

 > Setting up Audio Processor...
 | > sample_rate:16000
 | > resample:False
 | > num_mels:64
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:20
 | > fft_size:512
 | > power:1.5
 | > preemphasis:0.97
 | > griffin_lim_iters:60
 | > signal_norm:False
 | > symmetric_norm:False
 | > mel_fmin:0
 | > mel_fmax:8000.0
 | > spec_gain:20.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:False
 | > do_trim_silence:False
 | > trim_db:60
 | > do_sound_norm:False
 | > do_amp_to_db_linear:True
 | > do_amp_to_db_mel:True
 | > stats_path:None
 | > base:10
 | > hop_length:160
 | > win_length:400


# 실습 진행하기

## Req. 2-2:	Spectrogram을 생성하는 compute_spec() 함수 구현

In [20]:
import librosa
from librosa.filters import mel as librosa_mel_fn

def compute_spec(ref_file, sr=16000, n_fft=1024, hop_length=256, win_length=1024):
    """Compute the linear magnitude spectrogram of an audio file.

    The defaults mirror the VC model's audio settings (16 kHz sample rate,
    FFT size 1024, hop length 256, window length 1024). The model consumes a
    linear spectrogram with ``n_fft // 2 + 1`` (= 513) frequency bins, so no
    mel projection is applied here.

    Args:
        ref_file: Path of the audio file to analyse.
        sr: Sample rate the audio is resampled to on load.
        n_fft: FFT size of the STFT.
        hop_length: Hop length of the STFT, in samples.
        win_length: Window length of the STFT, in samples.

    Returns:
        A float32 ``torch.Tensor`` of shape ``(1, n_fft // 2 + 1, frames)``.
    """
    # Only the waveform is needed; the returned sample rate equals `sr`.
    wav, _ = librosa.load(path=ref_file, sr=sr)
    stft = librosa.stft(
        y=wav,
        n_fft=n_fft,
        hop_length=hop_length,
        win_length=win_length,
        pad_mode="reflect",
        window="hann",
    )
    magnitude = np.abs(stft).astype(np.float32)
    # Add a leading batch dimension.
    return torch.from_numpy(magnitude).unsqueeze(0)

### Req. 2-2의 구현을 완료한 뒤 테스트 합니다.

In [7]:
#### Req. 2-2 test ####
# Run compute_spec on a sample file and print the shape and value range of
# the resulting spectrogram.
test_audio = "./jupyter/source/test.wav"
test_spec = compute_spec(test_audio)
print("shape of the test spectrogram: ", test_spec.shape)
print("max value of the test spectrogram: ", test_spec.max())
print("min value of the test spectrogram: ", test_spec.min())
#### end of Req. 2-2 test ####

shape of the test spectrogram:  torch.Size([1, 513, 376])
max value of the test spectrogram:  tensor(21.7452)
min value of the test spectrogram:  tensor(0.)




### Req. 2-3의 구현을 완료한 뒤 테스트 합니다.

In [8]:
#### Req. 2-3 test ####
# Compute the speaker embedding of a single audio file and print its shape
# and value range.
test_audio = "./jupyter/source/test.wav"
test_emb = SE_speaker_manager.compute_speaker_embedding(test_audio)
print("shape of the test embedding: ", test_emb.shape)
print("max value of the test embedding: ", test_emb.max())
print("min value of the test embedding: ", test_emb.min())
#### end of Req. 2-3 test ####

shape of the test embedding:  torch.Size([1, 512])
max value of the test embedding:  tensor(0.2351, device='cuda:0')
min value of the test embedding:  tensor(-0.2167, device='cuda:0')


### Req. 2-4의 구현을 완료한 뒤 테스트 합니다.

In [9]:
#### Req. 2-4 test (two clips) ####
# Compute one embedding from a list of two audio files and print its value range.
test_audios = ["./jupyter/source/test.wav", "./jupyter/source/test2.wav"]
test_emb = SE_speaker_manager.compute_d_vector_from_clip(test_audios)
print("max value of the test embedding: ", test_emb.max())
print("min value of the test embedding: ", test_emb.min())
#### end of Req. 2-4 test ####

max value of the test embedding:  tensor(0.1433, device='cuda:0')
min value of the test embedding:  tensor(-0.2144, device='cuda:0')


In [10]:
#### Req. 2-4 test (single clip) ####
# Same call as the two-clip test, but with a list holding only one audio file.
test_audios = ["./jupyter/source/test2.wav"]
test_emb = SE_speaker_manager.compute_d_vector_from_clip(test_audios)
print("max value of the test embedding: ", test_emb.max())
print("min value of the test embedding: ", test_emb.min())
#### end of Req. 2-4 test ####

max value of the test embedding:  tensor(0.1345, device='cuda:0')
min value of the test embedding:  tensor(-0.2572, device='cuda:0')


# Voice Conversion 모델을 동작시킵니다.

In [11]:
print("Select target speaker reference audios files:")
# Reference clip(s) of the target speaker, kept as a list because the speaker
# manager is called with a list of paths.
target_files = ["./jupyter/source/test2.wav"]
target_emb = SE_speaker_manager.compute_d_vector_from_clip(target_files)


Select target speaker reference audios files:


In [15]:
print("Select driving audio file:")
# Driving (source) utterance, kept as a one-element list because the speaker
# manager is called with a list of paths.
driving_file = ["./jupyter/source/Ankylosaurus.wav"]
driving_emb = SE_speaker_manager.compute_d_vector_from_clip(driving_file)


Select driving audio file:


## Req. 2-5: 소스 음성과 타겟 음성의 embedding을 추출하는 함수 구현

In [13]:
def get_embeddings(source_files=None, target_files=None, speaker_manager=None):
    """Extract the speaker embeddings of the source and target audio.

    Args:
        source_files: Path, or iterable of paths, of the source (driving) audio.
        target_files: Path, or iterable of paths, of the target speaker's
            reference audio.
        speaker_manager: Object providing
            ``compute_d_vector_from_clip(list_of_paths)``. Defaults to the
            notebook-level ``SE_speaker_manager``.

    Returns:
        A ``(source_emb, target_emb)`` tuple holding whatever
        ``compute_d_vector_from_clip`` returns for each group of files.

    Raises:
        ValueError: If ``source_files`` or ``target_files`` is missing or empty.
    """
    def _as_path_list(paths):
        # A single path is wrapped so the manager always receives a list.
        if isinstance(paths, (str, os.PathLike)):
            return [paths]
        return list(paths)

    if not source_files or not target_files:
        raise ValueError("both source_files and target_files must be provided")

    # Resolve the notebook-level manager lazily so a caller-supplied one wins.
    manager = SE_speaker_manager if speaker_manager is None else speaker_manager
    source_emb = manager.compute_d_vector_from_clip(_as_path_list(source_files))
    target_emb = manager.compute_d_vector_from_clip(_as_path_list(target_files))
    return source_emb, target_emb

In [21]:
# Convert the driving utterance into the target speaker's voice and play the
# target reference, the source audio and the converted result.
print(driving_file[0])
driving_spec = compute_spec(driving_file[0])
# Read the shape straight from the tensor: moving it to CUDA just to print the
# shape would crash on machines without a GPU.
print(driving_spec.shape)
y_lengths = torch.tensor([driving_spec.size(-1)])

print(y_lengths)

# Single code path for both devices: every input goes to the device the model
# was placed on (`.to("cpu")` is a no-op for tensors that are already there).
vc_device = torch.device("cuda" if USE_CUDA else "cpu")
# No gradients are needed; the output is only converted to a NumPy waveform.
with torch.no_grad():
    ref_wav_voc, _, _ = model.voice_conversion(
        driving_spec.to(vc_device),
        y_lengths.to(vc_device),
        driving_emb.to(vc_device),
        target_emb.to(vc_device),
    )
ref_wav_voc = ref_wav_voc.squeeze().detach().cpu().numpy()


print("Target Speaker reference Audio")
IPython.display.display(Audio(target_files[0], rate=ap.sample_rate))

print("Source speaker reference Audio")
IPython.display.display(Audio(driving_file[0], rate=ap.sample_rate))

print("Play the converted audio:")
IPython.display.display(Audio(ref_wav_voc, rate=SAMPLING_RATE))

./jupyter/source/Ankylosaurus.wav
./jupyter/source/Ankylosaurus.wav
torch.Size([1, 128, 1459])
tensor([1459])


RuntimeError: Given groups=1, weight of size [192, 513, 1], expected input[1, 128, 1459] to have 513 channels, but got 128 channels instead