<a href="https://colab.research.google.com/github/yuraoh12/AI-bigdata/blob/main/%EC%9E%90%EC%97%B0%EC%96%B4%EC%B2%98%EB%A6%AC%20%EC%82%AC%ED%88%AC%EB%A6%AC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt

from model.model_inference_v2 import Tacotron2
from vocoder.model.waveglow import WaveGlow
from vocoder.denoiser_librosa import Denoiser
from korean_text.korean_cleaner_cls import KoreanCleaner

from text import text_to_sequence, sequence_to_text
from utils.util import mode, to_var, to_arr

from demo_proc import _convert_to_pcm16

ModuleNotFoundError: ignored

In [None]:
# Device used as map_location when deserializing the checkpoints below.
device = 'cpu'  # cuda
map_location = torch.device(device)

# Tacotron2: restore the weights stored under the checkpoint's 'model' key
# and put the network into evaluation mode.
ckpt_dict = torch.load('logs/model/acoustic.ckpt', map_location=map_location)
model = Tacotron2()
model.load_state_dict(ckpt_dict['model'])
model = model.eval()

# Vocoder (WaveGlow): same restore procedure, then remove weight norm
# via the vocoder's own helper before switching to evaluation mode.
ckpt_dict = torch.load('logs/model/vocoder.ckpt', map_location=map_location)
vocoder = WaveGlow()
vocoder.load_state_dict(ckpt_dict['model'])
vocoder = vocoder.remove_weightnorm(vocoder)
vocoder.eval()

# Denoiser wrapping the vocoder.
# NOTE(review): the meaning of the 0.1 argument is defined in
# vocoder.denoiser_librosa -- confirm there.
denoiser = Denoiser(vocoder, 0.1)

# Text cleaner applied to the raw input string before synthesis.
korean_cleaner = KoreanCleaner()

In [None]:
# Text preprocessing: run the raw input string through the Korean cleaner
# and show the cleaned result.
raw_text = '헬로월드!'
text = korean_cleaner.clean_text(raw_text)

print(text)

In [None]:
# Convert the cleaned text into a sequence and wrap it as a long tensor
# with a leading dimension of size 1.
symbol_ids = text_to_sequence(text, ['multi_cleaner'])
id_tensor = torch.IntTensor(symbol_ids).unsqueeze(0)
sequence = to_var(id_tensor).long()

print(sequence)

In [None]:
# Inference: run the acoustic model on the input sequence, synthesize a
# waveform with the vocoder, denoise it, and plot the result.
sigma = 0.5          # passed to vocoder.infer
strength = 10        # passed to the denoiser
sample_rate = 22050  # sampling rate used for duration, plotting and playback

with torch.no_grad():
    _, mel_outputs_postnet, linear_outputs, _, alignments = model.inference(sequence)
    wav = vocoder.infer(mel_outputs_postnet, sigma=sigma)

    # Scale so the peak amplitude becomes 32767 (int16 maximum); the 0.01
    # floor keeps the divisor away from zero for (near-)silent output.
    wav *= 32767. / max(0.01, torch.max(torch.abs(wav)))
    wav = wav.squeeze()
    wav = wav.cpu().detach().numpy().astype('float32')

    wav = denoiser(wav, strength=strength)

# Append half a second of silence. np.append flattens its inputs, so the
# result is a 1-D array.
wav = np.append(wav, np.zeros(sample_rate // 2))

# Keyword arguments are required by librosa >= 0.10 and are also accepted
# by older releases.
audio_duration = librosa.get_duration(y=wav, sr=sample_rate)

plt.figure(figsize=(14, 5))
# librosa.display.waveplot was removed in librosa 0.10 in favour of
# waveshow; prefer waveshow and fall back to waveplot on older installs.
plot_waveform = getattr(librosa.display, 'waveshow', None) or librosa.display.waveplot
plot_waveform(wav, sr=sample_rate)

In [None]:
# Save / play back the result.
import IPython.display as ipd
import soundfile as sf

# Clip to the int16 range before casting: samples that ended up outside
# [-32768, 32767] would otherwise overflow in the float -> int16 cast.
int16_range = np.iinfo(np.int16)
wav_file = np.clip(wav, int16_range.min, int16_range.max).astype(np.int16)
# Uncomment to write the audio to disk:
# sf.write('temp.wav', wav_file, sample_rate)

# Last expression of the cell: renders an audio player in the notebook.
ipd.Audio(wav_file, rate=sample_rate)