- 한국어 음성 —> STT —> translate_ko_to_en() —> tts_english() —> 번역된 영어음성

In [15]:
# 필요한 패키지 설치
# !pip install transformers accelerate soundfile librosa
# 1) 의존성 설치
!pip install gTTS
!pip install playsound==1.2.2

# 2) TTS 변환 및 저장
from gtts import gTTS
from IPython.display import Audio

import warnings
warnings.filterwarnings("ignore")

import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
from transformers import MarianMTModel, MarianTokenizer
import soundfile as sf
import librosa
import numpy as np
from base64 import b64decode
from google.colab.output import eval_js
from IPython.display import display
import time

# 1) Load the Whisper processor and model once for the whole notebook.
processor = WhisperProcessor.from_pretrained("openai/whisper-small")
whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-small")
# Backward-compatible alias. A later cell in this notebook rebinds the global
# name `model` to a different network, so transcribe_file() deliberately uses
# the dedicated name `whisper_model` instead.
model = whisper_model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
whisper_model.to(device)

# 2) File-level speech recognition
def transcribe_file(path):
    """Transcribe one audio file with Whisper and return the decoded text.

    The audio is converted to mono at the feature extractor's sampling rate,
    padded or truncated to Whisper's fixed input window by the feature
    extractor itself, and decoded with ``generate``.

    Args:
        path: Path of an audio file readable by soundfile or librosa.

    Returns:
        The transcription of the single item in the batch.
    """
    target_sr = processor.feature_extractor.sampling_rate
    try:
        audio, sr = sf.read(path)
    except Exception:
        # Fall back to librosa for files soundfile cannot decode; librosa
        # returns mono audio already resampled to `target_sr`.
        audio, sr = librosa.load(path, sr=target_sr)

    if audio.ndim > 1:
        # soundfile returns (frames, channels); average the channels to mono.
        audio = audio.mean(axis=1)
    if sr != target_sr:
        # The feature extractor rejects audio at any other sampling rate.
        audio = librosa.resample(audio, orig_sr=sr, target_sr=target_sr)
        sr = target_sr

    # Let the feature extractor pad the *waveform* to the full window. Padding
    # the log-mel features with zeros afterwards does not represent silence.
    inputs = processor.feature_extractor(
        audio,
        sampling_rate=sr,
        return_tensors="pt",
        padding="max_length",
        return_attention_mask=True,
    )
    model_device = whisper_model.device
    input_features = inputs.input_features.to(model_device)
    # Frame-level mask of shape (batch, frames) produced by the extractor.
    attention_mask = inputs.attention_mask.to(model_device)

    with torch.no_grad():
        predicted_ids = whisper_model.generate(
            input_features=input_features,
            attention_mask=attention_mask,
        )
    return processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]

# 3) Request microphone permission
# The JavaScript opens an audio stream and immediately stops every track, so
# the only lasting effect is the browser's permission prompt. Failures are
# logged to the browser console.
print("마이크 권한 요청 중…")
eval_js("""
  navigator.mediaDevices.getUserMedia({ audio: true })
    .then(stream => stream.getTracks().forEach(t => t.stop()))
    .catch(e => console.error(e));
""")

# 4) Record-and-recognise helper (with status display)
def record_and_transcribe_ui(filename="recorded_audio.webm", duration=3):
    """Record from the browser microphone, save the audio and transcribe it.

    Progress is shown through a single updatable display handle.

    Args:
        filename: Path the recorded audio bytes are written to.
        duration: Recording length in seconds; must be positive.

    Returns:
        The text returned by ``transcribe_file`` for the recording.

    Raises:
        ValueError: If ``duration`` is not positive.
        Exception: Any error from recording, decoding or transcription is
            shown in the status display and then re-raised.
    """
    if duration <= 0:
        raise ValueError("duration must be a positive number of seconds")

    status = display("준비 중", display_id=True)
    status.update("🔴 녹음 중")
    # The stop/load handlers are attached before the events can fire, and the
    # microphone tracks are released in `finally` so the device does not stay
    # open after the recording ends or fails.
    js = f"""
    async function recordAudio() {{
      const stream = await navigator.mediaDevices.getUserMedia({{ audio: true }});
      try {{
        const recorder = new MediaRecorder(stream);
        const chunks = [];
        recorder.ondataavailable = e => chunks.push(e.data);
        const stopped = new Promise(r => recorder.onstop = r);
        recorder.start();
        await new Promise(r => setTimeout(r, {duration * 1000}));
        recorder.stop();
        await stopped;
        const blob = new Blob(chunks);
        const reader = new FileReader();
        const encoded = new Promise(res => reader.onloadend = () => res(reader.result.split(',')[1]));
        reader.readAsDataURL(blob);
        return await encoded;
      }} finally {{
        stream.getTracks().forEach(t => t.stop());
      }}
    }}
    recordAudio();
    """
    try:
        b64data = eval_js(js)
        audio_bytes = b64decode(b64data)
        with open(filename, "wb") as f:
            f.write(audio_bytes)

        # Show the "recognising" state and start immediately; no artificial
        # delay is inserted before the actual transcription.
        status.update("⌛ 인식 중...")

        result = transcribe_file(filename)
        status.update(f"✅ 인식 결과: {result}")
        return result

    except Exception as e:
        status.update(f"⚠️ 오류 발생: {e}")
        raise


마이크 권한 요청 중…


In [25]:

import IPython.display as ipd

# Korean -> English translation checkpoint
model_name = 'Helsinki-NLP/opus-mt-ko-en'

# Load the tokenizer and the model. The tokenizer gets its own name because
# other cells in this notebook rebind the global `tokenizer`; the model is
# named `model_en` because `model` already refers to the Whisper model.
tokenizer_en = MarianTokenizer.from_pretrained(model_name)
tokenizer = tokenizer_en  # backward-compatible alias
model_en = MarianMTModel.from_pretrained(model_name)

def translate_ko_to_en(text):
    """Translate Korean text to English with the opus-mt-ko-en model.

    Args:
        text: Korean source text.

    Returns:
        The decoded English translation of the first generated sequence.
    """
    # Tokenise; truncate over-long input to the tokenizer's maximum length.
    inputs = tokenizer_en(text, return_tensors="pt", padding=True, truncation=True)
    inputs = inputs.to(model_en.device)
    # Generate the target-language token ids.
    translated = model_en.generate(**inputs)
    # Decode the generated ids back into English text.
    return tokenizer_en.decode(translated[0], skip_special_tokens=True)

def tts_english(text, filename="output_en.mp3"):
    """Synthesise English speech for `text` with gTTS and save it to a file.

    Args:
        text: English text to speak.
        filename: Destination path of the saved audio.

    Returns:
        The `filename` that was written.
    """
    gTTS(text=text, lang='en').save(filename)
    return filename

# --- Test sentence: record 3 seconds of speech and transcribe it ---
ko_text=record_and_transcribe_ui(duration=3)
print(ko_text)

# 1) Translation
en_text = translate_ko_to_en(ko_text)
print(f"한국어 입력: {ko_text}")
print(f"영어 번역: {en_text}")

# 2) English speech synthesis and playback
# The Audio object is the cell's last expression, so the notebook renders it.
mp3_file = tts_english(en_text)
ipd.Audio(mp3_file)

'✅ 인식 결과:  the'

 the
한국어 입력:  the
영어 번역: 主 は 言 わ れ た ,


한국어 음성-> STT -> mbart-large-50-many-to-many-mmt -> translate_ko_to_ja_mbart -> 번역된 일본어 음성

In [6]:
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast
import IPython.display as ipd

model_name = "facebook/mbart-large-50-many-to-many-mmt"

# The tokenizer gets its own name because other cells in this notebook rebind
# the global `tokenizer` to tokenizers of different models.
tokenizer_ja = MBart50TokenizerFast.from_pretrained(model_name)
tokenizer = tokenizer_ja  # backward-compatible alias
model_jap = MBartForConditionalGeneration.from_pretrained(model_name)

def translate_ko_to_ja_mbart(text):
    """Translate Korean text to Japanese with mBART-50 many-to-many.

    Args:
        text: Korean source text.

    Returns:
        The decoded Japanese translation of the first generated sequence.
    """
    # Tell the tokenizer which source-language code to use.
    tokenizer_ja.src_lang = "ko_KR"
    encoded = tokenizer_ja(text, return_tensors="pt")
    # Forcing the first generated token to the Japanese language code selects
    # the target language.
    generated_tokens = model_jap.generate(
        **encoded,
        forced_bos_token_id=tokenizer_ja.lang_code_to_id["ja_XX"],
    )
    return tokenizer_ja.decode(generated_tokens[0], skip_special_tokens=True)

# Record Korean speech, translate it to Japanese, then synthesise and play it.
ko_text=record_and_transcribe_ui(duration=3)
print(ko_text)
ja_text = translate_ko_to_ja_mbart(ko_text)
print("한국어:", ko_text)
print("일본어:", ja_text)

tts = gTTS(text=ja_text, lang='ja')
tts.save("japanese.mp3")
# `autoplay` is a boolean flag; pass True rather than the string "true".
ipd.Audio("japanese.mp3", autoplay=True)

'✅ 인식 결과:  당신을 좋아합니다.'

 당신을 좋아합니다.
한국어:  당신을 좋아합니다.
일본어: あなたを愛しています。


한국어 음성 -> STT -> Helsinki-NLP/opus-mt-ko-fr -> translate_ko_to_fr -> 번역된 프랑스어 음성

In [12]:
from transformers import MarianMTModel, MarianTokenizer
import IPython.display as ipd

model_name = 'Helsinki-NLP/opus-mt-ko-fr'
# The tokenizer gets its own name because other cells in this notebook rebind
# the global `tokenizer` to tokenizers of different models.
tokenizer_fr = MarianTokenizer.from_pretrained(model_name)
tokenizer = tokenizer_fr  # backward-compatible alias
model_fr = MarianMTModel.from_pretrained(model_name)

def translate_ko_to_fr(text):
    """Translate Korean text to French with the opus-mt-ko-fr model.

    Args:
        text: Korean source text.

    Returns:
        The decoded French translation of the first generated sequence.
    """
    # Truncate over-long input to the tokenizer's maximum length.
    inputs = tokenizer_fr(text, return_tensors="pt", padding=True, truncation=True)
    inputs = inputs.to(model_fr.device)
    outputs = model_fr.generate(**inputs)
    return tokenizer_fr.decode(outputs[0], skip_special_tokens=True)

# Record Korean speech, translate it to French, then synthesise and play it.
ko_text=record_and_transcribe_ui(duration=3)
print(ko_text)
fr_text = translate_ko_to_fr(ko_text)
print(f"한국어: {ko_text}")
print(f"프랑스어: {fr_text}")

tts = gTTS(text=fr_text, lang='fr')
tts.save("france.mp3")
# `autoplay` is a boolean flag; pass True rather than the string "true".
ipd.Audio("france.mp3", autoplay=True)

'✅ 인식 결과:  야, 니는 사장 나오라고 그래'

 야, 니는 사장 나오라고 그래
한국어:  야, 니는 사장 나오라고 그래
프랑스어: Dis-lui de partir.


이번에는 한국어 음성을 베트남어 텍스트로 번역한 후 베트남어 음성으로 출력

In [16]:
import IPython.display as ipd
from transformers import MBartForConditionalGeneration, MBart50TokenizerFast

model_name = "facebook/mbart-large-50-many-to-many-mmt"
# The tokenizer gets its own name because other cells in this notebook rebind
# the global `tokenizer` to tokenizers of different models.
tokenizer_vi = MBart50TokenizerFast.from_pretrained(model_name)
tokenizer = tokenizer_vi  # backward-compatible alias
model_vi = MBartForConditionalGeneration.from_pretrained(model_name)

def translate_ko_to_vi_mbart(text):
    """Translate Korean text to Vietnamese with mBART-50 many-to-many.

    Uses beam search (5 beams, early stopping) with outputs capped at 100
    tokens.

    Args:
        text: Korean source text.

    Returns:
        The decoded Vietnamese translation of the first generated sequence.
    """
    # Tell the tokenizer which source-language code to use.
    tokenizer_vi.src_lang = "ko_KR"
    encoded = tokenizer_vi(text, return_tensors="pt", padding=True)
    generated_tokens = model_vi.generate(
        **encoded,
        # Forcing the first generated token selects the target language.
        forced_bos_token_id=tokenizer_vi.lang_code_to_id["vi_VN"],
        max_length=100,
        num_beams=5,
        early_stopping=True
    )
    return tokenizer_vi.decode(generated_tokens[0], skip_special_tokens=True)

# Record Korean speech, translate it to Vietnamese, then synthesise and play it.
ko_text = record_and_transcribe_ui(duration=3)
vi_text = translate_ko_to_vi_mbart(ko_text)
print("한국어:", ko_text)
print("베트남어:", vi_text)

tts = gTTS(text=vi_text, lang='vi')
tts.save("vietnam.mp3")
# `autoplay` is a boolean flag; pass True rather than the string "true".
ipd.Audio("vietnam.mp3", autoplay=True)

'✅ 인식 결과:  여기는 베트남입니다.'

한국어:  여기는 베트남입니다.
베트남어: Đây là Vietnam.


FastSpeech2 실습

In [17]:
%cd /content
!wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
!tar -xjf LJSpeech-1.1.tar.bz2

/content
--2025-07-24 12:37:57--  https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2
Resolving data.keithito.com (data.keithito.com)... 169.150.249.163, 2400:52e0:1a01::953:1
Connecting to data.keithito.com (data.keithito.com)|169.150.249.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2748572632 (2.6G) [text/plain]
Saving to: ‘LJSpeech-1.1.tar.bz2’


2025-07-24 12:39:09 (36.7 MB/s) - ‘LJSpeech-1.1.tar.bz2’ saved [2748572632/2748572632]



In [18]:
# 1. PyTorch 최신 설치 및 런타임 재시작
# !pip uninstall -y torch torchvision torchaudio
# import os
# os.kill(os.getpid(), 9)

# 2. 필요 라이브러리 설치
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q torch torchvision torchaudio
!pip install -q librosa numpy matplotlib unidecode praat-parselmouth

Looking in indexes: https://download.pytorch.org/whl/cu118
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading https://download.pytorch.org/whl/cu118/torch-2.7.1%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting sympy>=1.13.3 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_nvrtc_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (23.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.2/23.2 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu11==11.8.89 (from torch)
  Downloading https://download.pytorch.org/whl/cu118/nvidia_cuda_runtime_cu11-11.8.89-py3-none-manylinux1_x86_64.whl (875 kB)
[2K     [90m━━━━━━━━━━━━━━━━

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.8/235.8 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m108.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# 필수 설치
#!pip install -q librosa matplotlib unidecode praat-parselmouth

import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import unidecode

# Hyperparameters
SAMPLE_RATE = 22050  # sampling rate audio is loaded at
N_MELS = 80          # number of mel bands in the spectrogram
N_FFT = 1024         # FFT size for the mel spectrogram
HOP_LENGTH = 256     # hop (in samples) between mel / pitch / energy frames
EMBED_DIM = 256      # token embedding and encoder hidden size
N_HEADS = 2          # attention heads per Transformer block
FF_DIM = 512         # hidden size of the feed-forward sub-layer
N_LAYERS = 4         # number of Transformer blocks in the encoder
BATCH_SIZE = 8
EPOCHS = 3

# Character-level symbol table: lowercase letters plus space.
symbols = list("abcdefghijklmnopqrstuvwxyz ")
# Ids start at 1; id 0 is used as the padding value / padding_idx.
symbol_to_id = {s: i + 1 for i, s in enumerate(symbols)}

def text_to_seq(text):
    """Convert text into a list of symbol ids.

    The text is lowercased and transliterated to ASCII with unidecode, then
    each character found in `symbol_to_id` is mapped to its id. Characters
    that are not in the table (digits, punctuation, ...) are dropped.
    """
    normalized = unidecode.unidecode(text.lower())
    known_chars = (ch for ch in normalized if ch in symbol_to_id)
    return [symbol_to_id[ch] for ch in known_chars]

def dummy_durations(seq_len):
    """Return placeholder durations: every token lasts the same 5 frames.

    Args:
        seq_len: Number of tokens in the sequence.

    Returns:
        A 1-D long tensor of length `seq_len` filled with 5.
    """
    frames_per_token = 5
    return torch.ones(seq_len, dtype=torch.long) * frames_per_token

def extract_pitch_energy(wav_path):
    """Extract frame-level pitch and energy from a wav file.

    Args:
        wav_path: Path of the audio file, loaded at SAMPLE_RATE.

    Returns:
        (pitch, energy): two 1-D arrays with one value per HOP_LENGTH frame.
        `pitch` holds, for each frame, the piptrack pitch of the bin with the
        largest magnitude; `energy` is the frame RMS.
    """
    y, sr = librosa.load(wav_path, sr=SAMPLE_RATE)
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr, hop_length=HOP_LENGTH)
    # Select the strongest bin of every frame in one vectorised gather
    # instead of looping over frames in Python.
    strongest_bin = magnitudes.argmax(axis=0)
    pitch = pitches[strongest_bin, np.arange(pitches.shape[1])]
    energy = librosa.feature.rms(y=y, hop_length=HOP_LENGTH)[0]
    return pitch, energy

class LJSpeechDataset(Dataset):
    """Dataset over an LJSpeech-style folder.

    Each metadata line is ``<wav id>|<text>[|...]``; the audio is expected at
    ``<data_dir>/wavs/<wav id>.wav``.
    """

    def __init__(self, metadata_path, data_dir):
        """Read the metadata file and collect (wav_path, text) pairs.

        Blank lines and lines without both an id and a text field are skipped
        instead of raising IndexError.
        """
        self.data_dir = data_dir
        self.samples = []
        with open(metadata_path, 'r', encoding='utf-8') as f:
            for line in f:
                parts = line.strip().split('|')
                if len(parts) < 2 or not parts[0]:
                    continue
                wav_path = os.path.join(data_dir, 'wavs', parts[0] + '.wav')
                self.samples.append((wav_path, parts[1]))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (seq, mel, durations, pitch, energy, text) for one sample.

        seq is the token-id tensor, mel the dB-scaled mel spectrogram as
        (frames, N_MELS), durations the placeholder per-token durations, and
        pitch/energy the frame-level features from extract_pitch_energy.
        """
        wav_path, text = self.samples[idx]
        seq = torch.tensor(text_to_seq(text), dtype=torch.long)
        y, sr = librosa.load(wav_path, sr=SAMPLE_RATE)
        mel = librosa.feature.melspectrogram(y=y, sr=sr, n_fft=N_FFT, hop_length=HOP_LENGTH, n_mels=N_MELS)
        # dB relative to this utterance's maximum; transposed to time-major.
        mel_db = librosa.power_to_db(mel, ref=np.max).T
        mel = torch.tensor(mel_db, dtype=torch.float32)
        durations = dummy_durations(len(seq))
        pitch_np, energy_np = extract_pitch_energy(wav_path)
        pitch = torch.tensor(pitch_np, dtype=torch.float32)
        energy = torch.tensor(energy_np, dtype=torch.float32)
        return seq, mel, durations, pitch, energy, text

def collate_fn(batch):
    """Merge dataset samples into one zero-padded batch.

    Every tensor field (token ids, mel frames, durations, pitch, energy) is
    right-padded with zeros to the longest sample in the batch and stacked
    batch-first. The raw texts are returned unchanged as a tuple.
    """
    token_ids, mel_frames, token_durations, frame_pitch, frame_energy, raw_texts = zip(*batch)

    def _pad(column):
        return nn.utils.rnn.pad_sequence(column, batch_first=True, padding_value=0)

    padded = tuple(
        _pad(column)
        for column in (token_ids, mel_frames, token_durations, frame_pitch, frame_energy)
    )
    return (*padded, raw_texts)

class TransformerBlock(nn.Module):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer's output goes through dropout, is added to the sub-layer
    input, and the sum is layer-normalised.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout=0.1):
        super().__init__()
        # Self-attention lets every position exchange information with the others.
        self.attn = nn.MultiheadAttention(embed_dim, num_heads, dropout=dropout)
        self.norm1 = nn.LayerNorm(embed_dim)
        # Position-wise feed-forward network.
        feed_forward = [nn.Linear(embed_dim, ff_dim), nn.ReLU(), nn.Linear(ff_dim, embed_dim)]
        self.ff = nn.Sequential(*feed_forward)
        self.norm2 = nn.LayerNorm(embed_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply the block to `x`, used as query, key and value."""
        attended = self.attn(x, x, x)[0]
        hidden = self.norm1(x + self.dropout(attended))
        projected = self.ff(hidden)
        return self.norm2(hidden + self.dropout(projected))

def length_regulator(x, durations):
    """Expand token-level vectors along time according to per-token durations.

    Token i of sample b is repeated ``durations[b, i]`` times, turning a
    token-length sequence into a frame-length one. Samples are right-padded
    with zeros to the longest expanded sequence in the batch.

    Args:
        x: Tensor of shape (batch, tokens, dim).
        durations: Integer tensor of shape (batch, tokens) with repeat counts.
            If its token axis differs from x's, the shorter one is used.

    Returns:
        Tensor of shape (batch, max_frames, dim).
    """
    # Loop-invariant: number of tokens both inputs can provide.
    n_tokens = min(x.size(1), durations.size(1))
    repeats = durations[:, :n_tokens].to(x.device)
    expanded = [
        torch.repeat_interleave(x[b, :n_tokens], repeats[b], dim=0)
        for b in range(x.size(0))
    ]
    # pad_sequence zero-pads every sample to the longest one and stacks them.
    return torch.nn.utils.rnn.pad_sequence(expanded, batch_first=True)

class Predictor(nn.Module):
    """Two-layer MLP that predicts one scalar per position.

    Used for the duration, pitch and energy predictors.
    """

    def __init__(self, embed_dim):
        super().__init__()
        stages = (nn.Linear(embed_dim, embed_dim), nn.ReLU(), nn.Linear(embed_dim, 1))
        self.layer = nn.Sequential(*stages)

    def forward(self, x):
        """Map (..., embed_dim) to (...) by dropping the size-1 output axis."""
        scalars = self.layer(x)
        return scalars.squeeze(-1)

class FastSpeech2(nn.Module):
    """Simplified FastSpeech2-style text-to-mel model.

    Token embeddings plus learned position embeddings go through a stack of
    Transformer blocks. Duration, pitch and energy are predicted per token,
    the hidden states are optionally expanded by the given durations, and a
    linear layer projects them to mel bins.

    Args:
        vocab_size: Number of token ids, including padding id 0.
        embed_dim: Embedding and hidden size.
        n_heads: Attention heads per Transformer block.
        ff_dim: Feed-forward hidden size per Transformer block.
        n_layers: Number of Transformer blocks.
        mel_dim: Number of mel bins produced per frame.
        max_len: Size of the position-embedding table, i.e. the longest
            token sequence the model accepts.
    """

    def __init__(self, vocab_size, embed_dim, n_heads, ff_dim, n_layers, mel_dim, max_len=1000):
        super().__init__()
        # Token id -> vector; id 0 is the padding index.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        # Learnable position embeddings, one row per position.
        self.pos_embedding = nn.Parameter(torch.randn(max_len, embed_dim))
        self.encoder_layers = nn.ModuleList([TransformerBlock(embed_dim, n_heads, ff_dim) for _ in range(n_layers)])
        self.duration_predictor = Predictor(embed_dim)
        self.pitch_predictor = Predictor(embed_dim)
        self.energy_predictor = Predictor(embed_dim)
        self.mel_linear = nn.Linear(embed_dim, mel_dim)

    def forward(self, x, durations=None, pitch=None, energy=None):
        """Run the model.

        Args:
            x: Token ids of shape (batch, tokens).
            durations: Optional per-token repeat counts, (batch, tokens). When
                given, hidden states (and pitch/energy) are expanded by them.
            pitch: Optional token-level pitch values, (batch, tokens).
            energy: Optional token-level energy values, (batch, tokens).

        Returns:
            (mel_output, duration_pred, pitch_pred, energy_pred). The three
            predictions are per token; mel_output is per frame when durations
            are given, otherwise per token.

        Raises:
            ValueError: If the sequence is longer than the position table.
        """
        _, seq_len = x.size()
        if seq_len > self.pos_embedding.size(0):
            raise ValueError(
                f"sequence length {seq_len} exceeds the position table size "
                f"{self.pos_embedding.size(0)}"
            )
        x = self.embedding(x) + self.pos_embedding[:seq_len]
        # nn.MultiheadAttention defaults to (seq, batch, dim) layout.
        x = x.transpose(0, 1)
        for layer in self.encoder_layers:
            x = layer(x)
        x = x.transpose(0, 1)
        duration_pred = self.duration_predictor(x)
        pitch_pred = self.pitch_predictor(x)
        energy_pred = self.energy_predictor(x)
        if durations is not None:
            x_expanded = length_regulator(x, durations)
        else:
            x_expanded = x
        # Add pitch, then energy, broadcast over the hidden dimension. Without
        # durations the features are added at token resolution instead of
        # failing on a missing duration tensor.
        for feature in (pitch, energy):
            if feature is None:
                continue
            feature = feature.unsqueeze(-1)
            if durations is not None:
                feature = length_regulator(feature, durations)
            x_expanded = x_expanded + feature.to(x_expanded.device)
        mel_output = self.mel_linear(x_expanded)
        return mel_output, duration_pred, pitch_pred, energy_pred

# Downsample pitch/energy labels (mel-frame resolution -> token resolution)
def downsample_label(label, durations, max_len):
    """Average a frame-level label over each token's span of frames.

    Token k covers the next ``durations[k]`` frames of `label`. Its value is
    the mean of those frames, or 0.0 when the span is empty or starts past
    the end of `label`. The result is zero-padded or cut to `max_len`.

    Returns:
        A float32 tensor of length `max_len`.
    """
    pooled = []
    start = 0
    for span in (d.item() for d in durations):
        # Slicing clamps at the end of `label`, so a span that runs past the
        # last frame yields only the remaining frames.
        frames = label[start:start + span]
        pooled.append(frames.float().mean().item() if len(frames) else 0.0)
        start += span
    pooled.extend([0.0] * (max_len - len(pooled)))
    return torch.tensor(pooled[:max_len], dtype=torch.float32)

# Data paths
DATA_PATH = '/content/LJSpeech-1.1'
METADATA_PATH = os.path.join(DATA_PATH, 'metadata.csv')

dataset = LJSpeechDataset(METADATA_PATH, DATA_PATH)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)

# NOTE: `device` and `model` are rebound here. Earlier in this notebook the
# global name `model` was bound to the Whisper speech-recognition model.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Vocabulary size is len(symbols) + 1 because id 0 is the padding id.
model = FastSpeech2(len(symbols)+1, EMBED_DIM, N_HEADS, FF_DIM, N_LAYERS, N_MELS).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()

# Training loop
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0
    for i, (texts, mels, durations, pitches, energies, _) in enumerate(loader):
        texts, mels = texts.to(device), mels.to(device)
        durations = durations.to(device)

        # Downsample the frame-level pitch/energy labels to token length so
        # they line up with the per-token predictor outputs.
        pitch_ds = []
        energy_ds = []
        max_len = texts.size(1)
        for b in range(texts.size(0)):
            pitch_ds.append(downsample_label(pitches[b], durations[b], max_len))
            energy_ds.append(downsample_label(energies[b], durations[b], max_len))
        pitch_ds = torch.stack(pitch_ds).to(device)
        energy_ds = torch.stack(energy_ds).to(device)

        optimizer.zero_grad()
        mel_pred, dur_pred, pitch_pred, energy_pred = model(texts, durations, pitch_ds, energy_ds)
        # Predicted and target mels can differ in frame count; compare only
        # the overlapping prefix.
        min_len = min(mel_pred.size(1), mels.size(1))
        mel_loss = criterion(mel_pred[:, :min_len, :], mels[:, :min_len, :])
        dur_loss = criterion(dur_pred, durations.float())
        pitch_loss = criterion(pitch_pred, pitch_ds)
        energy_loss = criterion(energy_pred, energy_ds)
        # Unweighted sum of the four MSE terms.
        loss = mel_loss + dur_loss + pitch_loss + energy_loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        if i % 10 == 0:
            print(f"Epoch {epoch+1} Batch {i} Loss {loss.item():.4f}")
    print(f"Epoch {epoch+1} Average Loss: {total_loss / len(loader):.4f}")

# Inference
@torch.no_grad()
def synthesize(model, text, device='cpu'):
    """Generate a mel spectrogram for `text` using predicted durations.

    Args:
        model: A FastSpeech2 instance; it is switched to eval mode.
        text: Input text; characters outside the symbol table are ignored.
        device: Device the token tensor is created on.

    Returns:
        NumPy array of shape (frames, mel_dim).

    Raises:
        ValueError: If `text` contains no character from the symbol table.
    """
    model.eval()
    token_ids = text_to_seq(text)
    if not token_ids:
        # An empty sequence would otherwise fail deep inside the encoder.
        raise ValueError("text contains no characters from the symbol table")
    seq = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).to(device)
    x = model.embedding(seq) + model.pos_embedding[:seq.size(1)]
    # The encoder blocks work on (seq, batch, dim).
    x = x.transpose(0, 1)
    for layer in model.encoder_layers:
        x = layer(x)
    x = x.transpose(0, 1)
    duration_pred = model.duration_predictor(x).cpu().numpy()[0]
    # Round to whole frames and give every token at least one frame.
    durations = [max(1, int(round(d))) for d in duration_pred]
    x_expanded = length_regulator(x, torch.tensor(durations).unsqueeze(0).to(device))
    # No pitch or energy conditioning is applied at inference time.
    mel_output = model.mel_linear(x_expanded)
    return mel_output.squeeze(0).cpu().numpy()

# Griffin-Lim waveform reconstruction
def griffin_lim(mel_spec, n_iter=50, n_fft=1024, hop_length=256, win_length=1024):
    """Reconstruct a waveform from a dB-scaled mel power spectrogram.

    Args:
        mel_spec: Array of shape (frames, N_MELS) in dB.
        n_iter: Number of Griffin-Lim iterations.
        n_fft: FFT size used to build the mel filter bank.
        hop_length: Hop length in samples.
        win_length: Window length in samples.

    Returns:
        The reconstructed 1-D audio signal.
    """
    # dB -> power, transposed to (N_MELS, frames).
    S = librosa.db_to_power(mel_spec.T)
    mel_basis = librosa.filters.mel(sr=SAMPLE_RATE, n_fft=n_fft, n_mels=N_MELS)
    # Approximate inverse of the mel projection; clip so the values stay positive.
    inv_mel_basis = np.linalg.pinv(mel_basis)
    S_linear = np.maximum(1e-10, np.dot(inv_mel_basis, S))
    # S_linear is a power spectrogram, while Griffin-Lim expects magnitudes.
    magnitude = np.sqrt(S_linear)
    y = librosa.griffinlim(magnitude, n_iter=n_iter, hop_length=hop_length, win_length=win_length)
    return y

# Test inference and visualisation
text_input = "Hello world this is a test of fast speech two synthesis."
mel_pred = synthesize(model, text_input, device=device)

# Plot the predicted mel spectrogram; synthesize returns (frames, mels), so it
# is transposed for display.
plt.figure(figsize=(10,4))
librosa.display.specshow(mel_pred.T, sr=SAMPLE_RATE, hop_length=HOP_LENGTH, x_axis='time', y_axis='mel')
plt.colorbar(format='%+2.0f dB')
plt.title('Synthesized Mel Spectrogram')
plt.show()

# Convert the mel spectrogram to audio with Griffin-Lim and play it. The Audio
# object is the cell's last expression, so the notebook renders it.
audio = griffin_lim(mel_pred, n_iter=60)
from IPython.display import Audio
Audio(audio, rate=SAMPLE_RATE)

Epoch 1 Batch 0 Loss 443876.2812
Epoch 1 Batch 10 Loss 459839.8438
Epoch 1 Batch 20 Loss 363909.3125
Epoch 1 Batch 30 Loss 369606.6250
Epoch 1 Batch 40 Loss 282505.8438
Epoch 1 Batch 50 Loss 469572.7500
Epoch 1 Batch 60 Loss 285278.9375
Epoch 1 Batch 70 Loss 261589.6406
Epoch 1 Batch 80 Loss 319483.3125
Epoch 1 Batch 90 Loss 146396.6562
Epoch 1 Batch 100 Loss 329259.8438
Epoch 1 Batch 110 Loss 251126.2969
Epoch 1 Batch 120 Loss 303994.3125
Epoch 1 Batch 130 Loss 200452.8281
Epoch 1 Batch 140 Loss 254427.7031
Epoch 1 Batch 150 Loss 313274.8125
Epoch 1 Batch 160 Loss 233848.0156
Epoch 1 Batch 170 Loss 364251.8438
Epoch 1 Batch 180 Loss 258697.3750
Epoch 1 Batch 190 Loss 224994.0000
Epoch 1 Batch 200 Loss 211120.4062
Epoch 1 Batch 210 Loss 357495.2500
Epoch 1 Batch 220 Loss 391221.7500
Epoch 1 Batch 230 Loss 294065.5312
Epoch 1 Batch 240 Loss 227706.1875
Epoch 1 Batch 250 Loss 281358.2188
Epoch 1 Batch 260 Loss 372028.5000
Epoch 1 Batch 270 Loss 271386.2812
Epoch 1 Batch 280 Loss 317206.4