<a href="https://colab.research.google.com/github/sunwoo02/Pronunciation-Enhancement-Trial/blob/main/Pronouncement_enhance.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 0) Prep For Runtime

In [36]:
# Runtime sanity check: report the Python version, CUDA visibility, and the PyTorch build.
import torch, platform, sys, subprocess, os
print("Python:", sys.version)
print("CUDA available:", torch.cuda.is_available())
print("Torch:", torch.__version__)
# List the GPUs through the shell; `|| true` forces a zero exit status even if nvidia-smi fails.
!nvidia-smi -L || true

Python: 3.12.11 (main, Jun  4 2025, 08:56:18) [GCC 11.4.0]
CUDA available: True
Torch: 2.8.0+cu126
GPU 0: Tesla T4 (UUID: GPU-9b80018d-81e0-3430-c433-db32b58e8828)


# 1) Package Installation

In [37]:
# Upgrade pip to the latest release
!pip -q install --upgrade pip

# Audio / utility libraries
!pip -q install soundfile torchaudio numpy scipy unidecode pydub

# Whisper (fast inference engine)
!pip -q install faster-whisper

# ✅ Coqui TTS (includes XTTS v2) - Python 3.12 compatible
!pip -q install coqui-tts

# 2) Input File Upload

In [38]:
# Open Colab's browser upload widget; whatever files.upload() returns is kept in `uploaded`.
from google.colab import files
print("Upload your input speech as 'input.wav' and your reference voice as 'my_voice_ref.wav'")
uploaded = files.upload()

Upload your input speech as 'input.wav' and your reference voice as 'my_voice_ref.wav'


Saving Bad_pronunciation.wav to Bad_pronunciation (2).wav


# 3) Speech to Text

In [39]:
import io

import torch
from faster_whisper import WhisperModel

# Model selection
model_name = "medium"  # "small" / "medium" / "large-v3"
device = "cuda" if torch.cuda.is_available() else "cpu"
compute_type = "float16" if torch.cuda.is_available() else "int8"

asr = WhisperModel(model_name, device=device, compute_type=compute_type)

# Transcribe the bytes returned by the step-2 upload instead of a hard-coded
# path. Colab stores a re-uploaded file under a new name (for example
# "Bad_pronunciation (2).wav"), so a fixed path keeps reading a stale copy and
# fails outright for any other filename.
if "input.wav" in uploaded:
    # Honour the filename requested by the step-2 prompt when it is present.
    input_name = "input.wav"
elif len(uploaded) == 1:
    input_name = next(iter(uploaded))
else:
    raise ValueError(
        "Could not tell which upload is the input speech: upload exactly one "
        f"file in step 2 or name it 'input.wav' (got {sorted(uploaded)})."
    )
print("Transcribing:", input_name)

segments, info = asr.transcribe(
    io.BytesIO(uploaded[input_name]),  # file-like object holding the uploaded audio
    language="en",        # state the language explicitly for English-only audio
    vad_filter=True,      # keep only the voiced regions
    beam_size=5
)

# `segments` is consumed lazily here; join the per-segment text into one string.
raw_text = " ".join(seg.text.strip() for seg in segments).strip()
print("ASR result:\n", raw_text)

ASR result:
 The annual CES trade show in Las Vegas will undergo significant changes next year regarding the participation of Korean companies, which have long been among the most prominent exhibitors at the world's largest tech fair. Some companies have chosen to skip the event to better focus on their core businesses, while others plan to increase the size of their exhibits.


# 4) Text Adjustment

In [40]:
def tidy_text(t: str) -> str:
    """Collapse whitespace and make sure non-empty text ends with punctuation.

    Args:
        t: Text to normalize.

    Returns:
        The text with every run of whitespace replaced by a single space and
        leading/trailing whitespace removed. If the result is non-empty and
        does not already end in '.', '?' or '!', a '.' is appended.
    """
    normalized = " ".join(t.split())  # whitespace cleanup
    if not normalized:
        return normalized
    if normalized.endswith((".", "?", "!")):
        return normalized
    return normalized + "."

# Normalize the raw transcript and print the cleaned text.
clean_text = tidy_text(raw_text)
print("Clean text:\n", clean_text)

Clean text:
 The annual CES trade show in Las Vegas will undergo significant changes next year regarding the participation of Korean companies, which have long been among the most prominent exhibitors at the world's largest tech fair. Some companies have chosen to skip the event to better focus on their core businesses, while others plan to increase the size of their exhibits.


# 5) Upload Voice Reference

In [41]:
# my_voice_ref.wav (내 목소리 참조 음성) 업로드
from google.colab import files
uploaded_ref = files.upload()  # 여기서 my_voice_ref.wav 올리기

Saving myvoice_refer.wav to myvoice_refer (3).wav


# 6) Text to Enhanced Speech

In [42]:
from TTS.api import TTS
import re
import os
import soundfile as sf
import numpy as np
import torch

# Load the model (weights are downloaded only the first time) and move it to
# the GPU when one is available; without the explicit .to(...) the model is
# not placed on the CUDA device and synthesis runs far slower.
tts_device = "cuda" if torch.cuda.is_available() else "cpu"
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(tts_device)

# Simple sentence splitter: break after '.', '?' or '!' when whitespace follows
def split_sentences(t: str):
    """Split text into sentence-sized chunks suitable for TTS.

    A boundary is a run of whitespace that directly follows '.', '?' or '!'
    (optionally followed by one closing quote or bracket). Requiring the
    whitespace keeps decimals ("3.14"), ellipses ("...") and stacked marks
    ("?!") attached to their sentence instead of turning each punctuation
    character into its own fragment.

    Abbreviations followed by a space (e.g. "U.S. companies") are still
    treated as a boundary; this is a heuristic, not a full tokenizer.

    Args:
        t: Text to split.

    Returns:
        A list of non-empty, stripped chunks in original order. A chunk
        shorter than 3 characters is appended to the previous chunk
        (separated by a space) so the synthesizer never gets a near-empty input.
    """
    # Two alternatives because Python look-behinds must be fixed-width.
    boundary = r'(?<=[.?!])\s+|(?<=[.?!]["\')\]])\s+'
    pieces = [p.strip() for p in re.split(boundary, t.strip())]

    merged = []
    for piece in pieces:
        if not piece:
            continue
        # Fold very short fragments into the preceding chunk.
        if merged and len(piece) < 3:
            merged[-1] += " " + piece
        else:
            merged.append(piece)
    return merged

sentences = split_sentences(clean_text)
print("Sentence chunks:", sentences)
if not sentences:
    # Without this guard the code below fails later with an opaque IndexError.
    raise ValueError("Nothing to synthesize: the cleaned transcript is empty.")

# Write the reference voice to disk from the bytes returned by the step-5
# upload instead of relying on a hard-coded filename. Colab stores a re-uploaded
# file under a new name (for example "myvoice_refer (3).wav"), so a fixed path
# keeps reading a stale copy and fails outright for any other filename.
if not uploaded_ref:
    raise ValueError("No reference voice was uploaded; re-run step 5.")
speaker_refs = []
for ref_idx, (ref_name, ref_bytes) in enumerate(uploaded_ref.items()):
    # Keep the original extension so the audio loader can recognise the format.
    ref_path = f"speaker_ref_{ref_idx:02d}{os.path.splitext(ref_name)[1]}"
    with open(ref_path, "wb") as ref_file:
        ref_file.write(ref_bytes)
    speaker_refs.append(ref_path)

# Synthesize each sentence into its own file, then stitch them together
tmp_files = []
for i, s in enumerate(sentences):
    out_path = f"chunk_{i:02d}.wav"
    tts.tts_to_file(
        text=s,
        speaker_wav=speaker_refs,  # reference voice file(s)
        language="en",
        file_path=out_path
    )
    tmp_files.append(out_path)

# Read the chunk WAVs back and check they share one sampling rate
wavs = []
sr = None
for f in tmp_files:
    audio, fs = sf.read(f)
    if sr is None:
        sr = fs
    elif fs != sr:
        raise ValueError("Sampling rate mismatch among chunks.")
    wavs.append(audio)

# Insert a short silence (0.2 s) between sentences so the result sounds natural.
# The gap copies any trailing (channel) dimensions of the audio so that
# concatenation also works if the chunks are not mono.
gap = np.zeros((int(sr * 0.2),) + wavs[0].shape[1:], dtype=wavs[0].dtype)
pieces = []
for i, w in enumerate(wavs):
    if i:
        pieces.append(gap)
    pieces.append(w)
full = np.concatenate(pieces, axis=0)

sf.write("corrected_in_my_voice.wav", full, sr)
print("Saved:", os.path.abspath("corrected_in_my_voice.wav"))

Sentence chunks: ["The annual CES trade show in Las Vegas will undergo significant changes next year regarding the participation of Korean companies, which have long been among the most prominent exhibitors at the world's largest tech fair.", 'Some companies have chosen to skip the event to better focus on their core businesses, while others plan to increase the size of their exhibits.']


  s = torchaudio.io.StreamReader(src, format, None, buffer_size)


Saved: /content/corrected_in_my_voice.wav


# 7) Result

In [43]:
# Build an IPython Audio object for the synthesized file; as the cell's last expression it is displayed as the output.
from IPython.display import Audio
Audio("corrected_in_my_voice.wav")

In [44]:
# Ask Colab to send the synthesized file to the browser as a download.
from google.colab import files
files.download("corrected_in_my_voice.wav")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>