# F5-TTS Benchmark

Bu notebook F5-TTS engine'ini 5 Türkçe mülakat sorusu ile test eder.

- **Engine:** F5-TTS
- **GPU:** Gerekli (dinamik tespit)
- **Türkçe Desteği:** Evet (multilingual)

**NOT:** Bu notebook Miniconda ile Python 3.11 ortamı oluşturarak çalışır (versiyon uyumu için).

Her soru için hem SORU hem de CEVAP metni seslendirilir.

In [None]:
# Cell 1: Mount Drive & Install Miniconda
# Mounts Google Drive at /content/drive (later cells read and write under
# /content/drive/MyDrive/tts-ms) and makes sure a `conda` executable exists.
from google.colab import drive
drive.mount('/content/drive')
print("Drive mounted successfully")

# Install Miniconda
# Skipped when /usr/local/bin/conda is already present, so re-running the
# cell does not download and run the installer again.
import os
if not os.path.exists('/usr/local/bin/conda'):
    print("\nInstalling Miniconda...")
    # Download the installer to /tmp, run it with /usr/local as the install
    # prefix (the location the existence check above looks at), then delete
    # the installer script.
    !wget -q https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -O /tmp/miniconda.sh
    !bash /tmp/miniconda.sh -b -f -p /usr/local
    !rm /tmp/miniconda.sh
    print("Miniconda installed!")
else:
    print("Miniconda already installed")

# Print the conda version as a quick check that the executable can be run.
!conda --version

In [None]:
# Cell 2: Create Python 3.11 Environment & Install Dependencies
# Builds the conda environment used by the benchmark cell and installs, in
# order: ffmpeg (system package), PyTorch + torchaudio, F5-TTS, edge-tts and
# the resource-monitoring packages. Ends with an import check of each piece.
import os

# Name of the conda environment; the %%bash benchmark cell activates the
# same name.
ENV_NAME = "tts_py311"

# Accept Conda ToS (required for non-interactive environments)
# stderr is discarded and `|| true` keeps the cell going when the `tos`
# subcommand fails or is unavailable.
print("Accepting Conda Terms of Service...")
!conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main 2>/dev/null || true
!conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r 2>/dev/null || true

# Create environment with conda-forge (avoid default channels)
print("\nCreating Python 3.11 environment...")
!conda create -n $ENV_NAME python=3.11 -c conda-forge --override-channels -y -q

# Install ffmpeg (required for audio processing)
# Installed with apt, i.e. system-wide rather than inside the conda env;
# all apt output is discarded.
print("\nInstalling ffmpeg...")
!apt-get install -y ffmpeg > /dev/null 2>&1

# Every pip/python command below is prefixed with `source activate` so that
# it runs inside the environment (presumably because each `!` line starts
# in a fresh shell -- confirm against the IPython documentation).

# Install PyTorch with CUDA
# Wheels come from the PyTorch cu121 index.
print("\nInstalling PyTorch...")
!source activate $ENV_NAME && pip install -q torch torchaudio --index-url https://download.pytorch.org/whl/cu121

# Install F5-TTS
print("\nInstalling F5-TTS...")
!source activate $ENV_NAME && pip install -q f5-tts

# Install edge-tts (high-quality Microsoft Neural TTS for reference audio)
print("\nInstalling edge-tts...")
!source activate $ENV_NAME && pip install -q edge-tts

# Install monitoring tools
# psutil and pynvml are imported by the benchmark script for CPU/RAM and
# GPU statistics.
print("\nInstalling monitoring tools...")
!source activate $ENV_NAME && pip install -q psutil pynvml

# Verify installation
# Prints the interpreter version, the torch version with CUDA availability
# and GPU name, then confirms that f5_tts and edge_tts can be imported.
print("\n" + "="*60)
print("VERIFICATION:")
!source activate $ENV_NAME && python --version
!source activate $ENV_NAME && python -c "import torch; print('PyTorch:', torch.__version__, 'CUDA:', torch.cuda.is_available()); print('GPU:', torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'N/A')"
!source activate $ENV_NAME && python -c "import f5_tts; print('F5-TTS: OK')"
!source activate $ENV_NAME && python -c "import edge_tts; print('edge-tts: OK')"
print("="*60)

In [None]:
# Cell 3: Setup Output Directories
import os
import shutil

# Identifiers of this benchmark run.
ENGINE_NAME = "f5tts"
MODEL_NAME = "F5-TTS"
ENV_NAME = "tts_py311"

# Per-engine output tree on Drive; synthesized clips go into its "audio"
# subdirectory.
BASE_DIR = "/content/drive/MyDrive/tts-ms/output/" + ENGINE_NAME
AUDIO_DIR = BASE_DIR + "/audio"

# Start from an empty output tree: drop whatever a previous run left behind,
# then recreate the audio directory (which also recreates BASE_DIR).
if os.path.exists(BASE_DIR):
    shutil.rmtree(BASE_DIR)
os.makedirs(AUDIO_DIR, exist_ok=True)
print("Output directory: " + BASE_DIR)

# The model cache lives outside the output tree, so the clean-up above
# never touches it.
MODEL_DIR = "/content/drive/MyDrive/tts-ms/cache/f5tts"
os.makedirs(MODEL_DIR, exist_ok=True)
print("Model cache: " + MODEL_DIR)

In [None]:
%%bash
# Cell 4: Generate the Turkish reference clip with edge-tts, convert it to
# WAV, then run the F5-TTS benchmark script inside the conda environment.
# Each preparation step aborts the cell with a non-zero exit status when it
# fails, so the benchmark never starts without a usable reference clip.
source activate tts_py311 || {
    echo "ERROR: could not activate conda environment tts_py311 (run Cell 2 first)" >&2
    exit 1
}

export MPLBACKEND=agg
export LANG=en_US.UTF-8
export LC_ALL=en_US.UTF-8
export PYTHONIOENCODING=utf-8

# Generate high-quality Turkish reference audio with edge-tts
REF_DIR="/content/drive/MyDrive/tts-ms/cache/f5tts/ref"
mkdir -p "$REF_DIR"
# Remove clips from earlier runs so a stale file can never be mistaken for
# a fresh one.
rm -f "$REF_DIR"/*.wav "$REF_DIR"/*.mp3 2>/dev/null

echo "Generating Turkish reference audio with edge-tts (Microsoft Neural TTS)..."
if ! edge-tts --voice "tr-TR-EmelNeural" \
    --text "Merhaba, ben bir Türk ses asistanıyım. Bugün sizinle görüşmekten mutluluk duyuyorum." \
    --write-media "$REF_DIR/turkish_ref.mp3"; then
    echo "ERROR: edge-tts could not generate the reference audio" >&2
    exit 1
fi

# Convert to WAV (24 kHz, mono). Only real ffmpeg errors are printed; the
# banner and progress output stay hidden.
if ! ffmpeg -y -hide_banner -loglevel error -i "$REF_DIR/turkish_ref.mp3" \
        -ar 24000 -ac 1 "$REF_DIR/turkish_ref.wav" \
    || [ ! -s "$REF_DIR/turkish_ref.wav" ]; then
    echo "ERROR: ffmpeg could not convert the reference audio to WAV" >&2
    exit 1
fi
echo "Reference audio ready: $REF_DIR/turkish_ref.wav"

python -u << 'ENDOFPYTHON'
# -*- coding: utf-8 -*-
import os

# Environment variables are exported BEFORE the heavy imports below:
# huggingface_hub resolves HF_HOME when it is first imported (which happens
# while importing f5_tts), so assigning it afterwards has no effect.
MODEL_DIR = "/content/drive/MyDrive/tts-ms/cache/f5tts"
os.environ["MPLBACKEND"] = "agg"
os.environ["HF_HOME"] = MODEL_DIR

import json
import time
import psutil
import torch
import torchaudio
from datetime import datetime
from f5_tts.api import F5TTS

# GPU monitoring
# Best effort: when pynvml is missing or NVML cannot be initialised the
# benchmark still runs and GPU statistics are reported as zero.
# `except Exception` (instead of a bare except) lets KeyboardInterrupt and
# SystemExit propagate.
try:
    import pynvml
    pynvml.nvmlInit()
    GPU_HANDLE = pynvml.nvmlDeviceGetHandleByIndex(0)
    GPU_AVAILABLE = True
    GPU_NAME = pynvml.nvmlDeviceGetName(GPU_HANDLE)
    if isinstance(GPU_NAME, bytes):
        GPU_NAME = GPU_NAME.decode('utf-8')
except Exception as exc:
    GPU_HANDLE = None
    GPU_AVAILABLE = False
    GPU_NAME = "N/A"
    print(f"GPU monitoring disabled: {exc}")

# Config
ENGINE_NAME = "f5tts"
MODEL_NAME = "F5-TTS"
BASE_DIR = "/content/drive/MyDrive/tts-ms/output/f5tts"
AUDIO_DIR = f"{BASE_DIR}/audio"

# The output directory is normally created by Cell 3; create it here as
# well so this script also works when that cell was skipped.
os.makedirs(AUDIO_DIR, exist_ok=True)

# Reference audio (generated by edge-tts above)
REF_AUDIO = f"{MODEL_DIR}/ref/turkish_ref.wav"
REF_TEXT = "Merhaba, ben bir Türk ses asistanıyım. Bugün sizinle görüşmekten mutluluk duyuyorum."

# Verify
# Loading the clip fails early when the reference file is missing or
# unreadable; its duration and sample rate are printed for inspection.
waveform, sr = torchaudio.load(REF_AUDIO)
print(f"Reference: {waveform.shape[1]/sr:.1f}s, {sr}Hz")

# Shared state for the measurements taken below.
process = psutil.Process()
CPU_COUNT = psutil.cpu_count()
resource_logs = []
results = []

def get_gpu_stats():
    """Return the current GPU utilisation and memory figures from NVML.

    Returns:
        A dict with the keys ``gpu_util`` (the ``gpu`` field of NVML's
        utilisation rates) and ``gpu_mem_used`` / ``gpu_mem_total`` (NVML's
        memory values divided by 1024 * 1024). All three are 0 when GPU
        monitoring is unavailable or the NVML query fails.
    """
    no_gpu = {"gpu_util": 0, "gpu_mem_used": 0, "gpu_mem_total": 0}
    if not GPU_AVAILABLE:
        return no_gpu
    try:
        util = pynvml.nvmlDeviceGetUtilizationRates(GPU_HANDLE)
        mem = pynvml.nvmlDeviceGetMemoryInfo(GPU_HANDLE)
    except Exception:
        # Monitoring is best effort: a failed query must not abort the
        # benchmark. Unlike a bare `except`, this does not swallow
        # KeyboardInterrupt or SystemExit.
        return no_gpu
    return {
        "gpu_util": util.gpu,
        "gpu_mem_used": mem.used/1024/1024,
        "gpu_mem_total": mem.total/1024/1024,
    }

def get_resources():
    """Take one snapshot of the benchmark process and the GPU.

    Returns:
        A dict with ``cpu`` (psutil's CPU percentage for this process),
        ``ram_mb`` (resident set size divided by 1024 * 1024) and the three
        GPU values reported by ``get_gpu_stats()``.
    """
    gpu_stats = get_gpu_stats()
    snapshot = {
        "cpu": process.cpu_percent(),
        "ram_mb": process.memory_info().rss/1024/1024,
    }
    # Copy the GPU figures under the same key names.
    for key in ("gpu_util", "gpu_mem_used", "gpu_mem_total"):
        snapshot[key] = gpu_stats[key]
    return snapshot

# Interview prompts used by the benchmark. Each entry has a two-digit id,
# the question text and a model answer; both texts are synthesized.
QUESTIONS = [
    {
        "id": "01",
        "question": "Sizi neden işe almalıyız?",
        "answer": "Güçlü analitik düşünme becerilerim ve takım çalışmasına yatkınlığım sayesinde projelere değer katabilirim. Ayrıca sürekli öğrenmeye açık yapım ve problem çözme yeteneklerim, şirketinizin hedeflerine ulaşmasında önemli katkılar sağlayacaktır.",
    },
    {
        "id": "02",
        "question": "Siz bizi neden seçtiniz?",
        "answer": "Şirketinizin yenilikçi yaklaşımı ve sektördeki lider konumu beni çok etkiledi. Kariyer hedeflerimle örtüşen bu ortamda kendimi geliştirebileceğime ve anlamlı projeler üzerinde çalışabileceğime inanıyorum.",
    },
    {
        "id": "03",
        "question": "Kötü özellikleriniz nelerdir?",
        "answer": "Bazen aşırı detaycı olabiliyorum, bu da zaman yönetimimi olumsuz etkileyebiliyor. Ancak bu özelliğimin farkındayım ve önceliklendirme teknikleri kullanarak bu durumu yönetmeye çalışıyorum.",
    },
    {
        "id": "04",
        "question": "Beş yıl sonra kendinizi nerede görüyorsunuz?",
        "answer": "Beş yıl içinde teknik liderlik pozisyonunda olmayı hedefliyorum. Ekip yönetimi deneyimi kazanarak şirketin büyümesine stratejik katkılar sağlamak istiyorum.",
    },
    {
        "id": "05",
        "question": "Maaş beklentiniz nedir?",
        "answer": "Piyasa koşullarını ve pozisyonun gerekliliklerini değerlendirerek, deneyimime ve yeteneklerime uygun rekabetçi bir maaş beklentim var. Bu konuda esnek olmaya ve karşılıklı bir anlaşmaya varmaya açığım.",
    },
]

print("="*60)
print("INITIALIZING F5-TTS")
print("="*60)
print(f"GPU: {GPU_NAME}")

# Pick the device at run time instead of assuming CUDA, so the script still
# runs (slowly) on a runtime without a GPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
if DEVICE != "cuda":
    print("WARNING: CUDA is not available, running F5-TTS on CPU")

# Model construction is timed, and a resource snapshot is taken before and
# after it. hf_cache_dir points the model download at the Drive cache
# directory so the checkpoints can be reused by later sessions.
res_before = get_resources()
start = time.time()
tts = F5TTS(device=DEVICE, hf_cache_dir=MODEL_DIR)
init_time = time.time() - start
res_after = get_resources()
resource_logs.append({
    "stage": "init",
    "duration": init_time,
    "cpu": res_after["cpu"],
    "ram_delta": res_after["ram_mb"]-res_before["ram_mb"],
    "gpu_util": res_after["gpu_util"],
    "gpu_mem_delta": res_after["gpu_mem_used"]-res_before["gpu_mem_used"],
})
print(f"Initialized in {init_time:.2f}s | VRAM: {res_after['gpu_mem_used']:.0f}MB")

print("\nWarmup...")
# A short phrase is synthesized once and saved as warmup.wav; its duration
# and resource deltas are logged under the separate "warmup" stage.
warm_before = get_resources()
warm_t0 = time.time()
warm_audio, warm_sr, _ = tts.infer(
    ref_file=REF_AUDIO,
    ref_text=REF_TEXT,
    gen_text="Merhaba, nasılsınız?",
)
torchaudio.save(f"{AUDIO_DIR}/warmup.wav", torch.tensor(warm_audio).unsqueeze(0), warm_sr)
warmup_time = time.time() - warm_t0
warm_after = get_resources()
resource_logs.append({
    "stage": "warmup",
    "duration": warmup_time,
    "cpu": warm_after["cpu"],
    "ram_delta": warm_after["ram_mb"]-warm_before["ram_mb"],
    "gpu_util": warm_after["gpu_util"],
    "gpu_mem_delta": warm_after["gpu_mem_used"]-warm_before["gpu_mem_used"],
})
print(f"Warmup done in {warmup_time:.2f}s")

print("\n" + "="*60)
print(f"SYNTHESIZING {len(QUESTIONS)} QUESTIONS + ANSWERS")
print("="*60)

# Every entry yields two clips: the question ("soru") and the answer
# ("cevap"). A failure of one clip is recorded and the loop continues.
for item in QUESTIONS:
    qid = item["id"]
    print(f"\n[{qid}] {item['question']}")
    for kind in ("soru", "cevap"):
        phrase = item["question"] if kind == "soru" else item["answer"]
        print(f"  {kind.upper()}: ", end="", flush=True)
        wav_path = f"{AUDIO_DIR}/{qid}_{kind}.wav"
        before = get_resources()
        t0 = time.time()
        try:
            # The timed span covers both inference and writing the WAV file.
            samples, rate, _ = tts.infer(ref_file=REF_AUDIO, ref_text=REF_TEXT, gen_text=phrase)
            torchaudio.save(wav_path, torch.tensor(samples).unsqueeze(0), rate)
            elapsed = time.time() - t0
            after = get_resources()
            size = os.path.getsize(wav_path)
            cpu_norm = after["cpu"] / CPU_COUNT
            ram_delta = after["ram_mb"]-before["ram_mb"]

            resource_logs.append({
                "stage": f"{qid}_{kind}",
                "text": phrase,
                "duration": elapsed,
                "cpu": after["cpu"],
                "cpu_norm": cpu_norm,
                "ram_delta": ram_delta,
                "gpu_util": after["gpu_util"],
                "gpu_mem_used": after["gpu_mem_used"],
                "size_kb": size/1024,
            })
            results.append({
                "id": qid,
                "type": kind,
                "text": phrase,
                "time": elapsed,
                "size": size,
                "cpu": cpu_norm,
                "ram_delta": ram_delta,
                "gpu_util": after["gpu_util"],
                "status": "OK",
            })
            print(f"{elapsed:.2f}s | {size/1024:.1f}KB | GPU:{after['gpu_util']}% | OK")
        except Exception as e:
            # Failed clips keep the elapsed time and get zeroed metrics.
            results.append({
                "id": qid,
                "type": kind,
                "text": phrase,
                "time": time.time()-t0,
                "size": 0,
                "cpu": 0,
                "ram_delta": 0,
                "gpu_util": 0,
                "status": f"FAIL: {e}",
            })
            print(f"FAIL: {e}")

# Count the clips whose status is exactly "OK" and print the closing banner.
successful = list(filter(lambda entry: entry["status"] == "OK", results))
banner = "=" * 60
print(f"\n{banner}\nCOMPLETE: {len(successful)}/{len(results)} successful\n{banner}")

# Everything the report cell needs goes into one JSON document;
# ensure_ascii=False keeps the Turkish text readable in the file.
output_data = {
    "engine": ENGINE_NAME,
    "model": MODEL_NAME,
    "gpu_name": GPU_NAME,
    "init_time": init_time,
    "warmup_time": warmup_time,
    "results": results,
    "resource_logs": resource_logs,
    "timestamp": datetime.now().isoformat(),
}
results_path = f"{BASE_DIR}/results.json"
with open(results_path, "w", encoding="utf-8") as handle:
    json.dump(output_data, handle, indent=2, ensure_ascii=False)
print(f"\nResults saved to {results_path}")
ENDOFPYTHON

In [None]:
# Cell 5: Play Audio Files
from IPython.display import Audio, display
from pathlib import Path

ENGINE_NAME = "f5tts"
AUDIO_DIR = f"/content/drive/MyDrive/tts-ms/output/{ENGINE_NAME}/audio"

rule = "=" * 60
print(rule, "AUDIO PLAYBACK", rule, sep="\n")

# All generated clips in name order; the warmup clip is not part of the
# benchmark and is left out.
audio_files = sorted(
    clip for clip in Path(AUDIO_DIR).glob("*.wav") if clip.name != "warmup.wav"
)

if audio_files:
    # Show one inline player per clip, preceded by its file name.
    for wav in audio_files:
        print(f"\n{wav.name}:")
        display(Audio(str(wav)))
else:
    print("No audio files found!")

In [None]:
# Cell 6: Generate Reports
import json
import psutil
from datetime import datetime

ENGINE_NAME = "f5tts"
MODEL_NAME = "F5-TTS"
BASE_DIR = f"/content/drive/MyDrive/tts-ms/output/{ENGINE_NAME}"
CPU_COUNT = psutil.cpu_count()


def _mean(values):
    """Return the arithmetic mean of *values*, or 0 when there are none."""
    values = list(values)
    return sum(values) / len(values) if values else 0


def _peak(values):
    """Return the largest of *values*, or 0 when there are none."""
    return max(values, default=0)


# Load results
with open(f"{BASE_DIR}/results.json", "r", encoding="utf-8") as f:
    data = json.load(f)

results = data["results"]
resource_logs = data["resource_logs"]
init_time = data["init_time"]
warmup_time = data["warmup_time"]
gpu_name = data.get("gpu_name", "Unknown GPU")

successful = [r for r in results if r["status"] == "OK"]
soru_results = [r for r in successful if r["type"] == "soru"]
cevap_results = [r for r in successful if r["type"] == "cevap"]

# The number of questions is derived from the recorded results instead of
# being hard-coded, so the report stays correct when the question list of
# the benchmark cell changes.
total_questions = len({r["id"] for r in results})

total_size_kb = sum(r["size"] for r in successful) / 1024
avg_soru = _mean(r["time"] for r in soru_results)
avg_cevap = _mean(r["time"] for r in cevap_results)

# Resource statistics cover the synthesis stages only; the init and warmup
# entries are excluded.
synth_logs = [r for r in resource_logs if r["stage"] not in ["init", "warmup"]]
avg_cpu = _mean(r.get("cpu_norm", 0) for r in synth_logs)
max_cpu = _peak(r.get("cpu_norm", 0) for r in synth_logs)
total_ram = sum(r.get("ram_delta", 0) for r in synth_logs)
avg_gpu = _mean(r.get("gpu_util", 0) for r in synth_logs)
max_gpu = _peak(r.get("gpu_util", 0) for r in synth_logs)
max_vram = _peak(r.get("gpu_mem_used", 0) for r in synth_logs)

# summary.txt
summary = f"""============================================================
TTS BENCHMARK - F5-TTS
============================================================
Tarih: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
Platform: Google Colab (Miniconda Python 3.11)
GPU: {gpu_name}
Model: {MODEL_NAME}

PERFORMANS:
- Init: {init_time:.2f}s
- Warmup: {warmup_time:.2f}s

SONUCLAR:
- Toplam: {total_questions} soru
- Soru Audio: {len(soru_results)}/{total_questions} basarili (ort: {avg_soru:.2f}s)
- Cevap Audio: {len(cevap_results)}/{total_questions} basarili (ort: {avg_cevap:.2f}s)
- Toplam Audio: {total_size_kb:.1f} KB

KAYNAK KULLANIMI:
- CPU: Ort {avg_cpu:.0f}% / Max {max_cpu:.0f}%
- RAM Delta: {total_ram:+.1f} MB
- GPU: Ort {avg_gpu:.0f}% / Max {max_gpu:.0f}%
- VRAM Max: {max_vram:.0f} MB

DETAYLAR:
"""

# One metrics line plus one text-preview line per clip, failed clips
# included. The preview is cut to 50 characters.
detail_lines = []
for r in results:
    s = "OK" if r["status"] == "OK" else "FAIL"
    text_preview = r["text"][:50] + "..." if len(r["text"]) > 50 else r["text"]
    detail_lines.append(f"[{r['id']}_{r['type']}] {r['time']:.2f}s | {r['size']/1024:.1f}KB | CPU:{r['cpu']:.0f}% | GPU:{r.get('gpu_util',0)}% | {s}\n")
    detail_lines.append(f"    Text: {text_preview}\n")
summary += "".join(detail_lines)
summary += f"\nAudio: {BASE_DIR}/audio\n============================================================\n"

with open(f"{BASE_DIR}/summary.txt", "w", encoding="utf-8") as f:
    f.write(summary)

print(summary)

In [None]:
# Cell 7: Download ZIP
import zipfile
from pathlib import Path
from google.colab import files

ENGINE_NAME = "f5tts"
BASE_DIR = f"/content/drive/MyDrive/tts-ms/output/{ENGINE_NAME}"
AUDIO_DIR = f"{BASE_DIR}/audio"

zip_path = f"{BASE_DIR}/f5tts_benchmark.zip"

# The archive holds the two report files at its root and every clip except
# the warmup one under "audio/".
with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as archive:
    for report in ("summary.txt", "results.json"):
        archive.write(f"{BASE_DIR}/{report}", report)
    for clip in Path(AUDIO_DIR).glob("*.wav"):
        if clip.name == "warmup.wav":
            continue
        archive.write(clip, f"audio/{clip.name}")

zip_size_kb = Path(zip_path).stat().st_size/1024
print(f"ZIP: {zip_path}")
print(f"Size: {zip_size_kb:.1f} KB")
print("\nContents:")
# Re-open the finished archive and list what was stored in it.
with zipfile.ZipFile(zip_path, "r") as archive:
    for member in archive.namelist():
        print(f"  {member}")

# Hand the archive to the browser through the Colab download helper.
files.download(zip_path)