# Chapter 10 — Deployment & Optimization

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import time
import json
import os

## 1. Quantization: How Q4 Compresses the Model

Linear quantization maps float weights to integers:
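$$q = \text{round}\left(\frac{x - x_{\min}}{x_{\max} - x_{\min}} \cdot (2^b - 1)\right), \quad x_{\text{approx}} = q \cdot s + x_{\min}$$

where $b$ is the number of bits and $s = \frac{x_{\max} - x_{\min}}{2^b - 1}$ is the step size (scale) between adjacent quantization levels.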

In [None]:
def quantize(weights, bits):
    """Uniformly quantize ``weights`` to ``bits`` bits and dequantize them again.

    The value range ``[weights.min(), weights.max()]`` is divided into
    ``2**bits - 1`` equal steps; every weight is snapped to the nearest step.

    Args:
        weights: Array-like of numeric weights (any shape).
        bits: Bit width of the integer code; must satisfy ``1 <= bits <= 32``.

    Returns:
        A tuple ``(deq, err)`` where ``deq`` holds the dequantized values and
        ``err`` the element-wise absolute error ``|weights - deq|``. Both have
        the same shape as ``weights``.

    Raises:
        ValueError: If ``bits`` is outside ``1..32``.
    """
    if not 1 <= bits <= 32:
        # bits == 0 would give a single level and a division by zero below.
        raise ValueError(f"bits must be between 1 and 32, got {bits!r}")

    weights = np.asarray(weights)
    if weights.size == 0:
        # Nothing to quantize; min()/max() would raise on an empty array.
        return weights.astype(float), np.zeros(weights.shape, dtype=float)

    w_min, w_max = weights.min(), weights.max()
    n_levels = 2**bits
    scale = (w_max - w_min) / (n_levels - 1)

    if scale == 0:
        # All weights are identical: every value maps to level 0. Dividing by
        # the zero step would yield NaN and an undefined NaN -> int cast.
        q = np.zeros(weights.shape, dtype=np.int64)
    else:
        # int64 keeps the largest code (2**bits - 1) representable up to 32 bits.
        q = np.round((weights - w_min) / scale).astype(np.int64)

    deq = q * scale + w_min
    return deq, np.abs(weights - deq)

# Fixed seed so the synthetic weights (and therefore the figure) are reproducible.
np.random.seed(42)
weights = np.random.randn(1000) * 0.05

# One panel per bit width: original weight on x, dequantized weight on y.
bit_widths = [8, 4, 2]
identity_span = [-0.2, 0.2]
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for panel_idx, bits in enumerate(bit_widths):
    panel = axes[panel_idx]
    deq, err = quantize(weights, bits)
    panel.scatter(weights, deq, alpha=0.15, s=2)
    # Dashed diagonal marks where a lossless reconstruction would fall.
    panel.plot(identity_span, identity_span, 'r--', lw=1, label="Perfect")
    panel.set_title(f"Q{bits}: max error={err.max():.5f}")
    panel.set(xlabel="Original", ylabel="Dequantized")
    panel.legend(fontsize=8)

fig.suptitle("Weight Quantization — Q4 is nearly lossless for TTS")
fig.tight_layout()
plt.show()

## 2. Memory Footprint by Format

In [None]:
def _footprint_row(model_name, params_M, fmt, bits):
    """Build one table row: weight size for a model stored at ``bits`` bits per parameter."""
    backbone_gb = params_M * 1e6 * (bits/8) / 1e9
    # Add a flat 0.05 GB for the codec on top of the backbone.
    with_codec_gb = backbone_gb + 0.05
    return {
        "Model": model_name,
        "Format": fmt,
        "Backbone (GB)": f"{backbone_gb:.2f}",
        "Total w/ codec": f"{with_codec_gb:.2f}",
        "Fits 8GB RAM": "Yes" if with_codec_gb < 8 else "No",
        "Fits 16GB RAM": "Yes" if with_codec_gb < 16 else "No",
    }


# (model name, parameter count in millions) x (format label, bits per weight)
_models = [("VieNeu-TTS-0.3B", 300), ("VieNeu-TTS-0.5B", 500)]
_formats = [("fp32", 32), ("fp16", 16), ("Q8", 8), ("Q4", 4)]

rows = [
    _footprint_row(name, n_params, label, width)
    for name, n_params in _models
    for label, width in _formats
]

df = pd.DataFrame(rows)
print(df.to_string(index=False))

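## 3. RTF Benchmark

The real-time factor (RTF) is the inference time divided by the duration of the generated audio. An RTF below 1.0 means speech is synthesized faster than it takes to play back.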

In [None]:
import sys
sys.path.insert(0, "..")
from vieneu import Vieneu

# Divisor that converts a sample count into seconds.
# NOTE(review): assumes infer() returns a 1-D sample array at 24 kHz — confirm.
SAMPLE_RATE = 24000
# Number of timed repetitions per sentence.
N_RUNS = 2

test_texts = [
    "Xin ch\u00e0o.",
    "H\u00f4m nay th\u1eddi ti\u1ebft r\u1ea5t \u0111\u1eb9p, t\u00f4i mu\u1ed1n \u0111i d\u1ea1o.",
    "Tr\u00ed tu\u1ec7 nh\u00e2n t\u1ea1o \u0111ang ph\u00e1t tri\u1ec3n nhanh ch\u00f3ng v\u00e0 thay \u0111\u1ed5i nhi\u1ec1u ng\u00e0nh c\u00f4ng nghi\u1ec7p.",
]

tts = Vieneu()
results = []
try:
    for text in test_texts:
        times = []
        durations = []
        for _ in range(N_RUNS):
            # perf_counter is monotonic and high-resolution, unlike time.time(),
            # which can jump when the system clock is adjusted.
            t0 = time.perf_counter()
            audio = tts.infer(text)
            times.append(time.perf_counter() - t0)
            # Pair every timing with the audio it produced so the ratio stays
            # consistent even if the output length differs between runs.
            durations.append(len(audio) / SAMPLE_RATE)
        avg = np.mean(times)
        dur = np.mean(durations)
        # Empty audio would otherwise divide by zero; report it as "not real-time".
        rtf = avg / dur if dur > 0 else float("inf")
        results.append({
            "Text": text[:45] + ("..." if len(text)>45 else ""),
            "Audio (s)": f"{dur:.2f}",
            "Infer (s)": f"{avg:.2f}",
            "RTF": f"{rtf:.2f}x",
            "Real-time": "Yes" if rtf < 1.0 else "No"
        })
finally:
    # Always release the engine, even when inference raises part-way through.
    tts.close()

df = pd.DataFrame(results)
print(df.to_string(index=False))
print("\nRTF < 1.0 = faster than real-time")

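## 4. Streaming Inference — First-Chunk Latency

In streaming mode the audio is produced chunk by chunk. First-chunk latency is the time from starting the request until the first chunk arrives, which is the delay a listener experiences before playback can begin.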

In [None]:
import time

from vieneu import Vieneu
import numpy as np
from IPython.display import Audio, display

tts = Vieneu()
try:
    text = "Tr\u00ed tu\u1ec7 nh\u00e2n t\u1ea1o \u0111ang thay \u0111\u1ed5i c\u00e1ch ch\u00fang ta giao ti\u1ebfp h\u00e0ng ng\u00e0y."
    voice = tts.get_preset_voice("Binh")

    chunks = []
    chunk_times = []
    # Monotonic clock: elapsed times cannot go backwards if the wall clock changes.
    t_start = time.perf_counter()

    for chunk in tts.infer_stream(text, voice=voice):
        # Arrival time of this chunk relative to the start of the request.
        t_chunk = time.perf_counter() - t_start
        chunk_times.append(t_chunk)
        chunks.append(chunk)
        print(f"Chunk {len(chunks):2d}: {len(chunk)/24000:.2f}s audio at t={t_chunk:.2f}s")

    if not chunks:
        # np.concatenate([]) and chunk_times[0] would fail with unrelated-looking
        # errors; name the real problem instead.
        raise RuntimeError("infer_stream() yielded no audio chunks")

    full_audio = np.concatenate(chunks)
    print(f"\nFirst-chunk latency: {chunk_times[0]:.2f}s")
    print(f"Total audio: {len(full_audio)/24000:.2f}s")
    display(Audio(full_audio, rate=24000))
finally:
    # Release the engine even if streaming fails mid-way.
    tts.close()

## 5. Packaging voices.json

In [None]:
import torch, librosa
from neucodec import DistillNeuCodec

device = "cuda" if torch.cuda.is_available() else "cpu"
codec = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec").to(device).eval()

ref_path = "../examples/audio_ref/example.wav"
ref_text_path = "../examples/audio_ref/example.txt"
# Read the transcript as UTF-8 explicitly: the platform default encoding may
# differ and would garble Vietnamese diacritics. `with` also closes the file.
with open(ref_text_path, encoding="utf-8") as f:
    ref_text = f.read().strip()

# Load the reference clip as mono, resampled to 16 kHz.
# NOTE(review): presumably the codec's expected input rate — confirm.
wav, sr = librosa.load(ref_path, sr=16000, mono=True)
# Add two leading singleton dimensions before handing the waveform to the codec.
wav_t = torch.from_numpy(wav).float().unsqueeze(0).unsqueeze(0).to(device)
with torch.no_grad():
    # Flatten the encoder output into a plain Python list of code values.
    codes = codec.encode_code(wav_t).squeeze().cpu().numpy().flatten().tolist()

# NOTE: this cell only builds the structure in memory; it does not write a
# voices.json file. Use json.dump(voices_json, fh, ensure_ascii=False) to persist it.
voices_json = {
    "default_voice": "example_voice",
    "presets": {
        "example_voice": {
            # Cast to built-in int so every entry is JSON-serializable.
            "codes": [int(c) for c in codes],
            "text": ref_text,
            "description": "Example Vietnamese voice"
        }
    }
}

print("voices.json structure:")
print(f"  default_voice: {voices_json['default_voice']}")
print(f"  text: {ref_text[:60]}...")
print(f"  codes[:10]: {voices_json['presets']['example_voice']['codes'][:10]}")
print(f"  total tokens: {len(codes)}")
print("\nWith this file, users can run:")
print("  tts = Vieneu(backbone_repo='your-username/your-model')")
print("  audio = tts.infer('Xin chao!')  # no ref_audio needed")

## 6. Production Deployment Checklist

| Category | Check |
|----------|-------|
| **Model** | Use GGUF Q4 for CPU deployment |
| **Model** | Merge LoRA before distributing |
| **Model** | Bundle voices.json on HuggingFace |
| **Quality** | Test all 6 Vietnamese tones |
| **Quality** | Test code-switching (Vietnamese + English) |
| **Quality** | UTMOS > 3.8 on test sentences |
| **Quality** | Human MOS on 30+ sentences |
| **Performance** | RTF < 1.0 on target hardware |
| **Serving** | Use streaming for conversational apps |
| **Serving** | Set max_chars=256 for natural chunking |

## Course Complete!

You have now covered the full end-to-end VieNeu-TTS pipeline:

```
Audio Fundamentals → Text Processing → TTS Architectures → Neural Codecs
                                                                    ↓
                                                           LLM-Based TTS
                                                           ↙           ↘
                                                  Voice Cloning    LoRA Theory
                                                                        ↓
                                                               Data Preparation
                                                                        ↓
                                                          Training & Evaluation
                                                                        ↓
                                                                  Deployment
```

**Next steps:**
1. Collect your Vietnamese audio dataset (Chapter 8)
2. Run the fine-tuning pipeline (Chapter 9)
3. Deploy your custom model (Chapter 10)
4. Contribute to the VieNeu-TTS community!