# 🎙️ Chatterbox TTS - Gradio UI in Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/viveksurmay/chatterbox-colab/blob/master/chatterbox_gradio_colab.ipynb)

**Simple Gradio interface for Chatterbox TTS without watermarking**

## Features:
- 🎤 **Text-to-Speech** with voice cloning
- 🔄 **Voice Conversion** 
- 🚫 **No Watermarks** - Clean audio output
- 🎛️ **Simple Controls** - Easy to use interface

---

## 📦 Setup & Installation

In [None]:
# Install dependencies
!pip install gradio torch torchaudio librosa transformers diffusers safetensors numpy s3tokenizer conformer huggingface_hub

print("✅ Installation complete!")

## 🤖 Create No-Watermark Models

In [None]:
# Import dependencies and the original Chatterbox classes that the no-watermark versions build on
from dataclasses import dataclass
from pathlib import Path
import librosa
import torch
import torch.nn.functional as F
from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

# We'll create simplified versions inline for Colab
print("📦 Setting up no-watermark models...")

# Import the original classes to modify them
try:
    from chatterbox.tts import ChatterboxTTS, punc_norm, Conditionals
    from chatterbox.vc import ChatterboxVC
    print("✅ Chatterbox models imported successfully")
except ImportError:
    print("❌ Chatterbox not found, installing...")
    !pip install chatterbox-tts
    from chatterbox.tts import ChatterboxTTS, punc_norm, Conditionals
    from chatterbox.vc import ChatterboxVC
    print("✅ Chatterbox models imported successfully")

In [None]:
# Create no-watermark versions by subclassing the original models
class ChatterboxTTSNoWatermark(ChatterboxTTS):
    """ChatterboxTTS subclass whose ``generate`` returns audio with no watermark applied.

    The parent constructor runs unchanged; afterwards the ``watermarker``
    attribute is cleared and ``generate`` is routed to ``_generate_clean``,
    which performs the text -> speech tokens -> waveform steps itself and never
    invokes a watermarker.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Remove watermarker: nothing in this subclass calls it.
        self.watermarker = None

    def generate(self, *args, **kwargs):
        """Synthesize speech without watermarking.

        Accepts exactly the arguments of ``_generate_clean`` (positionally or
        by keyword) and returns its result.
        """
        # Deliberately NOT calling super().generate() first: the parent method
        # runs the whole synthesis itself and then applies the watermarker,
        # which this class has set to None. Its result was never used, so the
        # call at best doubled the inference cost and is expected to fail on
        # the None watermarker before the clean path is ever reached.
        return self._generate_clean(*args, **kwargs)

    def _generate_clean(
        self,
        text,
        repetition_penalty=1.2,
        min_p=0.05,
        top_p=1.0,
        audio_prompt_path=None,
        exaggeration=0.5,
        cfg_weight=0.5,
        temperature=0.8,
    ):
        """Run the full TTS pipeline and return the waveform without a watermark step.

        Args:
            text: Text to synthesize; normalized with ``punc_norm`` before
                tokenization.
            repetition_penalty: Forwarded unchanged to ``self.t3.inference``.
            min_p: Forwarded unchanged to ``self.t3.inference``.
            top_p: Forwarded unchanged to ``self.t3.inference``.
            audio_prompt_path: Optional reference audio. When truthy, the
                conditionals are rebuilt from it via ``prepare_conditionals``;
                otherwise the conditionals already stored in ``self.conds``
                are used.
            exaggeration: Value written into the T3 conditioning's
                ``emotion_adv`` entry when it differs from the stored one.
            cfg_weight: Forwarded to ``self.t3.inference``; when greater than
                zero the text tokens are duplicated along dim 0 first.
            temperature: Forwarded unchanged to ``self.t3.inference``.

        Returns:
            A tensor built with ``torch.from_numpy`` from the generated
            waveform, with a leading axis added by ``unsqueeze(0)``.

        Raises:
            AssertionError: If no ``audio_prompt_path`` is given and
                ``self.conds`` is ``None``.
        """
        if audio_prompt_path:
            self.prepare_conditionals(audio_prompt_path, exaggeration=exaggeration)
        else:
            assert self.conds is not None, "Please prepare_conditionals first or specify audio_prompt_path"

        # Update exaggeration if needed: rebuild the T3 conditioning with the
        # requested value while keeping the speaker embedding and prompt tokens.
        if exaggeration != self.conds.t3.emotion_adv[0, 0, 0]:
            from chatterbox.models.t3.modules.cond_enc import T3Cond
            _cond = self.conds.t3
            self.conds.t3 = T3Cond(
                speaker_emb=_cond.speaker_emb,
                cond_prompt_speech_tokens=_cond.cond_prompt_speech_tokens,
                emotion_adv=exaggeration * torch.ones(1, 1, 1),
            ).to(device=self.device)

        # Norm and tokenize text
        text = punc_norm(text)
        text_tokens = self.tokenizer.text_to_tokens(text).to(self.device)

        if cfg_weight > 0.0:
            # Two copies of the text along dim 0 when CFG is active.
            # NOTE(review): presumably one conditional and one unconditional
            # pass inside t3.inference; confirm against the T3 implementation.
            text_tokens = torch.cat([text_tokens, text_tokens], dim=0)

        # Wrap the token sequence in start-of-text / end-of-text markers.
        sot = self.t3.hp.start_text_token
        eot = self.t3.hp.stop_text_token
        text_tokens = F.pad(text_tokens, (1, 0), value=sot)
        text_tokens = F.pad(text_tokens, (0, 1), value=eot)

        with torch.inference_mode():
            speech_tokens = self.t3.inference(
                t3_cond=self.conds.t3,
                text_tokens=text_tokens,
                max_new_tokens=1000,
                temperature=temperature,
                cfg_weight=cfg_weight,
                repetition_penalty=repetition_penalty,
                min_p=min_p,
                top_p=top_p,
            )
            # Keep only the first returned sequence.
            speech_tokens = speech_tokens[0]

            from chatterbox.models.s3tokenizer import drop_invalid_tokens
            speech_tokens = drop_invalid_tokens(speech_tokens)
            # NOTE(review): 6561 is presumably the speech-token vocabulary
            # size; ids at or above it are discarded. Confirm upstream.
            speech_tokens = speech_tokens[speech_tokens < 6561]
            speech_tokens = speech_tokens.to(self.device)

            wav, _ = self.s3gen.inference(
                speech_tokens=speech_tokens,
                ref_dict=self.conds.gen,
            )
            wav = wav.squeeze(0).detach().cpu().numpy()
            # NO WATERMARKING - return clean audio
        return torch.from_numpy(wav).unsqueeze(0)

class ChatterboxVCNoWatermark(ChatterboxVC):
    """ChatterboxVC subclass whose ``generate`` returns converted audio without a watermark step."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Clear the watermarker; the overridden generate() below never uses it.
        self.watermarker = None

    def generate(self, audio, target_voice_path=None):
        """Convert ``audio`` to the target voice and return the raw waveform.

        Args:
            audio: Source audio, loaded with ``librosa.load`` at ``S3_SR``.
            target_voice_path: Optional reference for the target voice. When
                truthy it is applied via ``set_target_voice``; otherwise the
                previously stored ``self.ref_dict`` is used.

        Returns:
            A tensor built with ``torch.from_numpy`` from the generated
            waveform, with a leading axis added by ``unsqueeze(0)``.

        Raises:
            AssertionError: If no ``target_voice_path`` is given and
                ``self.ref_dict`` is ``None``.
        """
        if not target_voice_path:
            assert self.ref_dict is not None, "Please prepare_conditionals first or specify target_voice_path"
        else:
            self.set_target_voice(target_voice_path)

        from chatterbox.models.s3tokenizer import S3_SR

        with torch.inference_mode():
            # Load the source at the tokenizer's sample rate and add a leading axis.
            samples, _ = librosa.load(audio, sr=S3_SR)
            source_batch = torch.from_numpy(samples).float().to(self.device).unsqueeze(0)

            tokens, _ = self.s3gen.tokenizer(source_batch)
            generated, _ = self.s3gen.inference(
                speech_tokens=tokens,
                ref_dict=self.ref_dict,
            )
            clean = generated.squeeze(0).detach().cpu().numpy()
        # No watermarking step: hand back the waveform as generated.
        return torch.from_numpy(clean).unsqueeze(0)

print("✅ No-watermark classes created!")

## 🎛️ Launch Gradio Interface

In [None]:
import gradio as gr

# Device detection: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"🚀 Using device: {DEVICE}")

# Global models: both start unset and are filled in lazily by the loader
# functions on first use.
tts_model = vc_model = None

def load_tts_model():
    """Return the shared TTS model, creating it via ``from_pretrained(DEVICE)`` on first call."""
    global tts_model
    if tts_model is not None:
        # Already loaded by an earlier call; reuse it.
        return tts_model

    print("🔄 Loading TTS model...")
    tts_model = ChatterboxTTSNoWatermark.from_pretrained(DEVICE)
    print("✅ TTS model loaded!")
    return tts_model

def load_vc_model():
    """Return the shared voice-conversion model, creating it via ``from_pretrained(DEVICE)`` on first call."""
    global vc_model
    if vc_model is not None:
        # Already loaded by an earlier call; reuse it.
        return vc_model

    print("🔄 Loading VC model...")
    vc_model = ChatterboxVCNoWatermark.from_pretrained(DEVICE)
    print("✅ VC model loaded!")
    return vc_model

def generate_speech(text, audio_file, exaggeration, cfg_weight, temperature):
    """Gradio handler: synthesize ``text`` and return ``(audio, status)``.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only text is
            rejected with a status message and no model work is done.
        audio_file: Optional reference audio passed to the model as
            ``audio_prompt_path``.
        exaggeration: Forwarded to ``model.generate``.
        cfg_weight: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.

    Returns:
        A 2-tuple. On success the first item is ``(model.sr, samples)`` where
        ``samples`` is the generated waveform as a NumPy array, and the second
        is a success message. On invalid input or any failure the first item
        is ``None`` and the second is an error message.
    """
    # Treat a missing value like empty text: calling .strip() on None would
    # raise outside the try block and crash the handler instead of showing
    # the status message.
    if text is None or not text.strip():
        return None, "❌ Please enter some text!"

    try:
        model = load_tts_model()
        wav = model.generate(
            text,
            audio_prompt_path=audio_file,
            exaggeration=exaggeration,
            cfg_weight=cfg_weight,
            temperature=temperature,
        )
        # Drop the leading axis so the audio output receives a plain array.
        return (model.sr, wav.squeeze(0).numpy()), "✅ Speech generated successfully! (No watermark)"
    except Exception as e:
        # UI boundary: report the failure in the status box rather than raising.
        return None, f"❌ Error: {e}"

def convert_voice(source_audio, target_audio):
    """Gradio handler: convert ``source_audio`` to the voice in ``target_audio``.

    Returns a 2-tuple ``(audio, status)``. ``audio`` is ``(model.sr, samples)``
    on success and ``None`` when an input is missing or the conversion fails;
    ``status`` is the message shown to the user.
    """
    both_provided = source_audio is not None and target_audio is not None
    if not both_provided:
        return None, "❌ Please upload both source and target audio!"

    try:
        vc = load_vc_model()
        converted = vc.generate(source_audio, target_voice_path=target_audio)
        # Drop the leading axis so the audio output receives a plain array.
        audio_payload = (vc.sr, converted.squeeze(0).numpy())
    except Exception as err:
        # UI boundary: surface the failure as a status message.
        return None, f"❌ Error: {str(err)}"
    return audio_payload, "✅ Voice converted successfully! (No watermark)"

# Create Gradio interface
# Build the UI: a header banner, a Text-to-Speech tab, a Voice Conversion tab
# and a static Tips tab, then wire the two buttons to their handler functions.
with gr.Blocks(title="🎙️ Chatterbox TTS - No Watermark") as app:
    # Header banner; DEVICE is spliced into the HTML so the page shows which
    # device was selected.
    gr.HTML("""
    <div style="text-align: center; margin-bottom: 20px;">
        <h1>🎙️ Chatterbox TTS</h1>
        <p><strong>Simple Gradio Interface - No Watermarks Applied</strong></p>
        <p><em>Running on: """ + DEVICE + """</em></p>
    </div>
    """)
    
    with gr.Tabs():
        with gr.Tab("🎤 Text-to-Speech"):
            with gr.Row():
                # Left column: inputs and controls.
                with gr.Column():
                    text_input = gr.Textbox(
                        label="📝 Text to Synthesize",
                        placeholder="Enter your text here...",
                        lines=3,
                        value="Hello! This is Chatterbox TTS generating speech without any watermarks."
                    )
                    # type="filepath": the handler receives the upload as a
                    # file path (see the Gradio Audio docs).
                    audio_input = gr.Audio(
                        label="🎭 Reference Audio (Optional - for voice cloning)",
                        type="filepath"
                    )
                    
                    # Slider positional arguments are (minimum, maximum); the
                    # `value` defaults match the defaults of _generate_clean.
                    with gr.Row():
                        exaggeration = gr.Slider(
                            0.25, 2.0, value=0.5, step=0.05,
                            label="🎭 Exaggeration",
                            info="Emotion intensity (0.5=neutral)"
                        )
                        cfg_weight = gr.Slider(
                            0.0, 1.0, value=0.5, step=0.05,
                            label="⚡ CFG Weight",
                            info="Pacing control"
                        )
                        temperature = gr.Slider(
                            0.1, 2.0, value=0.8, step=0.1,
                            label="🌡️ Temperature",
                            info="Randomness control"
                        )
                    
                    generate_btn = gr.Button("🎤 Generate Speech", variant="primary", size="lg")
                
                # Right column: generated audio and a read-only status line.
                with gr.Column():
                    audio_output = gr.Audio(label="🔊 Generated Audio")
                    status_output = gr.Textbox(label="Status", interactive=False)
        
        with gr.Tab("🔄 Voice Conversion"):
            with gr.Row():
                # Left column: the speech to convert and the voice to convert to.
                with gr.Column():
                    source_audio = gr.Audio(
                        label="📁 Source Audio (speech to convert)",
                        type="filepath"
                    )
                    target_audio = gr.Audio(
                        label="🎯 Target Voice (voice to convert to)",
                        type="filepath"
                    )
                    convert_btn = gr.Button("🔄 Convert Voice", variant="primary", size="lg")
                
                # Right column: converted audio and a read-only status line.
                with gr.Column():
                    converted_audio = gr.Audio(label="🔊 Converted Audio")
                    vc_status = gr.Textbox(label="Status", interactive=False)
        
        # Static help text only; no inputs or handlers on this tab.
        with gr.Tab("💡 Tips"):
            gr.Markdown("""
            ## 💡 Usage Tips
            
            ### 🎤 Text-to-Speech:
            - **Text**: Enter any text you want to synthesize
            - **Reference Audio**: Upload 3-10 seconds of clear speech for voice cloning
            - **Exaggeration**: 0.5=neutral, 0.7-1.0=expressive, 1.5+=dramatic
            - **CFG Weight**: 0.3=slower, 0.5=balanced, 0.8+=faster
            - **Temperature**: 0.5=consistent, 0.8=balanced, 1.5+=creative
            
            ### 🔄 Voice Conversion:
            - Upload clear audio files for both source and target
            - Keep audio samples between 3-15 seconds
            - Similar speaking styles convert better
            
            ### ✨ Features:
            - **No Watermarks**: All generated audio is clean
            - **GPU Accelerated**: Faster generation when available
            - **High Quality**: 24kHz sample rate output
            
            ---
            **🎉 Enjoy creating amazing voices!**
            """)
    
    # Event handlers
    # `inputs` are handed to the handler positionally in the listed order, and
    # the handler's (audio, status) return pair fills `outputs` in order.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, audio_input, exaggeration, cfg_weight, temperature],
        outputs=[audio_output, status_output]
    )
    
    convert_btn.click(
        fn=convert_voice,
        inputs=[source_audio, target_audio],
        outputs=[converted_audio, vc_status]
    )

# Launch the app
# share=True requests a public share link and debug=True keeps the cell
# running so errors show up in the cell output (per the Gradio launch() docs).
print("🚀 Launching Gradio interface...")
app.launch(share=True, debug=True)