In [1]:
# Install system dependencies
# espeak-ng is presumably the backend for the `phonemizer` package installed at the end of this cell — TODO confirm.
!apt update && apt install -y espeak-ng

# Install UV package manager
!pip install -U uv

# Clone Zonos repository
# After the %cd, every relative path used by later cells (assets/, output.wav) resolves inside the clone.
!git clone https://github.com/Zyphra/Zonos.git
%cd Zonos

# Install dependencies using UV
# NOTE(review): `uv sync` normally installs into a project-local virtual environment rather than the
# running kernel's environment — confirm the later `from zonos...` imports resolve the way you expect.
!uv sync

# Extra text-processing packages installed with pip directly into the kernel environment.
!pip install kanjize phonemizer sudachipy sudachidict_full

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[37m⠙[0m [2mPreparing packages...[0m (100/108)
[2mnvidia-nccl-cu12         [0m [32m----------------------------[2m--[0m[0m 165.48 MiB/179.91 MiB
[2mnvidia-cusparse-cu12     [0m [32m-------------------------[2m-----[0m[0m 161.28 MiB/197.84 MiB
[2mnvidia-cufft-cu12        [0m [32m------------------------[2m------[0m[0m 160.81 MiB/201.66 MiB
[2mtriton                   [0m [32m---------------------[2m---------[0m[0m 164.68 MiB/241.43 MiB
[2mnvidia-cublas-cu12       [0m [32m---------------[2m---------------[0m[0m 163.06 MiB/346.60 MiB
[2mnvidia-cudnn-cu12        [0m [32m--------[2m----------------------[0m[0m 162.05 MiB/633.96 MiB
[2K[9A   [36m[1mBuilding[0m[39m sudachidict-full[2m==20250129[0m
      [32m[1mBuilt[0m[39m pylatexenc[2m==2.10[0m
[37m⠙[0m [2mPreparing packages...[0m (100/108)
[2mnvidia-nccl-cu12         [0m [32m----------------------------[2m--[0m[0m

# Voice Cloning

In [2]:
# Create assets directory (no error if it already exists)
import shutil
from pathlib import Path

REFERENCE_PATH = Path("assets/reference.mp3")
REFERENCE_PATH.parent.mkdir(parents=True, exist_ok=True)

# Option 1: Upload from your computer
from google.colab import files
uploaded = files.upload()
if not uploaded:
    # The upload dialog was cancelled or returned nothing; say so instead of failing later
    # with a confusing "file not found" when the reference audio is loaded.
    print("No file was uploaded; assets/reference.mp3 was not updated.")
for filename in uploaded.keys():
    # Move with shutil instead of an interpolated `!mv "{filename}"` shell line, so that
    # file names containing quotes, `$` or other shell metacharacters are handled safely.
    # Every upload goes to the same destination: if several files are selected, the last one wins.
    # The destination keeps the ".mp3" name regardless of the uploaded file's real format,
    # because the loading cell reads exactly this path.
    shutil.move(filename, str(REFERENCE_PATH))

# Option 2: Download from URL
# !wget -O assets/reference.mp3 YOUR_AUDIO_URL_HERE

Saving audio.wav to audio.wav


In [3]:
#Initialise Zonos
import torch
import torchaudio
from zonos.model import Zonos
from zonos.conditioning import make_cond_dict

# Pick the GPU when torch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Fetch the pretrained transformer checkpoint and place it on the chosen device
print("Loading model...")
model = Zonos.from_pretrained("Zyphra/Zonos-v0.1-transformer", device=device)

Using device: cuda
Loading model...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/541 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/307M [00:00<?, ?B/s]

In [None]:
# Load reference audio
# Reads the file stored by the upload cell; `wav` and `sampling_rate` are only used to build the embedding below.
print("Loading reference audio...")
wav, sampling_rate = torchaudio.load("assets/reference.mp3")
# `speaker` is a notebook-level global: generate_speech() reads it on every call.
speaker = model.make_speaker_embedding(wav, sampling_rate)

# Set seed for reproducibility
# NOTE(review): the seed is set once, when this cell runs. Later generations presumably continue from
# the advanced RNG state, so only the first generation after running this cell is repeatable — confirm.
torch.manual_seed(421)

def generate_speech(text, language="en-us", speed=None, pitch=None, emotion=None):
    """Generate speech for `text` in the cloned voice and write it to ``output.wav``.

    The server endpoints in this notebook call this function with five positional
    arguments (text, language, speed, pitch, emotion), so all five are accepted here.
    Each optional control is only forwarded to ``make_cond_dict`` when a value is
    supplied; otherwise the library default for that control is used, which keeps
    ``generate_speech(text, language=...)`` behaving exactly as before.

    Args:
        text (str): Text to synthesize.
        language (str): Language code handed to the conditioning (e.g. "en-us").
        speed (float | None): Speaking rate, forwarded as ``speaking_rate``.
            Previously this argument was accepted but ignored; it is now honoured.
        pitch (float | None): Pitch variation, forwarded as ``pitch_std``.
        emotion (list[float] | None): Emotion weight vector, forwarded as ``emotion``.

    Returns:
        str: Name of the written audio file (always "output.wav", overwritten on each call).
    """
    print(f"Generating: {text}")

    # Create conditioning. Uses the notebook-level `speaker` embedding and `model`.
    cond_kwargs = {"text": text, "speaker": speaker, "language": language}
    if speed is not None:
        cond_kwargs["speaking_rate"] = speed
    if pitch is not None:
        cond_kwargs["pitch_std"] = pitch
    if emotion is not None:
        cond_kwargs["emotion"] = emotion
    cond_dict = make_cond_dict(**cond_kwargs)
    conditioning = model.prepare_conditioning(cond_dict)

    # Generate audio codes, then decode them to waveforms on the CPU
    codes = model.generate(conditioning)
    wavs = model.autoencoder.decode(codes).cpu()

    # Save the first (only) item of the batch
    filename = "output.wav"
    torchaudio.save(filename, wavs[0], model.autoencoder.sampling_rate)
    return filename

# Smoke test: synthesize a short phrase with the cloned voice
output_file = generate_speech("Hello world", language="en-us")

# Show an inline player for the file that was just written
from IPython.display import Audio
Audio(output_file)

Loading reference audio...


ResNet293_SimAM_ASP_base.pt:   0%|          | 0.00/397M [00:00<?, ?B/s]

ResNet293_SimAM_ASP_base_LDA-128.pt:   0%|          | 0.00/265k [00:00<?, ?B/s]

Generating: Hello world


W0802 17:03:59.105000 482 torch/_inductor/utils.py:1137] [3/0] Not enough SMs to use max_autotune_gemm mode
Generating:   3%|▎         | 69/2588 [02:07<03:46, 11.13it/s]

In [None]:
# Install the serving stack used by the cells below: FastAPI + uvicorn (HTTP/WebSocket server),
# nest-asyncio and pyngrok (run and expose the server from the notebook), cloudinary (audio hosting).
# `websockets` is not imported directly here; presumably uvicorn needs it for WebSocket support — TODO confirm.
!pip install fastapi nest-asyncio pyngrok uvicorn cloudinary websockets

In [None]:
import cloudinary
import cloudinary.uploader

# Configure Cloudinary
# Credentials must not be stored in the notebook: they are read from environment variables,
# with an interactive (non-echoing) prompt as a fallback when a variable is unset.
# Any key or secret that was previously committed in this cell should be treated as leaked and rotated.
import os
from getpass import getpass


def _read_secret(name):
    """Return the secret `name` from the environment, prompting for it when it is unset or empty."""
    return os.environ.get(name) or getpass(f"{name}: ")


cloudinary.config(
    cloud_name=_read_secret("CLOUDINARY_CLOUD_NAME"),
    api_key=_read_secret("CLOUDINARY_API_KEY"),
    api_secret=_read_secret("CLOUDINARY_API_SECRET"),
)

def upload_audio(file_path):
    """
    Push an audio file to Cloudinary and hand back its HTTPS URL.

    Any failure (the upload call raising, or the response lacking a
    'secure_url' entry) is printed and reported to the caller as None
    rather than raised.

    Args:
        file_path (str): Location of the audio file to upload

    Returns:
        str | None: The uploaded file's secure URL, or None on failure
    """
    try:
        result = cloudinary.uploader.upload(file_path, resource_type="auto")
        secure_url = result['secure_url']
    except Exception as e:
        print(f"Upload failed: {e}")
        return None
    return secure_url

In [None]:
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import json
import time

# Application object that the route decorators below attach to
app = FastAPI()

# CORS policy: any origin, method and header is accepted, with credentials allowed.
# NOTE(review): this is very permissive; narrow allow_origins before exposing this beyond a demo.
cors_settings = {
    "allow_origins": ['*'],
    "allow_credentials": True,
    "allow_methods": ['*'],
    "allow_headers": ['*'],
}
app.add_middleware(CORSMiddleware, **cors_settings)

class TTSRequest(BaseModel):
    """JSON body accepted by the POST /tts endpoint.

    No field declares a default, so pydantic treats every field as required.
    The fields are forwarded, in the order text, language, speed, pitch, emotion,
    to generate_speech by the /tts handler.
    """
    text: str  # text to synthesize
    speed: float  # passed as the `speed` argument of generate_speech
    language: str  # language code, e.g. "en-us"
    pitch: float  # passed as the `pitch` argument of generate_speech
    emotion: list[float]  # emotion weights; the WebSocket endpoint's default vector has 8 values

@app.post('/tts')
async def tts_post(request: TTSRequest):
    """Synthesize speech for one request, upload the audio, and return its URL.

    generate_speech is called with five positional arguments
    (text, language, speed, pitch, emotion). The call is synchronous, so this
    handler does not yield to the event loop while audio is being generated.

    Args:
        request (TTSRequest): Validated request body.

    Returns:
        dict: ``audioUrl`` (URL of the uploaded audio) and ``processingTime``
        (seconds spent on generation plus upload).

    Raises:
        HTTPException: 502 when the upload fails. upload_audio signals failure by
            returning None; previously that was returned to the client as a
            successful response with a null ``audioUrl``.
    """
    from fastapi import HTTPException  # local import: the cell's import line does not bring it in

    start_time = time.time()

    output = generate_speech(
        request.text,
        request.language,
        request.speed,
        request.pitch,
        request.emotion,
    )
    url = upload_audio(f'./{output}')
    if url is None:
        raise HTTPException(status_code=502, detail="Audio upload failed")

    processing_time = time.time() - start_time

    return {
        'audioUrl': url,
        'processingTime': processing_time
    }

# New WebSocket endpoint
@app.websocket("/ws/tts")
async def websocket_tts(websocket: WebSocket):
    """Serve TTS requests over a WebSocket, streaming status updates to the client.

    For every text frame received, the handler expects a JSON object with the
    optional keys ``text``, ``speed``, ``language``, ``pitch`` and ``emotion``
    and replies with a sequence of JSON status messages:
    "processing" -> "generating" -> "uploading" -> "completed" (or "error").

    Error handling:
        * A failed upload (upload_audio returned None) is reported with status
          "error" and the session keeps running, instead of being announced as
          a successful completion with a null URL.
        * Any other exception is reported with status "error" and ends the
          session. If that report itself cannot be sent (e.g. the connection
          is already gone), the failure is printed rather than raised.
        * A client disconnect ends the session quietly.
    """
    await websocket.accept()

    try:
        while True:
            # Receive message from client
            data = await websocket.receive_text()
            request_data = json.loads(data)
            if not isinstance(request_data, dict):
                # Give the client a readable message instead of an AttributeError from .get()
                raise ValueError("Request must be a JSON object")

            # Send processing started message
            await websocket.send_text(json.dumps({
                "status": "processing",
                "message": "Starting TTS generation..."
            }))

            start_time = time.time()

            # Extract parameters, falling back to this endpoint's defaults
            text = request_data.get('text', '')
            speed = request_data.get('speed', 15.0)
            language = request_data.get('language', 'en-us')
            pitch = request_data.get('pitch', 20.0)
            emotion = request_data.get('emotion', [0.3077, 0.0256, 0.0256, 0.0256, 0.0256, 0.0256, 0.2564, 0.3077])

            # Send audio generation status
            await websocket.send_text(json.dumps({
                "status": "generating",
                "message": "Generating audio..."
            }))

            # Generate speech (synchronous call: nothing else is awaited until it returns)
            output = generate_speech(text, language, speed, pitch, emotion)

            generation_time = time.time() - start_time

            # Send upload status
            await websocket.send_text(json.dumps({
                "status": "uploading",
                "message": "Uploading audio...",
                "generationTime": generation_time
            }))

            # Upload to Cloudinary
            url = upload_audio(f'./{output}')

            total_processing_time = time.time() - start_time
            upload_time = total_processing_time - generation_time
            timings = {
                "total": total_processing_time,
                "generation": generation_time,
                "upload": upload_time
            }

            if url is None:
                # upload_audio returns None on failure; the connection is still healthy,
                # so report the problem and wait for the next request.
                await websocket.send_text(json.dumps({
                    "status": "error",
                    "message": "Error occurred: audio upload failed",
                    "processingTime": timings
                }))
                continue

            # Send final result
            await websocket.send_text(json.dumps({
                "status": "completed",
                "audioUrl": url,
                "processingTime": timings,
                "message": "TTS generation completed successfully!"
            }))

    except WebSocketDisconnect:
        print("Client disconnected")
    except Exception as e:
        try:
            await websocket.send_text(json.dumps({
                "status": "error",
                "message": f"Error occurred: {str(e)}"
            }))
        except Exception as send_error:
            # The socket may already be closed; do not let the error report raise a second error.
            print(f"Could not report error to client ({send_error}); original error: {e}")

In [None]:
import nest_asyncio
from pyngrok import ngrok
import uvicorn

import os
from getpass import getpass

# The ngrok token must not be stored in the notebook: read it from the environment,
# falling back to an interactive (non-echoing) prompt when the variable is unset.
# A token that was previously committed in this cell should be treated as leaked and revoked.
auth_token = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok auth token: ")
ngrok.set_auth_token(auth_token)

# Open a public tunnel to the local port the server will listen on
ngrok_tunnel = ngrok.connect(8000)
print('Public URL:', ngrok_tunnel.public_url)

# Presumably needed so uvicorn can run inside the notebook's already-running event loop — TODO confirm.
nest_asyncio.apply()

# Blocks this cell until the server is stopped
uvicorn.run(app, port=8000)

In [None]:
# output_file = generate_speech(
#     "नमस्ते दुनिया",
#     language="hi",
#     # speed=1.0
# )

# # Play the generated audio
# from IPython.display import Audio
# Audio(output_file)

In [None]:
# import torch
# from transformers import pipeline

# pipeline = pipeline(
#     task="text2text-generation",
#     model="google/t5",
#     torch_dtype=torch.float16,
#     device=0
# )
# pipeline("translate English to Hindi: The weather is nice today.")