In [None]:
# Install required packages
!pip install fastapi uvicorn torch diffusers transformers scipy numpy pydantic python-multipart slowapi pyngrok

Collecting fastapi
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn
  Downloading uvicorn-0.32.1-py3-none-any.whl.metadata (6.6 kB)
Collecting python-multipart
  Downloading python_multipart-0.0.19-py3-none-any.whl.metadata (1.8 kB)
Collecting slowapi
  Downloading slowapi-0.1.9-py3-none-any.whl.metadata (3.0 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Collecting starlette<0.42.0,>=0.40.0 (from fastapi)
  Downloading starlette-0.41.3-py3-none-any.whl.metadata (6.0 kB)
Collecting limits>=2.3 (from slowapi)
  Downloading limits-3.14.1-py3-none-any.whl.metadata (7.2 kB)
Downloading fastapi-0.115.6-py3-none-any.whl (94 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m94.8/94.8 kB[0m [31m384.4 kB/s[0m eta [36m0:00:00[0m
[?25hDownloading uvicorn-0.32.1-py3-none-any.whl (63 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.8/63.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m


In [None]:
from fastapi import FastAPI, HTTPException, Request
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import torch
from diffusers import AudioLDMPipeline
import numpy as np
import base64
import io
import scipy.io.wavfile
from typing import Optional
from slowapi import Limiter
from slowapi.errors import RateLimitExceeded
from contextlib import contextmanager

# Initialize FastAPI app
app = FastAPI(title="AudioLDM API",
             description="Text to Audio Generation API using AudioLDM",
             version="1.0.0")

# Initialize rate limiter
# Every request maps to the same key, so any limit would be shared by all clients
# instead of being tracked per IP.
# NOTE(review): in the code visible here the limiter is never attached to `app` and
# no route declares a limit, so no rate limiting is actually applied - confirm intent.
limiter = Limiter(key_func=lambda _: "global")

# Add CORS middleware
# Wildcard origins cannot be combined with credentialed requests, so credentials are
# disabled. The API uses no cookies or auth headers; if credentials are ever needed,
# list the allowed origins explicitly instead of using "*".
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Model initialization
REPO_ID = "cvssp/audioldm-s-full-v2"
device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = None  # populated lazily by get_model() on first use

class AudioRequest(BaseModel):
    """Request body accepted by the ``/generate-audio`` endpoint."""

    # Text description of the audio to generate; the endpoint rejects an empty prompt.
    prompt: str
    # Clip length in seconds (forwarded as audio_length_in_s); the endpoint
    # accepts values greater than 0 and at most 30.
    audio_length: Optional[float] = 5.0
    # Number of inference steps; the endpoint accepts values from 1 to 50.
    num_inference_steps: Optional[int] = 10
    # Forwarded unchanged to the pipeline's guidance_scale argument.
    guidance_scale: Optional[float] = 2.5
    # Forwarded unchanged to the pipeline's negative_prompt argument; None omits it.
    negative_prompt: Optional[str] = None

@contextmanager
def get_model():
    """Yield the shared AudioLDM pipeline, loading it on first use.

    The pipeline is created once from ``REPO_ID`` (float16 when CUDA is
    available, float32 otherwise), moved to ``device`` and cached in the
    module-level ``pipe``. When the ``with`` block exits, normally or through
    an exception, the CUDA allocator cache is released if CUDA is available.

    Yields:
        The cached pipeline object.
    """
    global pipe
    if pipe is None:
        loaded = AudioLDMPipeline.from_pretrained(
            REPO_ID,
            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
        )
        # Publish to the global only after the device move succeeded; otherwise a
        # failed move would leave a half-initialised pipeline cached for every
        # later request.
        pipe = loaded.to(device)
    try:
        yield pipe
    finally:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
@app.post("/generate-audio")
async def generate_audio(
    request: Request,  # not read by this handler; kept so the signature is unchanged
    audio_request: AudioRequest
):
    """Generate audio for a text prompt and return it as base64-encoded WAV.

    Args:
        request: The incoming HTTP request (unused in the body).
        audio_request: Prompt and generation parameters.

    Returns:
        A dict with ``status``, ``audio_base64`` (WAV bytes, base64 text),
        ``sample_rate`` and ``duration`` (the requested length in seconds).

    Raises:
        HTTPException: 400 when a parameter is missing or out of range,
            500 when model loading, generation or encoding fails.
    """
    sample_rate = 16000

    # Validate before loading the model and outside the try block: bad requests are
    # rejected cheaply, and their 400 status is not rewritten to 500 by the generic
    # handler below.
    if not audio_request.prompt or not audio_request.prompt.strip():
        raise HTTPException(status_code=400, detail="Prompt cannot be empty")

    # The numeric fields are Optional, so an explicit JSON null arrives as None and
    # must be rejected before it is compared against the bounds.
    length = audio_request.audio_length
    if length is None or length <= 0 or length > 30:
        raise HTTPException(status_code=400, detail="Audio length must be between 0 and 30 seconds")

    steps = audio_request.num_inference_steps
    if steps is None or steps <= 0 or steps > 50:
        raise HTTPException(status_code=400, detail="Number of inference steps must be between 0 and 50")

    if audio_request.guidance_scale is None:
        raise HTTPException(status_code=400, detail="Guidance scale must be a number")

    try:
        with get_model() as model:
            # Generate audio
            audio = model(
                prompt=audio_request.prompt,
                audio_length_in_s=length,
                num_inference_steps=steps,
                guidance_scale=audio_request.guidance_scale,
                negative_prompt=audio_request.negative_prompt
            ).audios[0]

            # Convert to WAV format
            buffer = io.BytesIO()
            scipy.io.wavfile.write(buffer, rate=sample_rate, data=audio)

            # Convert to base64
            audio_base64 = base64.b64encode(buffer.getvalue()).decode()

            return {
                "status": "success",
                "audio_base64": audio_base64,
                "sample_rate": sample_rate,
                "duration": length
            }

    except HTTPException:
        # Never downgrade a deliberate HTTP error to a generic 500.
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e

@app.get("/")
async def root():
    """Return the service banner: a description, the API version and a status flag."""
    service_info = dict(
        message="AudioLDM Text-to-Audio Generation API",
        version="1.0.0",
        status="active",
    )
    return service_info

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


0it [00:00, ?it/s]

In [None]:
from pyngrok import ngrok
import uvicorn
import threading
import time
from IPython.display import clear_output

In [None]:
# Register the ngrok authtoken without storing the secret in the notebook.
import os
from getpass import getpass

# Read the token from the NGROK_AUTHTOKEN environment variable, or prompt for it
# (getpass does not echo the input into the cell output).
NGROK_AUTHTOKEN = os.environ.get("NGROK_AUTHTOKEN") or getpass("ngrok authtoken: ")
ngrok.set_auth_token(NGROK_AUTHTOKEN)

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [None]:
import socket

SERVER_HOST = "127.0.0.1"  # address uvicorn reports binding to when no host is given
SERVER_PORT = 8000


def run_server():
    """Run the FastAPI app with uvicorn; blocks the calling thread until it exits."""
    uvicorn.run(app, port=SERVER_PORT)


def wait_for_server(host, port, timeout=30.0, poll_interval=0.2):
    """Poll until a TCP connection to host:port succeeds.

    Returns:
        True as soon as the port accepts a connection, False if `timeout`
        seconds pass without one.
    """
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        try:
            with socket.create_connection((host, port), timeout=poll_interval):
                return True
        except OSError:
            time.sleep(poll_interval)
    return False


print("Starting server...")
# Daemon thread: the server must not keep the kernel process alive on shutdown.
thread = threading.Thread(target=run_server)
thread.daemon = True
thread.start()

# Wait for server startup by probing the port instead of sleeping a fixed time,
# so the tunnel is never opened against a server that is not listening yet.
if not wait_for_server(SERVER_HOST, SERVER_PORT):
    raise RuntimeError(f"Server did not start listening on port {SERVER_PORT}")

# Start ngrok tunnel
public_url = ngrok.connect(SERVER_PORT)
print(f"\nAPI is now available at: {public_url}")
print("\nUse this URL in the test cell below")

INFO:     Started server process [496]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)


Starting server...

API is now available at: NgrokTunnel: "https://06cc-34-19-81-10.ngrok-free.app" -> "http://localhost:8000"

Use this URL in the test cell below


# TEST CODE FOR THE AUDIO API

In [None]:
import requests
import base64
import IPython.display as ipd
import io
import numpy as np
import soundfile as sf

def generate_and_play_audio(
    prompt,
    api_url,
    audio_length=5.0,
    num_inference_steps=10,
    guidance_scale=2.5,
    output_path="generated_audio.wav",
    timeout=600,
):
    """
    Generate audio from prompt and play it

    Posts the prompt to ``{api_url}/generate-audio``, saves the returned WAV
    bytes to ``output_path`` and returns an IPython audio player for them.

    Args:
        prompt: Text description of the audio to generate.
        api_url: Base URL of the API; trailing slashes are ignored.
        audio_length: Requested clip length in seconds.
        num_inference_steps: Number of inference steps to request.
        guidance_scale: Guidance scale to request.
        output_path: Where the decoded WAV file is written.
        timeout: Seconds to wait for the HTTP response before giving up, so a
            stalled server or tunnel cannot hang the cell forever.

    Returns:
        An ``IPython.display.Audio`` object on HTTP 200; ``None`` for any other
        status code (the error body is printed).

    Raises:
        Exception: Any error from the request, decoding or file write is
            printed and then re-raised unchanged.
    """
    # Make sure to strip any trailing slashes from the API URL
    api_url = api_url.rstrip('/')

    payload = {
        "prompt": prompt,
        "audio_length": audio_length,
        "num_inference_steps": num_inference_steps,
        "guidance_scale": guidance_scale
    }

    print(f"Generating audio for prompt: '{prompt}'")

    try:
        response = requests.post(
            f"{api_url}/generate-audio",
            json=payload,
            headers={'Content-Type': 'application/json'},
            timeout=timeout
        )

        print(f"Response status code: {response.status_code}")
        print(f"Response headers: {response.headers}")

        if response.status_code != 200:
            print(f"Error {response.status_code}:")
            print(f"Response content: {response.text}")
            return None

        print("Generation successful!")
        audio_data = base64.b64decode(response.json()["audio_base64"])

        # Save to file
        with open(output_path, "wb") as f:
            f.write(audio_data)

        # Play audio
        audio_array, sample_rate = sf.read(io.BytesIO(audio_data))
        return ipd.Audio(audio_array, rate=sample_rate)
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        raise

In [None]:
# Test the API
# ngrok hands out a new hostname on every run, so a pasted URL goes stale. Prefer the
# tunnel object created by the server cell in this kernel (pyngrok's NgrokTunnel
# exposes the address as `.public_url`); otherwise use the manually pasted URL.
FALLBACK_API_URL = "https://06cc-34-19-81-10.ngrok-free.app/"  # Replace with your actual ngrok URL
API_URL = getattr(globals().get("public_url"), "public_url", None) or FALLBACK_API_URL
test_prompt = "Techno music with a strong, upbeat tempo and high melodic riffs"
# Keep the call as the last expression so the returned audio player is displayed.
generate_and_play_audio(test_prompt, API_URL)

Generating audio for prompt: 'Techno music with a strong, upbeat tempo and high melodic riffs'


model_index.json:   0%|          | 0.00/462 [00:00<?, ?B/s]

Fetching 15 files:   0%|          | 0/15 [00:00<?, ?it/s]

text_encoder/config.json:   0%|          | 0.00/843 [00:00<?, ?B/s]

tokenizer/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/424 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/439 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/501M [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/534 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/740M [00:00<?, ?B/s]

vocoder/config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/222M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/221M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

INFO:     34.19.81.10:0 - "POST /generate-audio HTTP/1.1" 200 OK
Response status code: 200
Response headers: {'Content-Length': '426817', 'Content-Type': 'application/json', 'Date': 'Fri, 06 Dec 2024 14:10:36 GMT', 'Ngrok-Agent-Ips': '34.19.81.10', 'Server': 'uvicorn'}
Generation successful!


## Stopping the server

In [None]:
# Stop the server
import os
import signal
import subprocess

from pyngrok import ngrok  # imported here so the cell also works in a fresh kernel

# Close the tunnel first so nothing public points at the port any more.
ngrok.kill()

# Collect the PIDs using port 8000. An empty result is handled explicitly instead
# of invoking `kill` with no arguments.
try:
    lsof_result = subprocess.run(
        ["lsof", "-t", "-i:8000"], capture_output=True, text=True
    )
    port_pids = [int(pid) for pid in lsof_result.stdout.split()]
except FileNotFoundError:
    port_pids = []
    print("lsof is not installed; cannot look up the process using port 8000.")

if not port_pids:
    print("No process found on port 8000.")

for pid in port_pids:
    if pid == os.getpid():
        # The server was started in a thread of this kernel process; killing the
        # PID would kill the notebook kernel and lose all state.
        print("The server runs inside this kernel process; restart the kernel to stop it.")
        continue
    try:
        os.kill(pid, signal.SIGKILL)
        print(f"Killed process {pid}.")
    except OSError as e:
        print(f"Could not kill process {pid}: {e}")

kill: usage: kill [-s sigspec | -n signum | -sigspec] pid | jobspec ... or kill -l [sigspec]


NameError: name 'ngrok' is not defined