In [3]:
import os
import time
import torch
import IPython.display as ipd
from PIL import Image
from pydub import AudioSegment

# Riffusion imports
from riffusion.riffusion_pipeline import RiffusionPipeline
from riffusion.datatypes import InferenceInput, PromptInput
from riffusion.spectrogram_image_converter import SpectrogramImageConverter
from riffusion.spectrogram_params import SpectrogramParams

# ---------------------------
# 1. Initialize the pipeline
# ---------------------------
# Load the pretrained "riffusion/riffusion-model-v1" weights as float32.
pipe = RiffusionPipeline.from_pretrained(
    "riffusion/riffusion-model-v1", torch_dtype=torch.float32
)

# Use the GPU when CUDA is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)
pipe.to(device)

# -------------------------------------
# 2. Prepare inference inputs (prompts)
# -------------------------------------
# Both endpoints use the same text prompt and denoising value; only the seed
# differs between them.
prompt_text = "serene meditation music, nature sounds, no vocals, 60bpm"

start_prompt = PromptInput(prompt=prompt_text, seed=42, denoising=0.75)
end_prompt = PromptInput(prompt=prompt_text, seed=123, denoising=0.75)

inference_input = InferenceInput(
    start=start_prompt,
    end=end_prompt,
    # Interpolation position between start and end; alpha=0 would mean
    # no interpolation.
    alpha=0.75,
    # e.g. 50 steps
    num_inference_steps=50,
    # Kept for reference only; the seed image is not auto-loaded in this
    # version and is read from disk explicitly further down.
    seed_image_id="vibes",
)

# --------------------------------------------------
# 3. Load or create an init_image (spectrogram seed)
# --------------------------------------------------
# Path to a local spectrogram image (here 'vibes.png') that seeds generation.
seed_image_path = "/Users/dev/workspace/product-pocs-python/vibes.png"

# Stop early with an actionable message when the seed image is not on disk.
seed_image_present = os.path.exists(seed_image_path)
if not seed_image_present:
    missing_seed_message = (
        f"Cannot find {seed_image_path}. "
        "Please place an appropriate spectrogram seed image in this path."
    )
    raise FileNotFoundError(missing_seed_message)

# Open the seed image and convert it to RGB mode.
init_image = Image.open(seed_image_path).convert("RGB")

# -------------------------------------------
# 4. Generate the spectrogram via riffuse()
# -------------------------------------------
# Record the start so the elapsed generation time can be reported.
start_time = time.time()
print("Generating spectrogram...")

try:
    # The seed image is handed to riffuse() explicitly through init_image.
    spectrogram_image = pipe.riffuse(inference_input, init_image=init_image)
    elapsed_seconds = time.time() - start_time
    print(f"Spectrogram generated in {elapsed_seconds:.2f} seconds.")
except Exception as generation_error:
    # Report the failure, then let the original exception propagate unchanged.
    print(f"Spectrogram generation failed: {generation_error}")
    raise

# --------------------------------------------
# 5. Convert the spectrogram to an audio clip
# --------------------------------------------
print("Converting spectrogram to audio...")

# Build a converter with default spectrogram parameters on the same device
# as the pipeline, then turn the generated image into an audio segment.
converter = SpectrogramImageConverter(params=SpectrogramParams(), device=device)
audio_segment = converter.audio_from_spectrogram_image(spectrogram_image)

# Optional: cap the clip at duration_sec seconds (the slice index is in
# milliseconds), then normalize with headroom=5 (pydub treats this as dB).
duration_sec = 30
audio_segment = audio_segment[:duration_sec * 1000]
audio_segment = audio_segment.normalize(headroom=5)

# Timestamped file name so repeated runs do not overwrite earlier output.
output_filename = f"meditation_{int(time.time())}.wav"

# export() returns the open output file handle. Close it explicitly so the
# WAV is fully flushed and released before it is read back below, rather
# than relying on garbage collection to close the discarded handle.
audio_segment.export(output_filename, format="wav").close()

print(f"Audio saved to: {output_filename}")
ipd.display(ipd.Audio(output_filename))


vae/diffusion_pytorch_model.safetensors not found


Loading pipeline components...:   0%|          | 0/7 [00:00<?, ?it/s]

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.


cpu
Generating spectrogram...


  0%|          | 0/38 [00:00<?, ?it/s]

Spectrogram generated in 143.70 seconds.
Converting spectrogram to audio...
Audio saved to: meditation_1742014840.wav
