### Script for sampling from Riffusion-v1 (unmodified) with data used to train Riff-Cnet

Run this script to see how Riffusion behaves without ControlNet when it is given the same conditioning inputs that Riff-Cnet receives and is trained on.

In [None]:
# # UNCOMMENT AND RUN THIS BLOCK IF USING GOOGLE COLAB

# from google.colab import drive
# drive.mount("/content/drive/")

# ## cd into desired directory 
# %cd drive/MyDrive/<my-directory>
# %ls

# # pull git repo 
# # get training data
# !git clone https://github.com/zachary-shah/riff-cnet.git
# %cd riff-cnet/riffusion_img2img

# # may need to install some dependencies (only run once)
# !pip install -q -r requirements.txt
# !pip install Pillow==9.0.0
# !pip install Pillow==9.4.0

In [2]:
import os, sys
sys.path.append('../')
from pathlib import Path

import PIL
from matplotlib import cm
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt

from datatypes import Img2ImgInput
from riffusion_img2img_pipeline import RiffusionImg2ImgPipeline
from riffusion.spectrogram_image_converter import SpectrogramImageConverter
from riffusion.spectrogram_params import SpectrogramParams
from cnet_riff_dataset import CnetRiffDataset

In [None]:
# Converter used to turn spectrogram images back into audio (runs on the GPU).
params = SpectrogramParams(sample_rate=44100, min_frequency=0, max_frequency=10000)
converter = SpectrogramImageConverter(params=params, device="cuda")

# Load the unmodified Riffusion v1 checkpoint as an img2img pipeline, also on the GPU.
pipeline = RiffusionImg2ImgPipeline.load_checkpoint(
    checkpoint="riffusion/riffusion-model-v1", use_traced_unet=True, device="cuda"
)

In [44]:
def _save_image_and_audio(converter, image, output_dir, stem, tag):
    """Write `image` as `<stem>_<tag>.png` and its reconstructed audio as `<stem>_<tag>.wav`."""
    image.save(os.path.join(output_dir, f"{stem}_{tag}.png"))

    # Reconstruct audio from the spectrogram image and export it next to the image.
    audio = converter.audio_from_spectrogram_image(
        image,
        apply_filters=True,
    )
    audio.export(os.path.join(output_dir, f"{stem}_{tag}.wav"), format="wav")


# function to sample from the current state of riffusion using training data format for this project
def sample_vanilla_riffusion(pipeline, converter, seed_img, target, prompt, output_dir,
                             *, seed=0, denoising=0.75, guidance=7.0, ddim_steps=50):
    """Sample once from the riffusion pipeline and save sample, target and seed to disk.

    For each of the sampled image, `target` and `seed_img`, a PNG and a WAV file
    are written to `output_dir`, named `<prompt>_sample`, `<prompt>_target` and
    `<prompt>_seed`. Files from an earlier call with the same prompt are
    overwritten.

    Args:
        pipeline: object exposing `riffuse(inputs, use_reweighting=...)`.
        converter: object exposing `audio_from_spectrogram_image(image, apply_filters=...)`.
        seed_img: initial spectrogram image passed to the pipeline
            (must support `.save(path)`; presumably a PIL image).
        target: ground-truth spectrogram image, saved for comparison only.
        prompt: text prompt; also used as the file name stem.
        output_dir: directory to write into; created if it does not exist.
        seed: random seed forwarded to `Img2ImgInput`.
        denoising: denoising value forwarded to `Img2ImgInput`.
        guidance: guidance value forwarded to `Img2ImgInput`.
        ddim_steps: number of DDIM steps forwarded to `Img2ImgInput`.

    Returns:
        None. All results are written to disk.
    """
    os.makedirs(output_dir, exist_ok=True)

    # The prompt doubles as the file name stem. A path separator inside it would make
    # os.path.join point into a sub-directory that does not exist, so neutralise those.
    stem = prompt
    for separator in filter(None, (os.sep, os.altsep)):
        stem = stem.replace(separator, "_")

    # setup model input
    inputs = Img2ImgInput(
        text_prompt=prompt,
        seed=seed,
        init_spectrogram=seed_img,
        mask_image=None,
        negative_prompt=None,
        denoising=denoising,
        guidance=guidance,
        ddim_steps=ddim_steps,
        ddim_eta=0.0,
    )

    # Execute the model to get the spectrogram image
    sample = pipeline.riffuse(
        inputs,
        use_reweighting=True,
    )

    # Same write order as before: sample, then target, then seed.
    _save_image_and_audio(converter, sample, output_dir, stem, "sample")
    _save_image_and_audio(converter, target, output_dir, stem, "target")
    _save_image_and_audio(converter, seed_img, output_dir, stem, "seed")


In [None]:
# Here we sample from riffusion using the canny-edge sources as seeds.
# It doesn't work very well: the samples come out as noise.
output_dir = "riff_samples_seeded_by_sources"
os.makedirs(output_dir, exist_ok=True)

test_dataset = CnetRiffDataset("train-data/")
# A bare `len(...)` in the middle of a cell displays nothing, so print it explicitly.
print(f"{len(test_dataset)} items in dataset")


def _sample_seeded_by_source(item, out_dir):
    """Sample from vanilla riffusion for one dataset item, seeding with its source image."""
    prompt = item['txt']
    # The (x + 1) / 2 * 255 rescaling suggests item['jpg'] is a float image in [-1, 1]
    # (TODO confirm against CnetRiffDataset). Round instead of truncating so float error
    # cannot shift a pixel value down by one, and clip so the uint8 cast cannot wrap around.
    pixels = np.clip(np.rint((np.asarray(item['jpg']) + 1) / 2 * 255), 0, 255)
    target = Image.fromarray(np.uint8(pixels))
    source = Image.fromarray(item['hint'])
    # seed is the source image --> this will just produce samples that are noise
    sample_vanilla_riffusion(pipeline, converter, source, target, prompt, out_dir)


# get a bunch of rock and pop examples: every 100th item up to index 2000
for i, item in enumerate(test_dataset):
    if i > 2000:
        break
    if i % 100 == 0:
        _sample_seeded_by_source(item, output_dir)

# get one reggae example and one country example (first match in the dataset)
for genre in ("reggae", "country"):
    for item in test_dataset:
        if genre in item['txt']:
            _sample_seeded_by_source(item, output_dir)
            break
    else:
        print(f"no prompt containing '{genre}' found in dataset")

In [None]:
# Now we sample from riffusion using the target spectrograms themselves as seeds. The audio
# sounds more coherent, but the background audio is not well preserved.

output_dir = "riff_samples_seeded_by_targets"
os.makedirs(output_dir, exist_ok=True)

test_dataset = CnetRiffDataset("train-data/")
# A bare `len(...)` in the middle of a cell displays nothing, so print it explicitly.
print(f"{len(test_dataset)} items in dataset")


def _sample_seeded_by_target(item, out_dir):
    """Sample from vanilla riffusion for one dataset item, seeding with its target image."""
    prompt = item['txt']
    # The (x + 1) / 2 * 255 rescaling suggests item['jpg'] is a float image in [-1, 1]
    # (TODO confirm against CnetRiffDataset). Round instead of truncating so float error
    # cannot shift a pixel value down by one, and clip so the uint8 cast cannot wrap around.
    pixels = np.clip(np.rint((np.asarray(item['jpg']) + 1) / 2 * 255), 0, 255)
    target = Image.fromarray(np.uint8(pixels))
    # seed is the target --> produces much better sound
    # (the canny-edge hint is not needed here, so it is no longer decoded)
    sample_vanilla_riffusion(pipeline, converter, target, target, prompt, out_dir)


# get a bunch of rock and pop examples: every 100th item up to index 2000
for i, item in enumerate(test_dataset):
    if i > 2000:
        break
    if i % 100 == 0:
        _sample_seeded_by_target(item, output_dir)

# get one reggae example and one country example (first match in the dataset)
for genre in ("reggae", "country"):
    for item in test_dataset:
        if genre in item['txt']:
            _sample_seeded_by_target(item, output_dir)
            break
    else:
        print(f"no prompt containing '{genre}' found in dataset")