In [None]:
"""
This notebook is used to play with the dataset: displaying images and understanding the structure of the dataset.
Note: we can use this to check if duplicate images also have duplicate captions.
"""
%load_ext autoreload
%autoreload 2
import pandas as pd
import os
import numpy as np

In [None]:
# specify dataset and paths to work on
DATASET_ENTITY_COUNT = 20_000 # 1_900_000 20_000 222

# The data root can be overridden with the LAION_DATA_ROOT environment variable;
# without it the original hard-coded location is used.
DATA_ROOT = os.environ.get("LAION_DATA_ROOT", "/Users/yavuz/data")
DATASET_BASE_PATH = f"{DATA_ROOT}/LAION-{DATASET_ENTITY_COUNT}/"

METADATA_PATH = DATASET_BASE_PATH + "metadata.parquet"
IMAGES_PATH = DATASET_BASE_PATH + "images/"

vector_path = DATASET_BASE_PATH + "vectors/"

# Fail fast and name the missing path, so a wrong DATASET_ENTITY_COUNT or data
# root is obvious before any later cell runs.
assert os.path.exists(METADATA_PATH), f"metadata file not found: {METADATA_PATH}"
assert os.path.exists(IMAGES_PATH), f"images directory not found: {IMAGES_PATH}"
assert os.path.exists(vector_path), f"vectors directory not found: {vector_path}"

In [None]:
# read the metadata table from the parquet file and preview its "TEXT" column
metadata=pd.read_parquet(METADATA_PATH)
metadata["TEXT"]

In [None]:
# display single match
# NOTE: `id` shadows the Python builtin of the same name; later cells read this
# variable, so renaming it means updating every cell that uses it.
id = 69
# positional lookup: shows the row at position `id` of the metadata table
metadata.iloc[id]

In [None]:
# show the chosen position together with that row's TEXT, URL and "index" values;
# .iloc keeps the lookup positional, matching the `metadata.iloc[id]` cell
id, metadata["TEXT"].iloc[id], metadata["URL"].iloc[id], metadata["index"].iloc[id]

In [None]:
from IPython.display import display, Image
def get_image(vector_id: int, images_path: str):
    """
    Given a vector id and base images path (IMAGES_PATH), returns the image.

    The file is looked up at ``<images_path>/<shard>/<shard><index>.jpg`` where
    ``shard`` is ``vector_id // 10000`` zero-padded to 5 digits and ``index`` is
    ``vector_id % 10000`` zero-padded to 4 digits.

    Args:
        vector_id: integer id of the vector whose image should be loaded.
        images_path: base directory containing the shard sub-directories.

    Returns:
        An IPython ``Image`` built from the resolved file name.
    """
    shard = str(vector_id // 10000).zfill(5)
    index = str(vector_id % 10000).zfill(4)
    # os.path.join avoids a doubled separator when images_path already ends with "/"
    image_path = os.path.join(images_path, shard, f"{shard}{index}.jpg")
    return Image(filename=image_path)
# render the image that belongs to the currently selected `id`
display(get_image(id, IMAGES_PATH))

In [None]:
# load the full text and image vector arrays from the vectors directory
# (individual rows are picked out by `id` in later cells)
text_vector_path = f"{vector_path}text_vectors"
image_vector_path = f"{vector_path}image_vectors"

text_vectors = np.load(f"{text_vector_path}.npy")
image_vectors = np.load(f"{image_vector_path}.npy")

In [None]:
# dot it with itself
# Only a cell's last expression is displayed, so the dot product and the vector
# are returned together as one tuple instead of discarding the dot product.
np.dot(text_vectors[id], text_vectors[id]), text_vectors[id]

In [None]:
# dot the image vector with itself and show it next to the vector; a tuple is
# used because only a cell's last expression is displayed
np.dot(image_vectors[id], image_vectors[id]), image_vectors[id]

In [None]:
# look at placeholder images:
# load the saved placeholder array from the vectors directory and report its shape
placeholder_images_file_name = "placeholder_images-ALLPAIRS-3percent-duplicates-bad.npy"
placeholder_images = np.load(f"{vector_path}{placeholder_images_file_name}")
placeholder_images.shape

In [None]:
# play with audio generation
from diffusers import AudioLDM2Pipeline
import torch
import scipy

In [None]:
# load the AudioLDM2 pipeline from the "cvssp/audioldm2" repo with float32 weights
repo_id = "cvssp/audioldm2"
pipe = AudioLDM2Pipeline.from_pretrained(repo_id, torch_dtype=torch.float32)

# move the pipeline to the GPU only when CUDA is available; otherwise it stays where it was loaded
if torch.cuda.is_available():
    pipe = pipe.to("cuda")

In [None]:
def generate_audio_from_text(
    text: str,
    pipe: "AudioLDM2Pipeline",
    num_inference_steps: int = 10,
    audio_length_in_s: float = 3.0,
    num_waveforms_per_prompt: int = 2,
):
    """
    Generate audio for a piece of text with the given pipeline.

    The text is prefixed with "Sound associated with: " and passed to ``pipe``
    together with the fixed negative prompt "Low quality.".

    Args:
        text: text the prompt is built from.
        pipe: pipeline object, called as ``pipe(prompt, negative_prompt=..., ...)``.
        num_inference_steps: forwarded to ``pipe`` (default 10; original was 200).
        audio_length_in_s: forwarded to ``pipe`` (default 3.0; original was 5.0).
        num_waveforms_per_prompt: forwarded to ``pipe`` (default 2; original was 3).

    Returns:
        Whatever ``pipe`` returns, unchanged.
    """
    prompt = "Sound associated with: " + text
    negative_prompt = "Low quality."

    # run the generation
    audio = pipe(
        prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        audio_length_in_s=audio_length_in_s,
        num_waveforms_per_prompt=num_waveforms_per_prompt,
    )
    return audio

In [None]:
# run generation for the text "Oval_drum_small2"; the pipeline result is kept in `audio`
audio = generate_audio_from_text("Oval_drum_small2", pipe)

In [None]:
# pull the `audios` attribute off the pipeline result for use in the next cells
a=audio.audios

In [None]:
# import the submodule explicitly: a bare `import scipy` does not guarantee that
# `scipy.io.wavfile` has been loaded
import scipy.io.wavfile

# write the first entry of `a` to audio.wav with a sample rate of 16000 Hz
scipy.io.wavfile.write("audio.wav", rate=16000, data=a[0])

In [None]:
# inspect the pipeline's `vae` attribute (shown as the cell output)
pipe.vae

In [None]:
# keep a handle to the VAE's `encoder` attribute, then show the first entry of `a`
vae_encoder = pipe.vae.encoder
a[0]

In [None]:
# try to encode the first entry of `a` with the VAE encoder
# NOTE(review): `a` comes from the pipeline result's `audios`; if that is a NumPy
# array rather than a torch tensor in the layout the encoder expects, this call
# will fail — confirm the required input type/shape before relying on this cell
audio_tensor = a[0]
latent_representation = vae_encoder(audio_tensor)

In [None]:
# same attempt as the previous cell, but passing the whole `a` instead of `a[0]`
# NOTE(review): presumably hits the same input type/shape question as the
# single-entry call — confirm what the encoder accepts
latent_representation = vae_encoder(a)

In [None]:
# NOTE: rebinds `vae_encoder` to the whole `pipe.vae` object; an earlier cell
# bound the same name to `pipe.vae.encoder`, so its meaning depends on cell order
vae_encoder = pipe.vae

In [None]:
# NOTE(review): this indexes the pipeline result object itself, not `audio.audios`;
# generation requested num_waveforms_per_prompt=2, so index 2 looks out of range
# if a generated waveform was intended — confirm the intent
audio[2]

In [None]:
# second generation run with the same text as before; result kept in `audio2`
audio2 = generate_audio_from_text("Oval_drum_small2", pipe)

In [None]:
# display the `audios` attribute of the second result
audio2.audios

In [None]:
# keep the second result's `audios` under its own name
actual_audio2= audio2.audios