In [1]:
import warnings
warnings.simplefilter('ignore')
import os
import time
import json
import shutil
import traceback
import pandas as pd
from tqdm.notebook import tqdm
import google.generativeai as genai
from audiocraft.models import musicgen
from scipy.io.wavfile import write as wav_write
from IPython.display import display, HTML, Video, Audio

2024-07-03 08:42:56.674858: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Gemini Prompt

In [2]:
# Example prompts shown to Gemini so it imitates the style MusicGen expects.
music_prompt_examples = '''
'A dynamic blend of hip-hop and orchestral elements, with sweeping strings and brass, evoking the vibrant energy of the city',
'Smooth jazz, with a saxophone solo, piano chords, and snare full drums',
'90s rock song with electric guitar and heavy drums'.
'''

# Shape of the JSON object Gemini is asked to return; the keys are read back
# verbatim by the processing loop ("Content Description", "Music Prompt").
json_schema = '''
{"Content Description": "string", "Music Prompt": "string"}
'''

# System-style instructions sent together with the video description.
gemni_instructions = f'''
You are a music supervisor who analyzes the content and tone of images and videos to describe music that fits well with the mood, evokes emotions, and enhances the narrative of the visuals. Given an image or video, describe the scene and generate a prompt suitable for music generation models. Use keywords related to genre, instruments, mood, context, and setting to craft a concise single-sentence prompt, like:

{music_prompt_examples}

You must return your response using this JSON schema: {json_schema}
'''

print(gemni_instructions)


# Never commit credentials to a notebook: read the key from the environment.
# NOTE(review): a literal key was previously stored here; it should be revoked.
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError(
        "Set the GOOGLE_API_KEY environment variable before running this notebook."
    )
genai.configure(api_key=GOOGLE_API_KEY)


You are a music supervisor who analyzes the content and tone of images and videos to describe music that fits well with the mood, evokes emotions, and enhances the narrative of the visuals. Given an image or video, describe the scene and generate a prompt suitable for music generation models. Use keywords related to genre, instruments, mood, context, and setting to craft a concise single-sentence prompt, like:


'A dynamic blend of hip-hop and orchestral elements, with sweeping strings and brass, evoking the vibrant energy of the city',
'Smooth jazz, with a saxophone solo, piano chords, and snare full drums',
'90s rock song with electric guitar and heavy drums'.


You must return your response using this JSON schema: 
{"Content Description": "string", "Music Prompt": "string"}




In [3]:
# Multimodal Gemini model used to describe each video and draft a music prompt.
mllm_model = genai.GenerativeModel(
    'gemini-1.5-flash-latest',
)

# MusicGen checkpoint placed on the GPU; every generated clip lasts 30 seconds.
musicgen_model = musicgen.MusicGen.get_pretrained(
    'facebook/musicgen-stereo-small',
    device='cuda',
)
musicgen_model.set_generation_params(
    duration=30,
)

def _extract_json_object(text):
    """Parse the outermost ``{...}`` span found in ``text`` as JSON."""
    start = text.find("{")
    end = text.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"No JSON object found in model response: {text!r}")
    return json.loads(text[start:end + 1])


def process_video_in_gemini(video_file_name):
    """Describe a video with Gemini and turn the description into a music prompt.

    The video is uploaded, polled until it leaves the "PROCESSING" state, then
    described by the model. A second model call converts that description into
    the JSON object requested by ``gemni_instructions``.

    Args:
        video_file_name: Path of the video file to upload.

    Returns:
        The parsed JSON object returned by the model (a ``dict`` when the model
        follows the requested schema).

    Raises:
        ValueError: If the upload ends in the "FAILED" state, if the model
            returns no response, if ``response.text`` is unavailable (the
            library raises ``ValueError`` for blocked responses), or if the
            reply contains no parseable JSON object.
    """
    video_file = genai.upload_file(video_file_name)
    # Poll until the service has finished ingesting the upload.
    while video_file.state.name == "PROCESSING":
        time.sleep(3)
        video_file = genai.get_file(video_file.name)

    if video_file.state.name == "FAILED":
        raise ValueError(
            f"Processing of {video_file_name!r} ended in state {video_file.state.name}"
        )

    response = mllm_model.generate_content(
        [video_file, 'Explain what is happening in this video'],
        request_options={"timeout":600}
    )

    if response is None:
        raise ValueError("Response is None")

    cleaned_response = mllm_model.generate_content([
        response.text,
        gemni_instructions
    ])

    # The reply is usually wrapped in a ```json fence. str.strip("```json\n")
    # removes a *set of characters*, not that prefix, and breaks on other fence
    # variants or trailing whitespace, so extract the JSON object explicitly.
    return _extract_json_object(cleaned_response.text)

def generate_music(music_prompt:list, audio_file_name):
    """Generate one audio clip per prompt and write each clip to a WAV file.

    Args:
        music_prompt: A prompt or a list of prompts; a single non-list value is
            wrapped in a one-element list.
        audio_file_name: Destination path. With one prompt the clip is written
            to this exact path; with several prompts clip ``i`` is written to
            ``<stem>_<i><ext>`` next to it.

    Returns:
        List of the file names written, in prompt order.
    """
    if not isinstance(music_prompt, list):
        music_prompt = [music_prompt]

    # Assumes generate() returns (batch, channels, samples), which is the layout
    # the original squeeze/transpose handling relied on for stereo batches.
    batch = musicgen_model.generate(music_prompt, progress=False).cpu().numpy()
    sample_rate = musicgen_model.sample_rate

    # splitext keeps dots in directory or base names intact, unlike split(".").
    stem, ext = os.path.splitext(audio_file_name)

    filenames = []
    for idx, clip in enumerate(batch):
        samples = clip.T  # wav_write expects (samples, channels)
        if samples.shape[1] == 1:
            samples = samples[:, 0]  # mono: write a flat 1-D signal
        filename = audio_file_name if len(batch) == 1 else f"{stem}_{idx}{ext}"
        wav_write(filename, sample_rate, samples)
        filenames.append(filename)

    return filenames

In [4]:
video_root = "videos"
audio_root = "audio"
# Start from an empty output directory so stale clips from earlier runs
# cannot be mistaken for fresh results.
if os.path.exists(audio_root):
    shutil.rmtree(audio_root)
os.makedirs(audio_root, exist_ok=True)

# Sort for a reproducible processing order and keep regular files only, so
# sub-directories (e.g. .ipynb_checkpoints) are never sent to the uploader.
video_filenames = sorted(
    name for name in os.listdir(video_root)
    if os.path.isfile(os.path.join(video_root, name))
)

# Parallel result lists: entry i of each list belongs to the same video.
video_paths = []
audio_paths = []
music_prompts = []
music_descriptions = []

for video_filename in tqdm(video_filenames, desc="Processing Videos"):
    basename = os.path.splitext(video_filename)[0]
    video_path = os.path.join(video_root, video_filename)
    audio_path = os.path.join(audio_root, f"{basename}.wav")
    try:
        generated_prompt = process_video_in_gemini(video_path)
        description, music_prompt = generated_prompt["Content Description"], generated_prompt["Music Prompt"]
    except Exception:
        # Best effort: report the failure and move on to the next video.
        print(f"Error processing video {video_path}: {traceback.format_exc()}")
        continue
    # Three clips per video from the same prompt; the display table further
    # down reads exactly three audio entries per row.
    audio_path_list = generate_music([music_prompt] * 3, audio_path)

    video_paths.append(video_path)
    audio_paths.append(audio_path_list)
    music_prompts.append(music_prompt)
    music_descriptions.append(description)


Processing Videos:   0%|          | 0/10 [00:00<?, ?it/s]

Error processing video videos/9E4F395C77BC2E31D6E3F725CFA3F3B5_video_dashinit.mp4: Traceback (most recent call last):
  File "/tmp/ipykernel_55467/810802512.py", line 20, in <module>
    generated_prompt = process_video_in_gemini(video_path)
  File "/tmp/ipykernel_55467/2414198949.py", line 23, in process_video_in_gemini
    response.text,
  File "/home/ani/miniconda3/envs/vm/lib/python3.9/site-packages/google/generativeai/types/generation_types.py", line 412, in text
    raise ValueError(
ValueError: Invalid operation: The `response.text` quick accessor requires the response to contain a valid `Part`, but none were returned. Please check the `candidate.safety_ratings` to determine if the response was blocked.

Error processing video videos/9145C4C8F212F8383F822646B553D483_video_dashinit.mp4: Traceback (most recent call last):
  File "/tmp/ipykernel_55467/810802512.py", line 20, in <module>
    generated_prompt = process_video_in_gemini(video_path)
  File "/tmp/ipykernel_55467/24141989

In [20]:
# Persist the per-video results so the display step can be rerun
# without repeating the Gemini and MusicGen calls.
data = {}
data["video_paths"] = video_paths
data["audio_paths"] = audio_paths
data["music_prompts"] = music_prompts
data["music_descriptions"] = music_descriptions

# One row per successfully processed video.
df = pd.DataFrame(data)
df.to_pickle("music_data.pkl")

In [21]:
# Restore the saved results and rebind the four working variables
# to the matching DataFrame columns.
# NOTE: unpickling can execute arbitrary code; only load files this notebook wrote.
data = pd.read_pickle("music_data.pkl")
video_paths, audio_paths, music_prompts, music_descriptions = (
    data[column]
    for column in ('video_paths', 'audio_paths', 'music_prompts', 'music_descriptions')
)


def create_html_element(url, tag):
    """Return an HTML5 ``<video>`` or ``<audio>`` player snippet for ``url``.

    Args:
        url: Path or URL of the media file; it is HTML-escaped before being
            placed in the ``src`` attribute.
        tag: Either ``'video'`` or ``'audio'``.

    Returns:
        The HTML markup as a string. The ``type`` attribute is chosen from the
        file extension, falling back to ``video/mp4`` / ``audio/mpeg`` for
        unrecognised extensions.

    Raises:
        ValueError: If ``tag`` is neither ``'video'`` nor ``'audio'``.
    """
    import html  # local import keeps the block self-contained

    media_types = {
        'video': {'.mp4': 'video/mp4', '.webm': 'video/webm', '.ogv': 'video/ogg'},
        'audio': {'.wav': 'audio/wav', '.mp3': 'audio/mpeg',
                  '.ogg': 'audio/ogg', '.flac': 'audio/flac'},
    }
    fallback = {'video': 'video/mp4', 'audio': 'audio/mpeg'}

    if tag not in media_types:
        # Previously an unknown tag silently produced None in the table.
        raise ValueError(f"Unsupported tag {tag!r}; expected 'video' or 'audio'")

    url = str(url)
    extension = os.path.splitext(url)[1].lower()
    mime = media_types[tag].get(extension, fallback[tag])
    # Escape quotes and ampersands so the URL cannot break out of the attribute.
    src = html.escape(url, quote=True)

    if tag == 'video':
        return f'<video controls width="700"><source src="{src}" type="{mime}"></video>'
    return f'<audio controls><source src="{src}" type="{mime}"></audio>'

# Assemble the table: one video player, three audio players,
# then the prompt and the scene description for each processed video.
data = {
    "Video": [create_html_element(path, 'video') for path in video_paths],
    **{
        f"Audio {slot}": [create_html_element(clips[slot - 1], 'audio') for clips in audio_paths]
        for slot in (1, 2, 3)
    },
    "Prompt": music_prompts,
    "Description": music_descriptions,
}

df = pd.DataFrame(data)

# escape=False keeps the player markup as live HTML instead of literal text.
rendered_table = df.to_html(escape=False)
display(HTML(rendered_table))

Unnamed: 0,Video,Audio 1,Audio 2,Audio 3,Prompt,Description
0,,,,,"Upbeat indie folk with acoustic guitar, banjo, and a driving rhythm, capturing the joy of exploring and the beauty of the natural landscape.","Two friends are hiking in a mountainous region, enjoying stunning views while discussing the benefits of five travel apps: PeakFinder, Splitwise, Pack Point, Translators, and Navigation. The scene is lighthearted and adventurous, with a sense of camaraderie and excitement for the journey ahead."
1,,,,,"A serene and delicate piano melody with subtle string arrangements, evoking a sense of calm focus and artistic precision.","The video showcases a person meticulously drawing a realistic fox with colored pencils. The process starts with the eye, then the nose, and finally, the intricate details of the fur. The video ends with a close-up of the finished fox drawing, highlighting the artist's skill and the beauty of the artwork."
2,,,,,"Upbeat and playful pop music with a quirky, comedic twist, featuring a bright synth melody, a catchy drumbeat, and humorous sound effects.","A comedic video featuring a man who is procrastinating on his tasks, joking about 'fighting capitalism' by doing things like deep cleaning his house instead of studying. The tone is lighthearted and humorous, with a focus on relatable procrastination struggles. The overall vibe is playful and encouraging, inviting viewers to join in the procrastination."
3,,,,,"Upbeat and energetic folk-pop instrumental, with acoustic guitar, banjo, and mandolin, evoking a carefree and cheerful summer evening cookout.","The video captures a casual, joyful cooking experience on an outdoor griddle. The scene is vibrant and energetic, with the sizzling of the griddle and the aromas of cooking food creating a sensory feast. The cooking process involves a variety of ingredients and techniques, creating a lively and engaging atmosphere."
4,,,,,"Upbeat and energetic indie pop with a playful and whimsical feel, featuring acoustic guitar, light percussion, and soaring vocals that capture the joy and creativity of these unique experiences.","A montage video showcasing various unique experiences in Boston and Massachusetts, like chocolate bar making, resin art creation, ceramic bowl making, dosa making, floral candleholder making, and floral cake decorating. The video features short clips of people engaging in these activities, highlighting the fun and creativity involved, and aiming to inspire viewers to try these experiences themselves."
5,,,,,"An upbeat and energetic synthwave track with a retro 8-bit feel, featuring driving basslines and arpeggiated synths, capturing the vibrant energy and nostalgia of a classic arcade.","A person enters Roxy's Arcade, a speakeasy in Cambridge. The entrance is marked by a 'Strictly 21+' sign. The scene is bustling with people playing classic arcade games like Mario Kart, pinball, and Mortal Kombat, suggesting a lively and energetic atmosphere."
