In [1]:
import warnings
warnings.simplefilter('ignore')
import os
import time
import json
import shutil
import traceback
import pandas as pd
from tqdm.notebook import tqdm
import google.generativeai as genai
from audiocraft.models import musicgen
from scipy.io.wavfile import write as wav_write
from IPython.display import display, HTML, Video, Audio

2024-07-04 15:26:59.060130: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Gemini Prompt

In [2]:
music_prompt_examples = '''
'A dynamic blend of hip-hop and orchestral elements, with sweeping strings and brass, evoking the vibrant energy of the city',
'Smooth jazz, with a saxophone solo, piano chords, and snare full drums',
'90s rock song with electric guitar and heavy drums'.
'''

json_schema = '''
{"Content Description": "string", "Music Prompt": "string"}
'''

gemni_instructions = f'''
You are a music supervisor who analyzes the content and tone of images and videos to describe music that fits well with the mood, evokes emotions, and enhances the narrative of the visuals. Given an image or video, describe the scene and generate a prompt suitable for music generation models. Use keywords related to genre, instruments, mood, context, and setting to craft a concise single-sentence prompt, like:

{music_prompt_examples}

You must return your response using this JSON schema: {json_schema}
'''

print(gemni_instructions)


GOOGLE_API_KEY = ''
genai.configure(api_key=GOOGLE_API_KEY)


You are a music supervisor who analyzes the content and tone of images and videos to describe music that fits well with the mood, evokes emotions, and enhances the narrative of the visuals. Given an image or video, describe the scene and generate a prompt suitable for music generation models. Use keywords related to genre, instruments, mood, context, and setting to craft a concise single-sentence prompt, like:


'A dynamic blend of hip-hop and orchestral elements, with sweeping strings and brass, evoking the vibrant energy of the city',
'Smooth jazz, with a saxophone solo, piano chords, and snare full drums',
'90s rock song with electric guitar and heavy drums'.


You must return your response using this JSON schema: 
{"Content Description": "string", "Music Prompt": "string"}




In [3]:
mllm_model = genai.GenerativeModel('gemini-1.5-flash-latest')
musicgen_model = musicgen.MusicGen.get_pretrained('facebook/musicgen-stereo-small',device='cuda')
musicgen_model.set_generation_params(duration=30)

def process_video_in_gemini(video_file_name):
    video_file = genai.upload_file(video_file_name)
    while video_file.state.name == "PROCESSING":
        time.sleep(3)
        video_file = genai.get_file(video_file.name)

    if video_file.state.name == "FAILED":
        raise ValueError(video_file.state.name)
    
    response = mllm_model.generate_content(
        [video_file, 'Explain what is happening in this video'],
        request_options={"timeout":600},
        safety_settings={
            genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH: genai.types.HarmBlockThreshold.BLOCK_NONE,
            genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
            genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: genai.types.HarmBlockThreshold.BLOCK_NONE,
            genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
            genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
            
        }
    )
    
    cleaned_response = mllm_model.generate_content([
        response.text,
        gemni_instructions, 
    ],
    safety_settings={
        genai.types.HarmCategory.HARM_CATEGORY_HATE_SPEECH: genai.types.HarmBlockThreshold.BLOCK_NONE,
        genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
        genai.types.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: genai.types.HarmBlockThreshold.BLOCK_NONE,
        genai.types.HarmCategory.HARM_CATEGORY_HARASSMENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
        genai.types.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: genai.types.HarmBlockThreshold.BLOCK_NONE,
    }
)

    return json.loads(cleaned_response.text.strip("```json\n"))

def generate_music(music_prompt:list, audio_file_name):
    if not isinstance(music_prompt, list):
        music_prompt = [music_prompt]
    result = musicgen_model.generate(music_prompt, progress=False)
    result = result.squeeze().cpu().numpy().T
    sample_rate = musicgen_model.sample_rate
    
    filenames = []

    if len(result.shape) > 2:
        result = result.transpose(2, 0, 1)
        for idx, audio in  enumerate(result):
            filename = audio_file_name.split(".")[0] + f"_{idx}." + audio_file_name.split(".")[1]
            wav_write(filename, sample_rate, audio)
            filenames.append(filename)
        
    else:
        wav_write(audio_file_name, sample_rate, result)
        filenames.append(audio_file_name)


    return filenames

In [4]:
video_root = "videos"
audio_root = "audio"
# Delete existing audio directory
if os.path.exists(audio_root):
    shutil.rmtree(audio_root)
os.makedirs(audio_root, exist_ok=True)

video_filenames = os.listdir(video_root)

video_paths = []
audio_paths = []
music_prompts = []
music_descriptions = []

for video_filename in tqdm(video_filenames, desc="Processing Videos"):
    basename = os.path.splitext(video_filename)[0]
    video_path = os.path.join(video_root, video_filename)
    audio_path = os.path.join(audio_root, f"{basename}.wav")
    try:
        generated_prompt = process_video_in_gemini(video_path)
        description, music_prompt = generated_prompt["Content Description"], generated_prompt["Music Prompt"]
    except Exception:
        print(f"Error processing video {video_path}: {traceback.format_exc()}")
        continue
    audio_path_list = generate_music([music_prompt] * 3, audio_path)

    video_paths.append(video_path)
    audio_paths.append(audio_path_list)
    music_prompts.append(music_prompt)
    music_descriptions.append(description)


Processing Videos:   0%|          | 0/10 [00:00<?, ?it/s]

Error processing video videos/9145C4C8F212F8383F822646B553D483_video_dashinit.mp4: Traceback (most recent call last):
  File "/tmp/ipykernel_53234/810802512.py", line 20, in <module>
    generated_prompt = process_video_in_gemini(video_path)
  File "/tmp/ipykernel_53234/1034809467.py", line 40, in process_video_in_gemini
    return json.loads(cleaned_response.text.strip("```json\n"))
  File "/home/ani/miniconda3/envs/vm/lib/python3.9/json/__init__.py", line 346, in loads
    return _default_decoder.decode(s)
  File "/home/ani/miniconda3/envs/vm/lib/python3.9/json/decoder.py", line 337, in decode
    obj, end = self.raw_decode(s, idx=_w(s, 0).end())
  File "/home/ani/miniconda3/envs/vm/lib/python3.9/json/decoder.py", line 355, in raw_decode
    raise JSONDecodeError("Expecting value", s, err.value) from None
json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)



In [5]:
# Save the variables in pickle file
data = {
    "video_paths": video_paths,
    "audio_paths": audio_paths,
    "music_prompts": music_prompts,
    "music_descriptions": music_descriptions
}

df = pd.DataFrame(data)
df.to_pickle("music_data.pkl")

In [6]:
# Load from pickle file
data = pd.read_pickle("music_data.pkl")
video_paths = data['video_paths']
audio_paths = data['audio_paths']
music_prompts = data['music_prompts']
music_descriptions = data['music_descriptions']


def create_html_element(url, tag):
    if tag == 'video':
        return f'<video controls width="700"><source src="{url}" type="video/mp4"></video>'
    elif tag == 'audio':
        return f'<audio controls><source src="{url}" type="audio/mpeg"></audio>'

data = {
    "Video": [create_html_element(url, 'video') for url in video_paths],
    "Audio 1": [create_html_element(audio[0], 'audio') for audio in audio_paths],
    "Audio 2": [create_html_element(audio[1], 'audio') for audio in audio_paths],
    "Audio 3": [create_html_element(audio[2], 'audio') for audio in audio_paths],
    "Prompt": music_prompts,
    "Description": music_descriptions
}

df = pd.DataFrame(data)

display(HTML(df.to_html(escape=False)))

Unnamed: 0,Video,Audio 1,Audio 2,Audio 3,Prompt,Description
0,,,,,"Upbeat and cheerful indie pop with catchy melodies, bright acoustic guitars, and a driving beat, reflecting the positive and energetic tone of the video.","The video is a lighthearted and informative guide to helpful travel apps. It features a bright and cheerful aesthetic with vibrant visuals of travel scenes, apps, and animated elements. The tone is positive and friendly, encouraging viewers to download the apps and enjoy their travels."
1,,,,,"A calming and focused instrumental track with gentle piano chords, soft cello, and subtle percussion, reflecting the meticulous process of the artist's drawing.","The video shows a hand drawing a fox with colored pencils. The artist uses different shades of brown, grey, and black to create a realistic portrait of the fox. The video also shows the artist working on different parts of the fox's face, including its eyes, nose, and fur. The drawing is a very detailed and intricate piece of art, and the artist takes great care to get every detail perfect."
2,,,,,"Upbeat and ironic indie pop with a driving bassline and playful synth melodies, creating a lighthearted and humorous atmosphere.","A man is talking to the camera, trying to convince viewers that procrastination is not a bad thing, but a skill and a form of rebellion against capitalism. He's trying to be humorous and make people laugh."
3,,,,,"Upbeat and playful acoustic guitar with light percussion, with a touch of whimsy and a hint of mischievousness, evoking the feeling of lighthearted fun and friendly pranking.","A compilation video of responses to a Reddit post asking for harmless ways to screw with people. The video features a lighthearted tone, showcasing people's creativity in coming up with playful pranks and jokes."
4,,,,,"A fast-paced, quirky, and satirical synth-pop track with a playful and slightly menacing undertone, using a blend of synth melodies, electronic drums, and quirky sound effects to reflect the chaotic energy of the video.","A humorous video showcasing the most unpleasant places in Boston. The tone is sarcastic and mocking, with a focus on the absurdity of the situations. The video uses a montage format, quickly transitioning between various locations and actions, all designed to make the viewer cringe."
5,,,,,"Upbeat and energetic pop music with a playful and catchy melody, featuring bright synths, driving drums, and a rhythmic bassline, capturing the excitement and joy of preparing a delicious meal.","A person is preparing a delicious-looking meal on a flat top grill, incorporating rice, vegetables, chicken, steak, and shrimp. The scene is dynamic and lively with the use of a spatula to stir the ingredients and the application of various sauces and seasonings. The focus is on the creation and presentation of a flavorful and enticing dish."
6,,,,,"Upbeat indie pop with a lighthearted and whimsical feel, featuring acoustic guitar, light percussion, and a cheerful vocal melody, capturing the spirit of discovery and creative exploration.","A montage showcasing diverse creative workshops and activities in the Boston area, highlighting the city's artistic and cultural vibrancy. The video starts with a relaxed and focused atmosphere during chocolate bar making, shifts to a more playful and artistic energy in the resin art scene, and maintains a creative yet relaxed vibe throughout. It ends with a call to action, encouraging viewers to find similar experiences in the area."
7,,,,,"An upbeat and energetic blend of synthwave and 80s rock, with driving basslines, catchy synth melodies, and powerful drum beats, evoking the nostalgic atmosphere of a classic arcade.","The video shows a person entering Roxy's Arcade, a speakeasy in Cambridge with a variety of arcade games, pinball machines, and classic video games. The video features a person playing a racing game and pans to reveal the different games available. The overall atmosphere is vibrant, energetic, and nostalgic."
8,,,,,"A suspenseful and dramatic orchestral score with soaring strings, pounding percussion, and a deep, rumbling bassline, evoking the power and danger of the shark.","A group of people on a boat are fishing and have caught a massive shark that circles the boat, coming very close to the people on board. They are visibly shocked and excited by the size of the shark."
