# Set Up

In [None]:
# Pick the compute device: CUDA when torch reports it as available, otherwise CPU.
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# PIL opens the input image before it is handed to the preprocessor.
from PIL import Image

# Load the LAVIS "blip_caption" model ("large_coco" variant) in eval mode on the
# chosen device, together with its visual preprocessors. The third returned
# value is discarded.
from lavis.models import load_model_and_preprocess
model, vis_processors, _ = load_model_and_preprocess(
    name="blip_caption", model_type="large_coco", is_eval=True, device=device
)

# OpenAI client used by get_sentence() and get_genres().
# NOTE(review): the key below is a placeholder and must be replaced before
# running; avoid committing a real key to version control.
import openai
openai.api_key = 'use-your-own-api-key'

# Replicate client used by get_musics(). The token is placed in the process
# environment (presumably read from there by the replicate package; confirm).
# NOTE(review): placeholder value, replace before running.
import os
import replicate
os.environ["REPLICATE_API_TOKEN"] = 'use-your-own-api-key'

# `get` downloads the generated audio; `string` and `random` build the random
# file names used when saving it.
from requests import get
import string
import random

# Get Several Captions for an Image

In [None]:
def get_captions(location, num_captions):
    """Display the image at ``location`` and sample captions for it.

    The image is opened with PIL and converted to RGB. A copy resized to
    596x437 is shown with ``display``. The full image goes through the "eval"
    visual preprocessor, gets a leading dimension via ``unsqueeze(0)``, and is
    moved to ``device`` before the model generates captions with nucleus
    sampling.

    Args:
        location: Path (or file object) accepted by ``Image.open``.
        num_captions: Number of captions requested from ``model.generate``.

    Returns:
        The object returned by ``model.generate``; each element is printed
        before returning.
    """
    picture = Image.open(location).convert("RGB")

    # Only the preview is resized; the model receives the original picture.
    preview = picture.resize((596, 437))
    display(preview)

    batch = vis_processors["eval"](picture)
    batch = batch.unsqueeze(0).to(device)

    generated = model.generate(
        {"image": batch},
        use_nucleus_sampling=True,
        num_captions=num_captions,
    )

    for text in generated:
        print(text)
    return generated

# Get a Compressed Sentence

In [None]:
def get_sentence(captions):
    """Ask gpt-3.5-turbo to condense several captions into one sentence.

    The conversation has two turns. The first sends the task description and
    records the assistant's reply. The second sends the captions joined with
    single spaces, and its reply is post-processed into the result.

    Args:
        captions: Iterable of caption strings.

    Returns:
        The compressed sentence as a string. If the reply contains a colon,
        only the text after the first colon is kept, unless that text is
        blank, in which case the whole reply is used. Newlines become spaces,
        every single- and double-quote character is removed (apostrophes
        inside words included), and surrounding whitespace is stripped.
    """
    initial_msg = "I'm doing Image captioning with deep learning model. If I feed the model a image, it gives me several sentences. But I can't fully trust the model. So if i give you the sentences, read them and give me a compressed sentence. Not just adding. I don't need any description. I just want a 'compressed' sentence. I'll try this many time."

    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': initial_msg},
    ]

    # Turn 1: explain the task and keep the assistant's reply in the history so
    # the second request carries the full conversation.
    briefing = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=messages
    )
    messages.append({
        'role': 'assistant',
        'content': briefing['choices'][0]['message']['content']
    })

    # Turn 2: send the captions themselves.
    messages.append({
        'role': 'user',
        'content': ' '.join(captions)
    })
    reply = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=messages
    )
    answer = reply['choices'][0]['message']['content']

    # The reply may start with a label such as "Compressed sentence:". Drop the
    # label by keeping everything after the FIRST colon, so a later colon in
    # the sentence (e.g. "5:30") does not truncate it. With no colon, or with
    # nothing after it, fall back to the whole reply.
    _, _, remainder = answer.partition(':')
    body = remainder if remainder.strip() else answer
    sentence = body.replace('\n', ' ').replace("'", "").replace('"', '').strip()

    print("Compressed Sentence: " + sentence)

    return sentence

# Get Genres

In [None]:
def get_genres(sentence):
    """Ask gpt-3.5-turbo for music genres that fit ``sentence``.

    Args:
        sentence: Sentence describing the image (a string).

    Returns:
        A list of prompt strings of the form ``"<sentence>, Genre: <genre>"``,
        one per genre parsed from the reply. Each prompt is printed. The list
        is empty when the reply contains no genre names.
    """
    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': '[' + sentence + ']' + 'Recommend me three music genres that go well with this sentence. Seperate genres in comma. Just print genres. No discription. No number.'}
    ]

    reply = openai.ChatCompletion.create(
        model='gpt-3.5-turbo',
        messages=messages
    )
    answer = reply['choices'][0]['message']['content']

    # Drop a leading label such as "Genres:" by keeping the text after the
    # first colon. With no colon, or with nothing after it, use the whole reply.
    _, _, remainder = answer.partition(':')
    body = remainder if remainder.strip() else answer
    cleaned = body.replace('\n', ' ').replace("'", "").replace('"', '').strip()

    # Split on the bare comma and trim each piece so "a,b", "a, b" and "a , b"
    # all parse the same way. Empty pieces (e.g. a trailing comma) are dropped.
    genres = [name.strip() for name in cleaned.split(',')]
    genres = [name for name in genres if name]

    sentence_with_genre = [sentence + ", Genre: " + name for name in genres]
    for prompt in sentence_with_genre:
        print(prompt)

    return sentence_with_genre

# Combine Tasks

In [None]:
def img2txt(location="./docs/_static/merlion.png", num_captions=3):
    """Turn an image into a list of "sentence, Genre: ..." prompts.

    Chains the three steps: caption the image, compress the captions into one
    sentence, then attach recommended genres to that sentence.

    Args:
        location: Image path forwarded to ``get_captions``.
        num_captions: Number of captions forwarded to ``get_captions``.

    Returns:
        Whatever ``get_genres`` returns for the compressed sentence.
    """
    return get_genres(get_sentence(get_captions(location, num_captions)))

# Generate Random String for File Name

In [None]:
def generate_random_string(length=16):
    """Return a random string of ASCII letters and digits.

    Characters are drawn one at a time with ``random.choice``, so the result
    is reproducible after ``random.seed``. It is not suitable for secrets.

    Args:
        length: Number of characters to generate. Defaults to 16. Zero or a
            negative value yields an empty string.

    Returns:
        A string of ``length`` characters taken from a-z, A-Z and 0-9.
    """
    # ascii_letters is ascii_lowercase followed by ascii_uppercase.
    characters = string.ascii_letters + string.digits
    return ''.join(random.choice(characters) for _ in range(length))

# Save Music

In [None]:
def download(url):
    """Download ``url`` and save it as a randomly named ``.wav`` in ``./audio``.

    Args:
        url: Address of the file to fetch.

    Returns:
        The path of the saved file.

    Raises:
        requests.exceptions.RequestException: If the request fails, times out,
            or the server answers with an HTTP error status.
    """
    # Fetch before touching the filesystem so a failed request leaves no empty
    # file behind, and an HTTP error page is never saved as audio.
    response = get(url, timeout=60)
    response.raise_for_status()

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('./audio', exist_ok=True)
    path = './audio/' + generate_random_string() + '.wav'
    with open(path, "wb") as file:
        file.write(response.content)

    return path

# Get Music

In [None]:
def get_musics(location="./docs/_static/tent.jpg", num_captions=3):
    """Generate and save one audio clip per genre prompt derived from an image.

    Args:
        location: Path of the image to describe.
        num_captions: Number of captions to sample for the image.

    Returns:
        A list with the ``'audio'`` entry of each ``replicate.run`` result, in
        the same order as the prompts. Each entry is printed with a 1-based
        index and passed to ``download``.
    """
    # Forward the caller's num_captions instead of a hard-coded 3.
    sentence_with_genre = img2txt(location, num_captions=num_captions)

    # NOTE(review): "riffusion api key" looks like a placeholder for the
    # Replicate model identifier; replace it before running.
    output = [
        replicate.run(
            "riffusion api key",
            input={"prompt_a": prompt_a}
        )['audio']
        for prompt_a in sentence_with_genre
    ]

    for index, audio_url in enumerate(output, start=1):
        print(str(index) + "." + audio_url)
        download(audio_url)

    return output

# Result

In [None]:
# Run the full pipeline on one image: caption it, compress the captions into a
# sentence, attach genres, then generate and download the audio.
location = "./docs/_static/merlion.png"  # input image path
num_captions = 3  # value passed as get_musics' num_captions argument
output = get_musics(location, num_captions)  # list returned by get_musics