# Projeto Final

## Integrantes
* Cynara Costa
* Lucas Melo
* Raissa Heimman
* Thays
* Victor Miguel de Morais Costa
* Weslley Batista

## Instalação de Pacotes

In [1]:
!pip install gradio

[0m

In [2]:
!pip install -q -U google-generativeai

[0m

In [3]:
!python3 -m pip install -U git+https://github.com/facebookresearch/audiocraft#egg=audiocraft

Collecting audiocraft
  Cloning https://github.com/facebookresearch/audiocraft to /tmp/pip-install-d02s8rmc/audiocraft_da6b9c1cf3ac419e8b179712fe42a42a
  Running command git clone --filter=blob:none --quiet https://github.com/facebookresearch/audiocraft /tmp/pip-install-d02s8rmc/audiocraft_da6b9c1cf3ac419e8b179712fe42a42a
  Resolved https://github.com/facebookresearch/audiocraft to commit 69fea8b290ad1b4b40d28f92d1dfc0ab01dbab85
  Preparing metadata (setup.py) ... [?25l[?25hdone
[0m

In [4]:
!pip install --upgrade pip
!pip install --upgrade transformers scipy

[0m

## Importação de Pacotes

In [5]:
import torch
import numpy
import scipy
import pathlib
import textwrap
import PIL.Image
import gradio as gr
from google.colab import userdata
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown
from audiocraft.models import musicgen
from audiocraft.utils.notebook import display_audio

In [6]:
import vertexai
from vertexai.preview.vision_models import Image, ImageGenerationModel

from google.colab import auth

# Read the Google Cloud project id from the Colab secrets store, so that it is
# never hard-coded in the notebook.
gcp_project_id = userdata.get("GCP_PROJECT_ID")

# Authenticate the current Colab user against that project. This must run
# before the Vertex AI SDK is initialised below.
auth.authenticate_user(project_id=gcp_project_id)

# Point the Vertex AI SDK at the same project; the image-generation model
# loaded later in the notebook relies on this initialisation.
vertexai.init(project=gcp_project_id)

## Implementação do App

In [7]:
# Read the Gemini API key from the Colab secrets store (never hard-code it).
GEMINI_API_KEY = userdata.get("GEMINI_API_KEY")

# Configure the google.generativeai client globally; the GenerativeModel
# instances created later in the notebook use this configuration.
genai.configure(api_key=GEMINI_API_KEY)

In [8]:
class MusicGenWrapper:
    """Wrapper class for interacting with the MusicGen pre-trained model."""

    def __init__(self, model_name='small', duration=30, device=None):
        """
        Loads a pre-trained MusicGen checkpoint and configures the length of the generated clips.

        Args:
        model_name: The checkpoint identifier forwarded to `MusicGen.get_pretrained`.
        duration: The length, in seconds, of every generated track.
        device: The torch device string on which the model is loaded. When None, 'cuda' is used
            if a GPU is available and 'cpu' otherwise, so the wrapper no longer crashes on
            CPU-only runtimes.
        """

        if device is None:
            device = 'cuda' if torch.cuda.is_available() else 'cpu'

        self.modelMusic = musicgen.MusicGen.get_pretrained(model_name, device=device)
        self.modelMusic.set_generation_params(duration=duration)

    def text_to_song(self, track_description, track_file_path="single.wav"):
        """
        Generates a music track from a short description and stores it as a '.wav' file.

        Args:
        track_description: The simple description of the audio track to be generated as a string.
        track_file_path: Where the '.wav' file is written. Defaults to 'single.wav' in the
            current working directory; an existing file at that path is overwritten.

        Returns:
        The path of the generated audio file (the value of `track_file_path`).
        """

        # Import the submodule explicitly: a bare `import scipy` does not guarantee that
        # `scipy.io.wavfile` is loaded on every SciPy version.
        from scipy.io import wavfile

        response = self.modelMusic.generate([track_description], progress=True)
        track_sampling_rate = self.modelMusic.sample_rate

        # A single description was submitted, so index [0, 0] picks the first batch entry and
        # its first channel (presumably the layout is (batch, channels, samples) - TODO confirm).
        samples = response[0, 0].cpu().numpy()
        wavfile.write(track_file_path, rate=track_sampling_rate, data=samples)

        return track_file_path

In [9]:
class GeminiWrapper:
    """Wrapper class for interacting with the Google Gemini API and the Vertex AI image generation model."""

    def __init__(self):
        """
        Creates the three model handles used by the wrapper.

        No credentials are received here: the `genai` client and the Vertex AI SDK must already
        have been configured before the wrapper is instantiated.
        """

        self.image_to_text_model = genai.GenerativeModel("gemini-pro-vision")
        self.text_to_text_model = genai.GenerativeModel("gemini-pro")
        self.text_to_image_model = ImageGenerationModel.from_pretrained("imagegeneration@005")

    def image_to_text(self, image):
        """
        Gets a detailed description of a given image using the Gemini API.

        Args:
        image: The image to be described, as an array accepted by `PIL.Image.fromarray`
            (e.g. the numpy array delivered by a Gradio image input).

        Returns:
        The detailed description of the uploaded image as a string.

        Raises:
        Any exception raised by the underlying client call (or by reading `response.text`)
        propagates unchanged to the caller.
        """

        image = PIL.Image.fromarray(image)
        description_prompt = """
          Please, provide a very detailed description with at least 200 words, in English,
          about the image that is being sent."""
        response = self.image_to_text_model.generate_content([description_prompt, image])
        return response.text

    def text_to_sentiment(self, image_description):
        """
        Gets the general vibe or sentiment of a given image description as text by using the Gemini API.

        Args:
        image_description: An image description as a string.

        Returns:
        The brief description of the feelings/sentiments/vibes that one person could get by reading the provided image description as a string.

        Raises:
        Any exception raised by the underlying client call (or by reading `response.text`)
        propagates unchanged to the caller.
        """

        vibe_sentiment_prompt = """
        In this request, I am providing an image description to you after the character ':'.
        As a response, I want you to provide to me a possible brief description of the vibes, sentiments and feelings (in English!)
        that one person could get by reading such image description. However, I don't want you to put descriptive elements of the image in your response.
        I want you to focus only on the feelings, sentiments and vibes, as I've already mentioned. Also make sure to not use a bulletpoints list or something like that.
        Instead, describe such a thing with a single sentence"""
        complete_prompt = f"{vibe_sentiment_prompt}: {image_description}"
        response = self.text_to_text_model.generate_content(complete_prompt)
        return response.text

    def text_to_text(self, image_vibe_sentiment, single_description):
        """
        Generates the name of a fictional song from the vibe of an image and the description of a song.

        Two requests are made: the first one condenses `single_description` into a one-sentence
        vibe, and the second one asks for a song name based on both vibes.

        Args:
        image_vibe_sentiment: The vibe/sentiment inferred from the input image as a string.
        single_description: The musical description of the song as a string.

        Returns:
        The text of the model's answer to the song-name request as a string.

        Raises:
        Any exception raised by the underlying client calls (or by reading `response.text`)
        propagates unchanged to the caller.
        """

        single_vibe_prompt = """
        In this request, I am providing you a detailed description of a song. Such description might contain a lot of different elements
        (such as which musical instruments are being used in the song, how they are being played, the intensity and rythm of each instruments, influences from other artists, bands and/or musical genres).
        The description will start after the next ':' character. As a response, I want you to provide to me a possible brief description of the vibes, sentiments and feeling (in English!!) that one person
        could get by reading such description. I want you to focus ONLY on the feelings, sentiments and vibes, as I've already mentioned. Moreover, make sure to not use a bulletpoints list or something like that in your response.
        Instead, give your response with a single sentence"""
        # Keep the generated TEXT, not the response object: interpolating the object itself
        # would inject its repr into the next prompt instead of the song vibe.
        single_vibe_sentiment = self.text_to_text_model.generate_content(f"{single_vibe_prompt}: {single_description}").text

        single_name_prompt = """
        In this request, I am providing you two elements as an ordered list after the character ':'.
        The first element is the description of the vibes, feelings and sentiments that one person get by looking at a certain image.
        The second element is the description of the vibes, feelings and sentiments that one person get reading the description of a certain song.
        As a response, based on these two descriptions, I want you to generate the name of a fictional song that captures the ideas conveyed by these two descriptions.
        Here goes the ordered list of the mentioned elements"""
        final_prompt = f"""
        {single_name_prompt}:
        1) {image_vibe_sentiment}
        2) {single_vibe_sentiment}
        """
        response = self.text_to_text_model.generate_content(final_prompt)

        return response.text


    def sentiment_to_image_vertex(self, artist_band_name, single_name, single_vibe_sentiment, cover_file_path="single_cover.png"):
        """
        Generates a cover image that conveys a given vibe and stores it on disk.

        Args:
        artist_band_name: The name of the artist/band. Currently not used in the prompt.
        single_name: The name of the song. Currently not used in the prompt.
        single_vibe_sentiment: The brief description of the vibe the image must convey as a string.
        cover_file_path: Where the generated image is written. Defaults to 'single_cover.png'
            in the current working directory.

        Returns:
        The path of the generated image file (the value of `cover_file_path`).

        Raises:
        Any exception raised by the image generation call or while saving the image
        propagates unchanged to the caller.
        """

        # The prompt announces exactly one list element, matching what is actually sent.
        sentiment_to_image_prompt = f"""
        In this request, I am providing one element in the form of an ordered list (a brief description of the feelings, sentiments and vibes that one person could get by listening to a certain song).
        As a response, based on this last brief description, I want you to generate an image with a scenario that conveys the main ideas of the previously mentioned brief description.
        The elements inside the image MUST match the sentiments, feelings and vibes provided in such description.
        Moreover, the scenario must be consistent with the description.
        Ok. Here goes the ordered list with this single element:
        1) Brief description of the feelings, sentiments and vibes of the song: {single_vibe_sentiment}
        """

        images = self.text_to_image_model.generate_images(prompt=sentiment_to_image_prompt)
        # Only the first generated image is kept.
        images[0].save(location=cover_file_path, include_generation_parameters=True)

        return cover_file_path

In [10]:
"""
If your function accepts more than one argument, as is the case above, pass a list of input components to inputs,
with each input component corresponding to one of the arguments of the function, in order.
The same holds true if your function returns more than one value: simply pass in a list of components to outputs.
"""

# Module-level singletons: created once so that every request handled by
# `get_num_tracks` reuses the same model handles instead of rebuilding them.
gemini = GeminiWrapper()
# Constructing the MusicGen wrapper loads the pre-trained music model.
music_gen = MusicGenWrapper()

def get_num_tracks(artist_band_name, single_vibe_image, single_description):
    """
    Runs the whole generation pipeline for one request of the Gradio interface.

    Args:
    artist_band_name: The name of the artist/band chosen by the user, returned unchanged.
    single_vibe_image: The image that represents the vibe of the Single, as delivered by the image input.
    single_description: The musical description of the Single as a string.

    Returns:
    A list with, in order: the artist/band name, the generated Single name, the detailed
    description of the input image, the vibe inferred from that description, the path of the
    generated cover image and the path of the generated audio file.

    Raises:
    gr.Error: If no image was provided.
    """

    # Without an image the first pipeline step would crash with an opaque error, so fail early
    # with a message that Gradio shows to the user.
    if single_vibe_image is None:
        raise gr.Error("Forneça uma imagem que represente a 'vibe' do Single.")

    # Detailed description of the input image, used to infer the vibe of the Single.
    image_description = gemini.image_to_text(single_vibe_image)
    # Short description of the vibe conveyed by the input image.
    image_vibe_sentiment = gemini.text_to_sentiment(image_description)

    # Name of the Single, derived from the image vibe and the musical description.
    single_name = gemini.text_to_text(image_vibe_sentiment, single_description)
    # Cover of the Single; the wrapper returns the path of the saved image.
    single_cover = gemini.sentiment_to_image_vertex(artist_band_name, single_name, image_vibe_sentiment)

    # Audio of the Single; the wrapper returns the path of the saved '.wav' file.
    track_file_path = music_gen.text_to_song(single_description)

    # Return the paths reported by the wrappers instead of repeating their file names here.
    return [artist_band_name, single_name, image_description, image_vibe_sentiment, single_cover, track_file_path]

# Build the Gradio UI: `gr.Interface` wraps `get_num_tracks`, mapping the three input
# components to its arguments (in order) and its six returned values to the output
# components (in order).
apollo_demo = gr.Interface(
    description="""
    # Apollo

    ### O que é?
    * Apollo é uma ferramenta simples para geração de Singles (ou melhor dizendo, 'sneak peeks' de 30 segundos de um Single) que segue as instruções fornecidas pelo usuário.
    * Basta fornecer algumas entradas específicas, explicadas abaixo, e, ao final do processo, você obterá o Single como resultado e poderá baixá-lo.

    ### Entradas
    * __Nome do artista/banda responsável pelo Single:__ Aqui, o usuário pode escolher livremente o nome do artista/banda que irá 'compor' o Single em questão.
    * __Imagem que representa a vibe do Single:__ Uma imagem que será usada para extrair o "sentimento geral" que o Single deseja transmitir para o público.
    * __Descrição musical do Single:__ Uma descrição textual individual a respeito do Single a ser gerado. Aqui, o usuário tem total liberdade para descrever como quer que o single seja. O usuário pode descrever o uso de instrumentos musicais, a maneira como os intrumentos devem ser tocados, a intensidade e ritmo de cada um. Além disso, pode citar influências de gêneros musicais, artistas, bandas. O limite é a sua imaginação! E, lembre-se, quanto mais detalhado, melhor!

    ### Saídas
    * __Nome do Artista:__ O nome do artista/banda que produziu o Single gerado, escolhido cuidadosamente pelo deus Apollo pessoalmente.
    * __Nome do Single:__ O nome do Single gerado, escolhido cuidadosamente pelo deus Apollo pessoalmente.
    * __Capa do Single:__ A capa do Single gerado, ilustrada exclusivamente pelo próprio Apollo.
    * __Single:__ O arquivo do Single gerado no formato '.wav' disponível para download.

    """,
    fn=get_num_tracks, # The pipeline function exposed through the UI; called once per submission.
    inputs=[
        gr.Textbox(lines=1, max_lines=1, label="Nome do artista/banda responsável pelo Single"),
        gr.Image(label="Imagem que representa a 'vibe' do Single"),
        gr.Textbox(lines=3, max_lines=10, label="Descrição musical do Single"),
    ],
    outputs=[
        gr.Textbox(lines=1, max_lines=1, label="Nome do artista/banda"),
        gr.Textbox(lines=1, max_lines=1, label="Nome do Single gerado"),
        gr.Textbox(lines=3, max_lines=10, label="Descrição da imagem de entrada"),
        gr.Textbox(lines=3, max_lines=10, label="Sentimentos/Emoções/Vibes inferidas da descrição da imagem de entrada"),
        gr.Image(label="Capa do Single"),
        gr.Audio(label="Single")
    ],
    allow_flagging="never",
    clear_btn=gr.Button(visible=False),
    submit_btn=gr.Button(value="Gerar"),
)

# debug=True keeps this cell running so errors and logs stay visible in the notebook;
# share=True requests a temporary public URL for the app.
apollo_demo.launch(debug=True, share=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


state_dict.bin:   0%|          | 0.00/841M [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

compression_state_dict.bin:   0%|          | 0.00/236M [00:00<?, ?B/s]



Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://52d74a64a266611afa.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/gradio/queueing.py", line 495, in call_prediction
    output = await route_utils.call_process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/route_utils.py", line 235, in call_process_api
    output = await app.get_blocks().process_api(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1627, in process_api
    result = await self.call_function(
  File "/usr/local/lib/python3.10/dist-packages/gradio/blocks.py", line 1173, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/usr/local/lib/python3.10/dist-packages/anyio/to_thread.py", line 33, in run_sync
    return await get_asynclib().run_sync_in_worker_thread(
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 877, in run_sync_in_worker_thread
    return await future
  File "/usr/local/lib/python3.10/dist-packages/anyio/_backends/_asyncio.py", line 807, in run
    re

Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://52d74a64a266611afa.gradio.live


