In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Visual Question Answering (VQA) with Imagen on Vertex AI

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/vision/getting-started/visual_question_answering.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/vision/getting-started/visual_question_answering.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/vision/getting-started/visual_question_answering.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>


## Overview

[Imagen on Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/image/overview) (image Generative AI) offers a variety of features:
- Image generation
- Image editing
- Visual captioning
- Visual question answering

This notebook focuses on **visual question answering** only.

[Visual question answering (VQA) with Imagen](https://cloud.google.com/vertex-ai/docs/generative-ai/image/visual-question-answering) can understand the content of an image and answer questions about it. The model takes an image and a question as input, and then uses the image as context to produce one or more answers to the question.

Visual question answering (VQA) can be used for a variety of use cases, including:
- assisting people who are visually impaired in gaining more information about images
- answering customer questions about products or services in the image
- creating interactive learning environments and providing interactive learning experiences

### Objectives

In this notebook, you will learn how to use the Vertex AI Python SDK to:

- Answer questions about images using Imagen's visual question answering feature

- Experiment with different parameters, such as:
    - number of answers to be provided by the model

### Costs

This tutorial uses billable components of Google Cloud:
- Vertex AI (Imagen)

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.

## Getting Started

### Install Vertex AI SDK, other packages and their dependencies

In [None]:
%pip install --upgrade --user google-cloud-aiplatform>=1.29.0

[0m

### Restart current runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel.

In [None]:
# Restart kernel after installs so that your environment can access the new packages
import IPython
import time  # NOTE(review): not used anywhere in this cell

# Grab the running IPython application and ask its kernel to shut down.
# The True argument is presumably the "restart" flag — the recorded cell output
# reports 'restart': True — so the kernel comes back up instead of staying down.
app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>



### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, you will need to authenticate your environment. To do this, run the new cell below. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [None]:
import sys

# The login step runs only when the google.colab module is loaded;
# in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    # Authenticate user to Google Cloud
    auth.authenticate_user()

### Define Google Cloud project information (Colab only)

If you are running this notebook on Google Colab, you need to define Google Cloud project information to be used. In the following cell, you will define the information, import Vertex AI package, and initialize it. This step is also not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [None]:
# Runs only when the google.colab module is loaded. Relies on `sys` having been
# imported by the authentication cell above.
if "google.colab" in sys.modules:
    import vertexai

    # Project settings, exposed as Colab form fields via the @param markers
    PROJECT_ID = ""  # @param {type:"string"}
    LOCATION = ""  # @param {type:"string"}

    # Point the Vertex AI SDK at the chosen project and region
    vertexai.init(project=PROJECT_ID, location=LOCATION)

# Start Here

In [None]:
!pip install --upgrade google-cloud-aiplatform

In [None]:
# Mount Google Drive at /content/drive; the dataset, method-output and result
# folders used by the evaluation cell all live under /content/drive/MyDrive.
# NOTE(review): google.colab is imported unconditionally here, unlike the guarded
# Colab-only cells earlier in the notebook — presumably this cell only runs on Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import base64
import os
import vertexai
from vertexai.preview.generative_models import GenerativeModel, Part

def encode_image(image_path):
    """Return the contents of a file as a base64 string.

    Args:
        image_path: Path of the file to read; it is opened in binary mode.

    Returns:
        The file's bytes, base64-encoded and decoded to a ``str`` as UTF-8.
    """
    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return encoded.decode("utf-8")


def generate(prompt, original_img, ours_img, dragdif_img, sde_img):
    """Send a prompt and four images to the "gemini-pro-vision" model.

    The request content is the prompt followed by the images in the order
    original, ours, dragdif, sde.

    Args:
        prompt: Instruction placed first in the request.
        original_img: First image in the request.
        ours_img: Second image in the request.
        dragdif_img: Third image in the request.
        sde_img: Fourth image in the request.

    Returns:
        The ``text`` attribute of the object returned by ``generate_content``.
    """
    sampling_config = {
        "max_output_tokens": 2048,
        "temperature": 0.4,
        "top_p": 1,
        "top_k": 32,
    }
    request_parts = [prompt, original_img, ours_img, dragdif_img, sde_img]

    vision_model = GenerativeModel("gemini-pro-vision")
    reply = vision_model.generate_content(
        request_parts,
        generation_config=sampling_config,
    )
    return reply.text


def write_2_txt(evaluation: str, path: str):
    """Write ``evaluation`` to the text file at ``path``, creating parent folders if needed.

    Args:
        evaluation: Text to store. An existing file at ``path`` is overwritten.
        path: Destination file, as a string or ``os.PathLike``. It may be a bare
            file name, in which case the file is written to the current
            working directory.

    Raises:
        OSError: If the parent directory cannot be created or the file cannot
            be opened for writing.
    """
    # Extract directory from the path
    directory = os.path.dirname(path)

    # A bare file name has an empty dirname and os.makedirs("") raises, so only
    # create directories when there is a directory component. exist_ok=True
    # replaces the separate existence check and tolerates the directory
    # appearing between the check and the creation.
    if directory:
        os.makedirs(directory, exist_ok=True)

    # Write as UTF-8 explicitly so non-ASCII text does not depend on the
    # platform's default encoding.
    with open(path, 'w', encoding='utf-8') as file:
        file.write(evaluation)


In [None]:
from pathlib import Path
from vertexai.preview.generative_models import Part
from tqdm import tqdm
# --- Configuration ---------------------------------------------------------
# Input folders on the mounted Drive. dataset_dir holds one sub-folder per
# sample; gooddrag_dir and dragdiffusion_dir mirror those sub-folder names,
# while sde_dir holds one flat "<sample>.png" file per sample.
dataset_dir = Path('/content/drive/MyDrive/dataset')
gooddrag_dir = Path('/content/drive/MyDrive/gooddrag')
dragdiffusion_dir = Path('/content/drive/MyDrive/dragdiffusion')
sde_dir = Path('/content/drive/MyDrive/final_drag/sde')

# Root folder for the results; every run writes into its own result_<i> sub-folder.
result_root = Path('/content/drive/MyDrive/GScore')

# Number of independent evaluation passes over the whole dataset.
NUM_RUNS = 10

prompt = '''Conduct a detailed evaluation of three modified images, labeled 'A', 'B', and 'C', in comparison to an original image (Image 1). Image 1 serves as the baseline and will not be evaluated. Focus on assessing the quality of 'A' (Image 2), 'B' (Image 3), and 'C' (Image 4), particularly in terms of their naturalness and the presence or absence of artifacts. Examine how well each algorithm preserves the integrity of the original image while introducing modifications. Look for any signs of distortions, unnatural colors, pixelation, or other visual inconsistencies. Rate each image on a scale from 1 to 10, where 10 represents excellent quality with seamless modifications, and 1 indicates poor quality with significant and noticeable artifacts. Provide a comprehensive analysis for each rating, highlighting specific aspects of the image that influenced your evaluation. Answers must be in English.'''


def _load_image_part(image_path, mime_type):
    """Read an image file and wrap its raw bytes in a ``Part`` of the given MIME type."""
    # Part.from_data is given raw bytes, so the file is read directly instead of
    # being base64-encoded and immediately decoded again.
    return Part.from_data(data=Path(image_path).read_bytes(), mime_type=mime_type)


# List the samples once, up front: sorting gives a stable order, a concrete list
# lets tqdm show a total, and non-directory entries (stray files in the dataset
# folder) are skipped rather than crashing the run.
sample_dirs = sorted(entry for entry in dataset_dir.iterdir() if entry.is_dir())

for i in range(NUM_RUNS):
    result_dir = result_root / f'result_{i}'
    for item in tqdm(sample_dirs, desc=f'Evaluating (run {i + 1}/{NUM_RUNS})'):
        img_name = item.name

        # Image order must match the prompt: original, then 'A', 'B', 'C'.
        original_image = _load_image_part(dataset_dir / img_name / 'original.jpg', "image/jpeg")
        gooddrag_image = _load_image_part(gooddrag_dir / img_name / 'output_image.png', "image/png")
        dragdiffusion_image = _load_image_part(dragdiffusion_dir / img_name / 'output_image.png', "image/png")
        sde_image = _load_image_part(sde_dir / f'{img_name}.png', "image/png")

        evaluation_file_path = result_dir / f'{img_name}.txt'
        cur_evaluation = generate(prompt, original_image, gooddrag_image, dragdiffusion_image, sde_image)
        write_2_txt(cur_evaluation, evaluation_file_path)
