##### Copyright 2024 Google LLC.

In [None]:
#@title Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get started with the Gemini API: Python

<table class="tfo-notebook-buttons" align="left">
  <td>
    <a target="_blank" href="https://ai.google.dev/gemini-api/docs/get-started/python"><img src="https://ai.google.dev/static/site-assets/images/docs/notebook-site-button.png" height="32" width="32" />View on Google AI</a>
  </td>
  <td>
    <a target="_blank" href="https://colab.research.google.com/github/google/generative-ai-docs/blob/main/site/en/gemini-api/docs/get-started/python.ipynb"><img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
  </td>
  <td>
    <a target="_blank" href="https://github.com/google/generative-ai-docs/blob/main/site/en/gemini-api/docs/get-started/python.ipynb"><img src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a>
  </td>
</table>

This quickstart demonstrates how to use the Python SDK for the Gemini API, which gives you access to Google's Gemini large language models. In this quickstart, you will learn how to:

1. Set up your development environment and API access to use Gemini.
2. Generate text responses from text inputs.
3. Generate text responses from multimodal inputs (text and images).
4. Use Gemini for multi-turn conversations (chat).
5. Use embeddings for large language models.

## Prerequisites

You can run this quickstart in [Google Colab](https://colab.research.google.com/github/google/generative-ai-docs/blob/main/site/en/gemini-api/docs/get-started/python.ipynb), which runs this notebook directly in the browser and does not require additional environment configuration.

Alternatively, to complete this quickstart locally, ensure that your development environment meets the following requirements:

-  Python 3.9+
-  An installation of `jupyter` to run the notebook.

## Setup

### Install the Python SDK

The Python SDK for the Gemini API is contained in the [`google-generativeai`](https://pypi.org/project/google-generativeai/) package. Install the dependency using pip:

In [None]:
# Mount Google Drive at /content/drive (requires the Colab-only `google.colab` package).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
 # Change into `drive` (relative to the current working directory), then list it.
 %cd drive
! ls

/content/drive
MyDrive


In [None]:
 # Change into the project folder; later relative paths resolve from here.
 %cd MyDrive/FinRAG
! ls

/content/drive/MyDrive/FinRAG
 cache				  FinRAD_preprocess_gemini_colab.ipynb	 output
'Copy of FinRAG Poster.gslides'   generator				 retriever_external
 dataset			  generator_train_test.ipynb		 retriever_internal


In [None]:
#!pip install -q -U google-generativeai

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m150.7/150.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m679.1/679.1 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[?25h

### Import packages

Import the necessary packages.

In [None]:
import pathlib
import textwrap

import google.generativeai as genai

from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
  """Render `text` as a Markdown blockquote.

  Each bullet character ('•') is rewritten as an indented '*' list marker,
  and every line, blank ones included, is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  # keepends=True preserves the original line terminators while prefixing.
  quoted = ''.join('> ' + line for line in bulleted.splitlines(True))
  return Markdown(quoted)

In [None]:
# Used to securely store your API key
from google.colab import userdata

In [None]:
import pandas as pd
import time
import random
import json
import numpy as np
import re

### Setup your API key

Before you can use the Gemini API, you must first obtain an API key. If you don't already have one, create a key with one click in Google AI Studio.

<a class="button button-primary" href="https://makersuite.google.com/app/apikey" target="_blank" rel="noopener noreferrer">Get an API key</a>

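In Colab, add the key to the secrets manager under the "🔑" in the left panel. Give it the name `GOOGLE_API_KEY`. (The configuration cell below reads a secret named `gemini-paid`, so either store your key under that name or change the name in that cell.)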

Once you have the API key, pass it to the SDK. You can do this in two ways:

* Put the key in the `GOOGLE_API_KEY` environment variable (the SDK will automatically pick it up from there).
* Pass the key to `genai.configure(api_key=...)`

In [None]:
# Or use `os.getenv('GOOGLE_API_KEY')` to fetch an environment variable.
# The key is read from the Colab secret named 'gemini-paid' (not 'GOOGLE_API_KEY'),
# so a secret with exactly that name must exist in the Colab secrets manager.
GOOGLE_API_KEY=userdata.get('gemini-paid')

# Hand the key to the SDK.
genai.configure(api_key=GOOGLE_API_KEY)

## List models

Now you're ready to call the Gemini API. Use `list_models` to see the available Gemini models:

* `gemini-pro`: optimized for text-only prompts.
* `gemini-pro-vision`: optimized for text-and-images prompts.

In [None]:
from google.colab import userdata
#userdata.get('gemini-paid')

In [None]:
# Print the name of every model that supports the `generateContent` method.
generation_model_names = (
    candidate.name
    for candidate in genai.list_models()
    if 'generateContent' in candidate.supported_generation_methods
)
for model_name in generation_model_names:
  print(model_name)

models/gemini-1.0-pro-latest
models/gemini-1.0-pro
models/gemini-pro
models/gemini-1.0-pro-001
models/gemini-1.0-pro-vision-latest
models/gemini-pro-vision
models/gemini-1.5-pro-latest
models/gemini-1.5-pro-001
models/gemini-1.5-pro-002
models/gemini-1.5-pro
models/gemini-1.5-pro-exp-0801
models/gemini-1.5-pro-exp-0827
models/gemini-1.5-flash-latest
models/gemini-1.5-flash-001
models/gemini-1.5-flash-001-tuning
models/gemini-1.5-flash
models/gemini-1.5-flash-exp-0827
models/gemini-1.5-flash-002
models/gemini-1.5-flash-8b
models/gemini-1.5-flash-8b-001
models/gemini-1.5-flash-8b-latest
models/gemini-1.5-flash-8b-exp-0827
models/gemini-1.5-flash-8b-exp-0924
models/gemini-exp-1114


Note: For detailed information about the available models, including their capabilities and rate limits, see [Gemini models](https://ai.google.dev/models/gemini). There are options for requesting [rate limit increases](https://ai.google.dev/docs/increase_quota). The rate limit for Gemini-Pro models is 60 requests per minute (RPM).

The `genai` package also supports the PaLM  family of models, but only the Gemini models support the generic, multimodal capabilities of the `generateContent` method.

## Generate text from text inputs

For text-only prompts, you can use the `gemini-pro` model. This notebook loads `gemini-1.5-pro-latest` instead:

In [None]:
# Turn blocking off for each Gemini harm category.
# The former "HARM_CATEGORY_DANGEROUS" entry was removed: it is the legacy
# spelling and duplicated "HARM_CATEGORY_DANGEROUS_CONTENT", which is the
# category name listed for Gemini models.
safety_settings = [
    {"category": category, "threshold": "BLOCK_NONE"}
    for category in (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
]

# Attach the settings to the model so they apply to every `generate_content`
# call made through it. Before this change the list was defined but never
# passed to the model or to any request in the visible notebook.
# (Alternative model name previously tried here: 'gemini-1.0-pro'.)
model = genai.GenerativeModel(
    'gemini-1.5-pro-latest',
    safety_settings=safety_settings,
)

The `generate_content` method can handle a wide variety of use cases, including multi-turn chat and multimodal input, depending on what the underlying model supports. The available models only support text and images as input, and text as output.

In the simplest case, you can pass a prompt string to the <a href="https://ai.google.dev/api/python/google/generativeai/GenerativeModel#generate_content"><code>GenerativeModel.generate_content</code></a> method:

In simple cases, the `response.text` accessor is all you need. To display formatted Markdown text, use the `to_markdown` function:

In [None]:
#to_markdown(response.text)

> As a large language model, I can't tell you the meaning of life. That's a question philosophers and theologians have grappled with for centuries! 
> 
> The meaning of life is a personal and individual question. There's no one right answer.  It's up to each person to decide what gives their life meaning. 
> 
> Here are some things to consider:
> 
> * **Your values:** What is important to you? What do you believe in?
> * **Your purpose:** What do you want to accomplish in life? What impact do you want to make?
> * **Your experiences:** What brings you joy? What challenges you? What makes you feel fulfilled?
> 
> Ultimately, the meaning of life is what you make it. It's about finding what gives your life purpose and makes you feel fulfilled. 
> 
> If you're struggling with this question, it might be helpful to talk to a trusted friend, family member, or therapist. They can offer support and guidance as you explore what gives your life meaning. 


# Now, use Gemini 1.5 Pro (latest) for zero-shot!





In [None]:
# Cell 1: Install dependencies
!pip install -q evaluate


In [None]:
# Cell 2: Imports & Authentication
import os
import json
import random

import PIL.Image
from evaluate import load as load_metric


In [None]:
# Cell 3: Image‐sampling utilities

def sample_method1(color_dir, num_frames=10):
    """
    Evenly select up to `num_frames` color frames from color_dir.

    Frames are files named `<integer>.jpeg` or `<integer>.jpg`
    (e.g. 0.jpeg…N.jpeg) and are ordered by that integer. Files with one of
    those extensions whose stem is not an integer are ignored rather than
    aborting the sort.

    Args:
        color_dir: Directory that holds the color frames.
        num_frames: Maximum number of frames to return (default 10).

    Returns:
        List of frame paths (color_dir joined with the file name) in frame
        order. If the directory holds `num_frames` frames or fewer, each frame
        is returned exactly once; an empty directory yields [].

    Raises:
        ValueError: If num_frames is smaller than 1.
    """
    if num_frames < 1:
        raise ValueError(f"num_frames must be >= 1, got {num_frames}")

    numbered = []
    for name in os.listdir(color_dir):
        if not name.endswith(('.jpeg', '.jpg')):
            continue
        try:
            numbered.append((int(os.path.splitext(name)[0]), name))
        except ValueError:
            # Not a numbered frame (e.g. "thumb.jpg"); skip it.
            continue
    # Sorting the (number, name) pairs orders by frame number and breaks ties
    # by name, so the result does not depend on os.listdir ordering.
    numbered.sort()
    frames = [name for _, name in numbered]

    total = len(frames)
    if total <= num_frames:
        # Too few frames to subsample: spreading indices would only repeat
        # frames (or fail on an empty list), so return each frame once.
        picked = frames
    elif num_frames == 1:
        picked = frames[:1]
    else:
        # num_frames indices spaced across [0, total - 1].
        last = total - 1
        picked = [
            frames[round(i * last / (num_frames - 1))]
            for i in range(num_frames)
        ]
    return [os.path.join(color_dir, name) for name in picked]


def sample_method2(color_dir, depth_dir):
    """
    Return the color frames picked by sample_method1, followed by the path of
    the same-named depth file (.png) inside depth_dir for each of them.
    """
    color_paths = sample_method1(color_dir)
    depth_paths = []
    for color_path in color_paths:
        stem, _ext = os.path.splitext(os.path.basename(color_path))
        depth_paths.append(os.path.join(depth_dir, stem + '.png'))
    return color_paths + depth_paths


In [None]:
# Cell 4: Load test questions & answers
# Read the file explicitly as UTF-8 so loading does not depend on the
# platform's default text encoding.
with open('data/splits/test_qa.json', 'r', encoding='utf-8') as f:
    test_entries = json.load(f)


In [None]:
# Cell 5: Zero‐shot query helper

def ask_gemini(question: str, image_paths: list[str]) -> str:
    """
    Sends images + question to Gemini and returns the answer text.

    Args:
        question: The question; it is sent prefixed with
            "Given the attached images, ".
        image_paths: Paths of the image files to attach. They are sent in
            this order, ahead of the question.

    Returns:
        The response text with surrounding whitespace stripped, or "" when
        reading `response.text` raises ValueError (a warning is emitted).
    """
    # Function-scope imports keep this cell self-contained.
    import warnings
    from contextlib import ExitStack

    prompt = "Given the attached images, " + question

    # PIL.Image.open keeps the underlying file open until the image is closed.
    # ExitStack holds all images open for the duration of the request and
    # closes every one afterwards, even if the request raises.
    with ExitStack() as stack:
        parts = [
            stack.enter_context(PIL.Image.open(path)) for path in image_paths
        ]
        parts.append(prompt)
        response = model.generate_content(parts)

    try:
        return response.text.strip()
    except ValueError:
        # The SDK's `text` accessor raises ValueError when the response holds
        # no usable text part (for example a blocked response). Record an
        # empty answer so one such response does not abort a long run.
        warnings.warn("Gemini returned no text; recording an empty answer.")
        return ""


In [None]:
# Cell 6: Evaluation loop & metrics

meteor = load_metric('meteor')


def _score_subset(preds, refs, idxs):
    """Return (exact-match %, METEOR %) for the entries at positions idxs."""
    sub_preds = [preds[i] for i in idxs]
    sub_refs = [refs[i] for i in idxs]
    # Exact match is a strict string comparison: no case or punctuation folding.
    em = sum(p == r for p, r in zip(sub_preds, sub_refs)) / len(sub_preds) * 100
    met = meteor.compute(
        predictions=sub_preds,
        references=[[r] for r in sub_refs]
    )['meteor'] * 100
    return em, met


def _print_breakdown(title, categories, labels, preds, refs, width):
    """Print one EM/METEOR row for each category that has at least one entry."""
    print(f"\n-- {title} --")
    for cat in categories:
        idxs = [i for i, label in enumerate(labels) if label == cat]
        if not idxs:
            continue
        em_c, mt_c = _score_subset(preds, refs, idxs)
        print(f"{cat:{width}s} | EM: {em_c:5.2f}% | METEOR: {mt_c:5.2f}% | N={len(idxs)}")


def evaluate_method(method: int):
    """
    Run zero-shot evaluation over `test_entries` and print EM / METEOR scores.

    For every entry, frames are sampled from `rgbd-<scene_id>/color` (and
    `rgbd-<scene_id>/depth` for method 2), the question is sent through
    `ask_gemini`, and the answer is compared with the entry's 'answer'.
    Overall scores are printed first, then breakdowns by answer type and by
    spatial subtask.

    Args:
        method: 1 samples color frames only (sample_method1); 2 samples color
            frames plus their depth files (sample_method2).

    Raises:
        ValueError: If method is neither 1 nor 2. Previously any other value
            silently ran method 2.
    """
    if method not in (1, 2):
        raise ValueError(f"method must be 1 or 2, got {method!r}")

    if not test_entries:
        # Nothing to score; avoid dividing by zero below.
        print(f"\n=== Method {method} Results ===")
        print("No test entries to evaluate.")
        return

    preds, refs = [], []
    ans_types, spat_types = [], []

    for entry in test_entries:
        sid = entry['scene_id']   # e.g. "scene0581_00"
        color_dir = f"rgbd-{sid}/color"
        depth_dir = f"rgbd-{sid}/depth"

        # 1 or 2 -> choose sampling strategy
        if method == 1:
            imgs = sample_method1(color_dir)
        else:
            imgs = sample_method2(color_dir, depth_dir)

        # zero-shot answer
        preds.append(ask_gemini(entry['question'], imgs))
        refs.append(entry['answer'])

        # Answer type: 'question_type' if present, else 'answer_type', else 'Other'.
        atype = entry.get('question_type', entry.get('answer_type','Other'))
        ans_types.append(atype)
        # Spatial subtask: explicit field if present; otherwise reuse the
        # answer type when it is itself one of the spatial categories.
        spat = entry.get('spatial_subtask',
                         atype if atype in
                           ['aggregation','placement','spatial','viewpoint']
                         else 'none')
        spat_types.append(spat)

    # overall metrics
    em_overall, met_overall = _score_subset(preds, refs, range(len(refs)))

    print(f"\n=== Method {method} Results ===")
    print(f"Overall EM:     {em_overall:.2f}%")
    print(f"Overall METEOR: {met_overall:.2f}%")

    # per-answer-type
    _print_breakdown("Answer‐Type Breakdown",
                     ['Y/N','Color','Number','Other'],
                     ans_types, preds, refs, 7)

    # per-spatial-subtask
    _print_breakdown("Spatial Subtask Breakdown",
                     ['aggregation','placement','spatial','viewpoint'],
                     spat_types, preds, refs, 11)


In [None]:
# Cell 7: Run both methods
# Evaluate color-only sampling (1) first, then color + depth sampling (2).
for sampling_method in (1, 2):
    evaluate_method(sampling_method)
