# **Assignment 05:** Exploring Gemini 2.0 Video and Audio Analysis

In [1]:
!pip install google-genai



In [2]:
# API Key setup
# The key is read from Colab's `userdata` store under the name 'GOOGLE_API_KEY',
# so the secret itself never appears in the notebook source.
from google.colab import userdata
import time  # used by the upload-polling loop in a later cell
api_key = userdata.get('GOOGLE_API_KEY')

In [3]:
from google import genai
# Shared Gen AI client; the later cells use it for file uploads and model calls.
client = genai.Client(api_key=api_key)

In [21]:
# Local path of the test video; passed to upload_video() in the next cell.
path = "/content/test_video_1.mp4"

In [22]:
def upload_video(video_file_name, poll_interval=10, timeout=None):
  """Upload a video and block until the service has finished processing it.

  Uses the notebook-level `client` and the `time` module imported in an
  earlier cell.

  Args:
    video_file_name: Path of the local video file to upload.
    poll_interval: Seconds to sleep between two state checks (default 10).
    timeout: Maximum number of seconds to wait while the file is in the
      "PROCESSING" state. `None` (default) waits indefinitely.

  Returns:
    The file object returned by the service once its state is no longer
    "PROCESSING".

  Raises:
    ValueError: If the service reports the state "FAILED".
    TimeoutError: If `timeout` elapses while the file is still "PROCESSING".
  """
  # NOTE(review): newer google-genai releases appear to take `file=` instead of
  # `path=` here -- confirm against the installed SDK version.
  video_file = client.files.upload(path=video_file_name)

  # A monotonic clock keeps the deadline immune to wall-clock adjustments.
  deadline = None if timeout is None else time.monotonic() + timeout
  while video_file.state == "PROCESSING":
    if deadline is not None and time.monotonic() >= deadline:
      raise TimeoutError(
          f"Video {video_file_name!r} was still PROCESSING after {timeout} seconds")
    print('Waiting for video to be processed.')
    time.sleep(poll_interval)
    # Re-fetch the file to observe its current state.
    video_file = client.files.get(name=video_file.name or "")

  if video_file.state == "FAILED":
    raise ValueError(
        f"Video processing failed for {video_file_name!r} (state: {video_file.state})")
  print('Video processing complete: ' + (video_file.uri or ""))

  return video_file

# Upload the test video and keep the returned file object for the analysis cells below.
processed_video = upload_video(path)

Waiting for video to be processed.
Video processing complete: https://generativelanguage.googleapis.com/v1beta/files/rv9r1ujgn9gw


In [23]:
# Vision test: ask the model for timecoded captions of each scene in the uploaded video.
from google.genai.types import Content, Part
from IPython.display import Markdown

prompt = """For each scene in this video,
            generate captions that describe the scene along with any spoken text placed in quotation marks.
            Place each caption into an object with the timecode of the caption in the video.
         """

video = processed_video

# Reference the uploaded file by URI; a missing URI or MIME type falls back to "".
caption_video_part = Part.from_uri(
    file_uri=video.uri or "",
    mime_type=video.mime_type or "",
)
caption_user_turn = Content(role="user", parts=[caption_video_part])

# The request carries the video turn followed by the text prompt.
response = client.models.generate_content(
    model="gemini-2.0-flash-exp",
    contents=[caption_user_turn, prompt],
)

# Last expression of the cell: render the reply text as Markdown.
Markdown(response.text)

```json
[
    {
        "timecode": "00:00",
        "caption": "A man with a beard, wearing a gray shirt and headphones, is looking towards the camera holding a white pen with black accents. He is sitting at a table with a floral patterned cloth."
    },
     {
        "timecode": "00:01",
         "caption": "The man adjusts the microphone on his headphones and looks down at the pen he is holding. He says, “Hello, this is Wasif. I want to record this video to test the VN capabilities of Gemini 2.0 model and also the speech to text capability.”"
   },
     {
       "timecode": "00:16",
        "caption": "The man looks at the camera while holding a white pen and continues, “So here is I have a pen. and I have a jar of coffee.”"
    },
    {
        "timecode": "00:23",
         "caption": "The man looks down to place the pen and the jar of coffee down before taking out a notebook and pen. He states, “And now I write few things over a paper.” "
    },
   {
      "timecode": "00:34",
      "caption": "The man looks up and holds up a paper to the camera that reads ‘DATE’ written in black ink."
    },
  {
        "timecode": "00:38",
         "caption":"The man looks down as he starts to draw on the paper with the black pen. "
      },
    {
      "timecode": "00:46",
        "caption":"The man looks up and holds up a paper to the camera that shows a simple drawing of a smiling face with large eyes. He states, “So here is my drawing.”"
    },
      {
        "timecode":"00:50",
        "caption":"The man puts down the paper and starts to draw another image while facing down. "
     },
    {
     "timecode":"01:11",
       "caption": "The man looks up and holds up a paper to the camera with a simple drawing of a house, sun, clouds, and birds. He states, “So here is another drawing.”"
   },
    {
         "timecode": "01:14",
        "caption":"The man puts down the paper and holds a pen in his hands while looking down."
    }
]
```

In [24]:
# Audio test: ask the model for a timestamped JSON transcript of the speech in the same video.
prompt_1 = """Extract and transcribe all spoken text with timestamp in json from the audio of the provided video. Ensure the transcription is accurate and clear."""

video = processed_video

# Reference the uploaded file by URI; a missing URI or MIME type falls back to "".
transcript_video_part = Part.from_uri(
    file_uri=video.uri or "",
    mime_type=video.mime_type or "",
)
transcript_user_turn = Content(role="user", parts=[transcript_video_part])

# The request carries the video turn followed by the text prompt.
response = client.models.generate_content(
    model="gemini-2.0-flash-exp",
    contents=[transcript_user_turn, prompt_1],
)

# Last expression of the cell: render the reply text as Markdown.
Markdown(response.text)

```json
[
  {
    "start": "0:00",
    "end": "0:01",
    "text": "Hello."
  },
  {
    "start": "0:01",
    "end": "0:02",
    "text": "This is Wasif."
  },
  {
    "start": "0:03",
    "end": "0:15",
    "text": "I want to record this video to test the VN capabilities of Gemini 2.0 model and also the speech to text capability."
  },
    {
    "start": "0:16",
    "end": "0:21",
    "text": "So here is I have a pen and I have a"
  },
    {
    "start": "0:21",
    "end": "0:23",
    "text": "jar of coffee."
  },
    {
    "start": "0:24",
    "end": "0:25",
    "text": "And now"
  },
    {
    "start": "0:25",
    "end": "0:29",
    "text": "I write few things over a paper."
  },
  {
    "start": "0:34",
    "end": "0:37",
    "text": "look at it this."
  },
  {
    "start": "0:38",
    "end": "0:39",
    "text": "And now"
  },
    {
    "start": "0:39",
    "end": "0:40",
    "text": "I draw something."
  },
  {
    "start": "0:46",
    "end": "0:49",
    "text": "So here is my drawing."
  },
  {
    "start": "1:12",
    "end": "1:13",
    "text": "Here is another drawing."
  }
]
```