In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Gemini: An Overview of Multimodal Use Cases

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fgemini%2Fuse-cases%2Fintro_multimodal_use_cases.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/gemini/use-cases/intro_multimodal_use_cases.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/use-cases/intro_multimodal_use_cases.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>


| | |
|-|-|
|Author(s) | [Saeed Aghabozorgi](https://github.com/saeedaghabozorgi) |

In [None]:
!pip install --upgrade google-cloud-aiplatform




In [None]:
import sys

# Google Drive can only be mounted from a Colab runtime. Guard the import so
# the notebook does not crash with ModuleNotFoundError when it is run elsewhere
# (the authentication cell further down uses the same check).
if "google.colab" in sys.modules:
    from google.colab import drive

    drive.mount('/content/drive')
else:
    print("Not running in Google Colab; skipping Google Drive mount.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
# Make the Drive notebook folder the working directory so that the relative
# paths used later in this notebook (e.g. "video_data.db") resolve inside it.
# NOTE(review): hard-coded absolute path; it only exists after the Drive mount
# at /content/drive has succeeded.
os.chdir("/content/drive/MyDrive/Colab Notebooks")

## Connect to MongoDB

Database: LiveAI
<br>
Collection: Video

In [None]:
!pip install "pymongo[srv]"




In [None]:
!pip install pytube



In [None]:
from pytube import YouTube


In [None]:
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi
from urllib.parse import quote_plus

import getpass
import os

# Credentials are read from the environment (or prompted for interactively)
# instead of being committed with the notebook.
# NOTE(review): the username/password that used to be hard-coded in this cell
# are still visible in the notebook's saved output and history - rotate them.
username = os.environ.get("MONGODB_USERNAME") or input("MongoDB username: ")
password = os.environ.get("MONGODB_PASSWORD") or getpass.getpass("MongoDB password: ")

# Percent-encode both parts so reserved characters such as "@" or ":" cannot
# corrupt the connection string.
encoded_username = quote_plus(username)
encoded_password = quote_plus(password)

uri = f"mongodb+srv://{encoded_username}:{encoded_password}@cluster0.2nnvu.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0"
# Deliberately not printing `uri` or the password: cell output is saved with
# the notebook and would leak the secret.

# Create a new client and connect to the server
client = MongoClient(uri, server_api=ServerApi('1'))
# Send a ping to confirm a successful connection
try:
    client.admin.command('ping')
    print("Pinged your deployment. You successfully connected to MongoDB!")
except Exception as e:
    # Best-effort check: report the failure but let the notebook continue.
    print(f"Could not connect to MongoDB: {e}")

Encoded Password: Irammad%40123
Connection URI: mongodb+srv://tanniTest:Irammad%40123@cluster0.2nnvu.mongodb.net/?retryWrites=true&w=majority&appName=Cluster0
Pinged your deployment. You successfully connected to MongoDB!


In [None]:
# Create a new database and collection
#db = client["LiveAI"]  # Replace with your desired database name
#collection = db["Video"]  # Replace with your desired collection name

#print("Database and collection created!")

## Overview

In this notebook, you will explore a variety of different use cases enabled by multimodality with Gemini 1.5 Flash.

### Gemini

Gemini is a family of generative AI models developed by Google DeepMind that is designed for multimodal use cases. The Gemini API gives you access to the Gemini 1.0 Pro Vision, Gemini 1.0 Pro, Gemini 1.5 Pro and Gemini 1.5 Flash models.

### Gemini API in Vertex AI

The Gemini API in Vertex AI provides a unified interface for interacting with Gemini models. There are currently four models available in the Gemini API:

- **Gemini 1.0 Pro model** (`gemini-1.0-pro`): Designed to handle natural language tasks, multiturn text and code chat, and code generation.
- **Gemini 1.0 Pro Vision model** (`gemini-1.0-pro-vision`): Supports multimodal prompts. You can include text, images, and video in your prompt requests and get text or code responses.
- **Gemini 1.5 Pro model** (`gemini-1.5-pro`): A foundation model that performs well at a variety of multimodal tasks such as visual understanding, classification, summarization, and creating content from image, audio and video.
- **Gemini 1.5 Flash model** (`gemini-1.5-flash`): A purpose-built multimodal model that provides speed and efficiency for high-volume, quality, cost-effective apps.

For more information, see the [Generative AI on Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview) documentation.


### Objectives

This notebook demonstrates a variety of multimodal use cases that Gemini can be used for.

#### Multimodal use cases

Compared to text-only LLMs, Gemini 1.5's multimodality can be used for many new use cases:

Example use cases with **text and image(s)** as input:

- Detecting objects in photos
- Understanding screens and interfaces
- Understanding of drawing and abstraction
- Understanding charts and diagrams
- Recommendation of images based on user preferences
- Comparing images for similarities, anomalies, or differences

Example use cases with **text and video** as input:

- Generating a video description
- Extracting tags of objects throughout a video
- Extracting highlights/messaging of a video


### Costs

This tutorial uses billable components of Google Cloud:

- Vertex AI

Learn about [Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing) and use the [Pricing Calculator](https://cloud.google.com/products/calculator/) to generate a cost estimate based on your projected usage.


## Getting Started


### Install Vertex AI SDK for Python


In [None]:
#%pip install --upgrade --user google-cloud-aiplatform

### Restart current runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel.

In [None]:
# Restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the following cell to authenticate your environment. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).


In [None]:
import sys

# Detect Colab by checking whether the google.colab package has been loaded
# into this process; only then is the extra sign-in step performed.
running_in_colab = "google.colab" in sys.modules

if running_in_colab:
    from google.colab import auth

    # Authenticate the current user to Google Cloud
    auth.authenticate_user()

## Connect to the SQLite database

In [None]:
import sqlite3
import os
from moviepy.editor import VideoFileClip  # Install moviepy if not already installed: pip install moviepy

# Function to convert video format (optional)
def convert_video(input_path, output_path, codec="libx264"):
    """Re-encode a video file and write the result to ``output_path``.

    Args:
        input_path: Path of the video file to open with ``VideoFileClip``.
        output_path: Path the re-encoded video is written to.
        codec: Codec name forwarded to ``write_videofile`` (default "libx264").

    Returns:
        None. Success and failure are both reported with ``print``; exceptions
        raised while opening or writing the clip are not propagated.
    """
    clip = None
    try:
        clip = VideoFileClip(input_path)
        clip.write_videofile(output_path, codec=codec)
        print(f"Video converted and saved to {output_path}.")
    except Exception as e:
        print(f"Video conversion failed: {e}")
    finally:
        # Always release the clip, including when encoding fails; the original
        # code never closed it.
        if clip is not None:
            clip.close()

# Function to read raw bytes from a video file
def read_video_file(file_path):
    """Return the complete contents of ``file_path`` as bytes.

    On success the number of bytes read is printed. If the file does not exist,
    or any other error occurs while reading, a message is printed and ``None``
    is returned instead of raising.
    """
    try:
        with open(file_path, 'rb') as video_file:
            payload = video_file.read()
        print(f"Read {len(payload)} bytes from {file_path}.")
        return payload
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as exc:
        print(f"Error reading video file: {exc}")
    # Reached only when one of the handlers above ran.
    return None

# Open (or create) the SQLite database file, resolved against the current
# working directory, and keep a connection and cursor for the cells below.
db_name = "video_data.db"  # Change as needed
db_path = os.path.abspath(db_name)
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Schema for the `videos` table: raw bytes go into the `data` BLOB column,
# with free-text `metadata` and `description` columns alongside.
create_videos_table_sql = """
CREATE TABLE IF NOT EXISTS videos (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    type TEXT NOT NULL,
    data BLOB DEFAULT NULL ,
    metadata TEXT DEFAULT NULL, -- Optional metadata field
    description TEXT DEFAULT NULL
);
"""
cursor.execute(create_videos_table_sql)
conn.commit()
print(f"Database created at {db_path} with table 'videos'.")



Database created at /content/drive/MyDrive/Colab Notebooks/video_data.db with table 'videos'.


### Define Google Cloud project information and initialize Vertex AI

Initialize the Vertex AI SDK for Python for your project:

In [None]:
import vertexai

# Google Cloud project and region (editable through the Colab form fields)
PROJECT_ID = "gen-lang-client-0676335328"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Point the Vertex AI SDK at that project and region
vertexai.init(location=LOCATION, project=PROJECT_ID)

### Import libraries


In [None]:
from vertexai.generative_models import GenerationConfig, GenerativeModel, Image, Part

## Use the Gemini 1.5 Flash model

Gemini 1.5 Flash (`gemini-1.5-flash`) is a multimodal model that supports multimodal prompts. You can include text, image(s), and video in your prompt requests and get text or code responses.


### Load Gemini 1.5 Flash model


In [None]:
# Create a GenerativeModel handle for the "gemini-1.5-flash" model id.
multimodal_model = GenerativeModel("gemini-1.5-flash")

### Define helper functions


In [None]:
import http.client
import typing
import urllib.request

import IPython.display
from PIL import Image as PIL_Image
from PIL import ImageOps as PIL_ImageOps


def display_images(
    images: typing.Iterable[Image],
    max_width: int = 600,
    max_height: int = 350,
) -> None:
    """Show each image inline, shrinking any that exceed the given bounds.

    Args:
        images: Images to display; each one's ``_pil_image`` attribute is used.
        max_width: Largest width, in pixels, shown without resizing.
        max_height: Largest height, in pixels, shown without resizing.
    """
    bounds = (max_width, max_height)
    for item in images:
        picture = typing.cast(PIL_Image.Image, item._pil_image)
        # RGB is supported by all Jupyter environments (e.g. RGBA is not yet)
        if picture.mode != "RGB":
            picture = picture.convert("RGB")
        width, height = picture.size
        fits_within_bounds = width <= max_width and height <= max_height
        if not fits_within_bounds:
            # Scale down so the notebook shows a smaller image
            picture = PIL_ImageOps.contain(picture, bounds)
        IPython.display.display(picture)


def get_image_bytes_from_url(image_url: str) -> bytes:
    """Open ``image_url`` and return the entire response body as bytes."""
    with urllib.request.urlopen(image_url) as raw_response:
        # The cast only informs type checkers; it does nothing at runtime.
        http_response = typing.cast(http.client.HTTPResponse, raw_response)
        return http_response.read()


def load_image_from_url(image_url: str) -> Image:
    """Download the bytes at ``image_url`` and build an ``Image`` from them."""
    return Image.from_bytes(get_image_bytes_from_url(image_url))


def display_content_as_image(content: str | Image | Part) -> bool:
    """Display ``content`` if it is an ``Image``.

    Returns:
        True when the content was an ``Image`` and was displayed, else False.
    """
    if isinstance(content, Image):
        display_images([content])
        return True
    return False


def display_content_as_video(content: str | Image | Part) -> bool:
    """Embed a video player for ``content`` if it is a ``Part``.

    The part's ``gs://`` file URI is rewritten to the matching
    ``https://storage.googleapis.com/`` URL before it is displayed.

    Returns:
        True when the content was a ``Part`` and a player was displayed,
        else False.
    """
    if isinstance(content, Part):
        gcs_uri = typing.cast(Part, content).file_data.file_uri
        file_path = gcs_uri.removeprefix("gs://")
        player = IPython.display.Video(
            f"https://storage.googleapis.com/{file_path}", width=600
        )
        IPython.display.display(player)
        return True
    return False


def print_multimodal_prompt(contents: list[str | Image | Part]):
    """
    Render every element of a multimodal prompt for easy reading.

    Each item is shown as an image when possible, otherwise as a video,
    and is printed as plain text when neither applies.
    """
    for item in contents:
        # `or` short-circuits, so the video helper runs only if the image one declined.
        shown = display_content_as_image(item) or display_content_as_video(item)
        if not shown:
            print(item)

## Generating a video description

Gemini can also generate a description of a video, for example a transcript of what is said in it:

> Video: https://storage.googleapis.com/github-repo/img/gemini/multimodality_usecases_overview/mediterraneansea.mp4

In [None]:
def generate_text_from_stream(contents, multimodal_model):
    """
    Collect a streamed ``generate_content`` response into a single string.

    Args:
        contents (list): The input content (e.g., video or prompt) to pass to the model.
        multimodal_model (GenerativeModel): The initialized multimodal model.

    Returns:
        str: The text of every streamed chunk, concatenated in arrival order.
            Chunks without a ``text`` attribute are skipped.

    Raises:
        RuntimeError: If the request or the reading of a chunk fails. The
            original exception is attached as ``__cause__``.
    """
    try:
        # Call the model's generate_content with stream=True
        responses = multimodal_model.generate_content(contents, stream=True)

        text_output = []
        for response in responses:
            # Read `.text` once per chunk; a chunk lacking the attribute is skipped.
            try:
                chunk_text = response.text
            except AttributeError:
                continue
            text_output.append(chunk_text)

        # A Python str is already Unicode, so the former encode/decode
        # round-trip was a no-op and has been removed.
        return "".join(text_output)

    except Exception as e:
        # Chain explicitly so the root cause is kept on the re-raised error.
        raise RuntimeError(f"Error converting stream to text: {e}") from e

In [None]:
#db = client["LiveAI"]
#collection = db["Video"]

In [None]:
import base64
import vertexai
from vertexai.generative_models import GenerativeModel, Part, SafetySetting


def generate():
    """Ask Gemini to transcribe ``video1`` and return the transcript.

    Reads the module-level ``video1``, ``generation_config`` and
    ``safety_settings`` at call time, so they must be defined before this
    function is called. Each streamed chunk is echoed to stdout as it arrives.

    Returns:
        str: The text of all streamed chunks, concatenated in arrival order.
    """
    vertexai.init(project="gen-lang-client-0676335328", location="us-central1")
    model = GenerativeModel(
        "gemini-1.5-flash-002",
    )
    responses = model.generate_content(
        ["""Transcribe the video""", video1],
        generation_config=generation_config,
        safety_settings=safety_settings,
        stream=True,
    )
    text_output = []
    # Print and collect in a single pass. The original looped over `responses`
    # twice, and its second (printing) loop produced no output because the
    # stream had already been consumed by the first.
    for response in responses:
        if hasattr(response, "text"):
            chunk_text = response.text
            print(chunk_text, end="")
            text_output.append(chunk_text)

    # A Python str is already Unicode; no encode/decode round-trip is needed.
    return "".join(text_output)


# Video to transcribe, referenced by its Cloud Storage URI
video1 = Part.from_uri(uri="gs://liveai/W1D1.mp4", mime_type="video/mp4")

# Sampling parameters forwarded to generate_content
generation_config = {
    "max_output_tokens": 8192,
    "temperature": 1,
    "top_p": 0.95,
}

# One SafetySetting per harm category, every one with its block threshold OFF
safety_settings = [
    SafetySetting(
        category=harm_category,
        threshold=SafetySetting.HarmBlockThreshold.OFF,
    )
    for harm_category in (
        SafetySetting.HarmCategory.HARM_CATEGORY_HATE_SPEECH,
        SafetySetting.HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT,
        SafetySetting.HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT,
        SafetySetting.HarmCategory.HARM_CATEGORY_HARASSMENT,
    )
]

# Run the transcription and keep the result for the cells below
generated_text = generate()

In [None]:
generated_text

"Welcome back. So we are in this uh lecture. We're gonna be talking about a few foundational stones of machine learning and deep learning. And we're gonna start obviously with the king of deep learning today. That is obviously not TensorFlow or PyTorch, but it is your friend Python. So I am Terry, obviously and as uh you can imagine, I'm super excited to have you in in the class. Um let's sort of um take a high level view of what Python is and what it can do. So Python has become a very, very popular library. Um especially uh you know, this is a few years old, but uh the whole idea is to kind of see how Python has become so popular as a language. You can see here already Python kind of, you know, taking the cake and and moving ahead uh in this field. So it's it's a very popular library uh that has made machine learning and deep learning very, very exciting field for all of us. What else is out there? Let's take a look. So it has an awesome ecosystem. So that is the reason number one. S

In [None]:
generated_text

"Welcome back. So we are in this uh lecture. We're gonna be talking about a few foundational stones of machine learning and deep learning. And we're gonna start obviously with the king of deep learning today. That is obviously not TensorFlow or PyTorch, but it is your friend Python. So I am Terry, obviously and as uh you can imagine, I'm super excited to have you in in the class. Um let's sort of um take a high level view of what Python is and what it can do. So Python has become a very, very popular library. Um especially uh you know, this is a few years old, but uh the whole idea is to kind of see how Python has become so popular as a language. You can see here already Python kind of, you know, taking the cake and and moving ahead uh in this field. So it's it's a very popular library uh that has made machine learning and deep learning very, very exciting field for all of us. What else is out there? Let's take a look. So it has an awesome ecosystem. So that is the reason number one. S

In [None]:
# Free-text label stored in the `metadata` column of the saved row.
# The prompt now ends with ": " so the typed value no longer runs into it.
metadata = input("MetaData from user: ")

MetaData from userweek1


In [None]:
!pwd

/content/drive/MyDrive/Colab Notebooks


In [None]:
# Function to save raw bytes as BLOB in the database
def save_video_to_db(file_name, raw_data, metadata, description):
    """Insert one row into the ``videos`` table and commit it.

    Uses the module-level ``cursor`` and ``conn``.

    Args:
        file_name: Value stored in the ``type`` column.
        raw_data: Bytes stored in the ``data`` BLOB column.
        metadata: Value stored in the ``metadata`` column.
        description: Value stored in the ``description`` column; it is also
            printed after a successful insert.

    Returns:
        None. Any exception is caught and reported with ``print``.
    """
    insert_sql = """
            INSERT INTO videos (type, data, metadata, description)
            VALUES (?, ?, ?, ?)
            """
    row_values = (file_name, raw_data, metadata, description)
    try:
        cursor.execute(insert_sql, row_values)
        conn.commit()
        print(description)
        print(f"Video {file_name} saved to database.")
    except Exception as exc:
        print(f"Failed to save video to database: {exc}")


def retrieve_video_from_db(file_name, output_path):
    """Write a stored video back to disk and return its description.

    Looks up the first row of ``videos`` whose ``type`` equals ``file_name``
    using the module-level ``cursor``.

    Args:
        file_name: Value to match against the ``type`` column.
        output_path: File the BLOB is written to.

    Returns:
        The row's description when a video was written; ``None`` when no row
        matches, when the matching row has no video data, or when an error
        occurs (errors are reported with ``print``, not raised).
    """
    try:
        # Fetch the data and description
        cursor.execute(
            "SELECT data, description FROM videos WHERE type = ?",
            (file_name,)
        )
        result = cursor.fetchone()
        if result is None:
            print(f"No video found with name {file_name}.")
            return None

        video_bytes, description = result
        if video_bytes is None:
            # The `data` column is nullable. Check before opening the output
            # file so an existing file is not truncated and no empty file is
            # left behind, which is what the original code did.
            print(f"Entry {file_name} has no video data; nothing written to {output_path}.")
            return None

        # Write the video data to the output file
        with open(output_path, 'wb') as file:
            file.write(video_bytes)
        print(f"Video {file_name} retrieved and saved as {output_path}.")
        print(f"Description: {description}")

        return description
    except Exception as e:
        print(f"Failed to retrieve video: {e}")
        return None


# Example usage: store a video file in the database, then read it back out.
# Relies on `metadata` and `generated_text` defined by earlier cells.
input_video_path = "W1D1.mov"  # Source recording; only used by the (disabled) conversion step below
converted_video_path = "converted_example.mp4"
output_video_path = "retrieved_example.mp4"

# Optional: convert the source video to a different format.
# Disabled here, so `converted_video_path` must already exist on disk.
#convert_video(input_video_path, converted_video_path)

# Read the video file as raw bytes (None if it could not be read)
video_data = read_video_file(converted_video_path)

# Save the video to the database; skipped when nothing was read (None or empty bytes)
if video_data:
    save_video_to_db("Video", video_data, metadata,generated_text)

# Retrieve the video from the database and show its stored description
retrieved_text=retrieve_video_from_db("Video", output_video_path)
print(retrieved_text)




Read 13132450 bytes from converted_example.mp4.
Welcome back. So we are in this uh lecture. We're gonna be talking about a few foundational stones of machine learning and deep learning. And we're gonna start obviously with the king of deep learning today. That is obviously not TensorFlow or PyTorch, but it is your friend Python. So I am Terry, obviously and as uh you can imagine, I'm super excited to have you in in the class. Um let's sort of um take a high level view of what Python is and what it can do. So Python has become a very, very popular library. Um especially uh you know, this is a few years old, but uh the whole idea is to kind of see how Python has become so popular as a language. You can see here already Python kind of, you know, taking the cake and and moving ahead uh in this field. So it's it's a very popular library uh that has made machine learning and deep learning very, very exciting field for all of us. What else is out there? Let's take a look. So it has an awesome

In [None]:

# Function to add a new video entry
def add_video_entry(type, raw_data, metadata, description):
    """Insert one row into the ``videos`` table and commit it.

    Uses the module-level ``cursor`` and ``conn``.

    Args:
        type: Value stored in the ``type`` column (shadows the builtin inside
            this function only).
        raw_data: Value stored in the ``data`` BLOB column; may be ``None``.
        metadata: Value stored in the ``metadata`` column.
        description: Value stored in the ``description`` column.

    Returns:
        None. Any exception is caught and reported with ``print``.
    """
    insert_sql = """
        INSERT INTO videos (type, data, metadata, description)
        VALUES (?, ?, ?, ?)
        """
    row_values = (type, raw_data, metadata, description)
    try:
        cursor.execute(insert_sql, row_values)
        conn.commit()
        print(f"New text entry '{type}' added successfully.")
    except Exception as exc:
        print(f"Failed to add video entry: {exc}")

file_path = 'W1D5.txt'

# Read the whole text file, decoding it as UTF-8
with open(file_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Echo the full file content into the cell output
print(content)
# Values for the new row: a text-only entry whose description is the file content.
# NOTE(review): the file is named W1D5.txt but the metadata says "week1day1" - confirm which is intended.
type_text = "Text"
video_metadata = "week1day1"
text_description = content
# Text entries carry no binary payload, so the BLOB column is left NULL
text_data = None  # No binary data for a text entry

# Add the new entry
add_video_entry(type_text, text_data, video_metadata, text_description)



Working on your first Machine Learning problem

In this end-to-end Python machine learning tutorial, you will learn how to use Scikit-Learn to build and tune a supervised learning model.

The goal of this notebook is introduce you to one of the most flexible and useful libraries for Machine Learning in Python. We will skip the theory and math in this tutorial, but we will still recommend great resources for learning those. Moreover, we will give you a guideline on how to structure a ML problem given a task and a dataset.

Workflow stages

The notebook workflow goes through six stages:

Question or problem definition;
Acquire training and testing data;
Wrangle, prepare, clean the data;
Analyze, identify patterns, and explore the data;
Model, predict and solve the problem;
Visualize, report, and present the problem solving steps and final solution.

This is a generalized approach to tackle a ML problem. However, sometimes you may want to:

Combine multiple workflow stages i.e. analyzing 

In [None]:
# Close the database connection
conn.close()

In [None]:
# # Restart kernel after installs so that your environment can access the new packages
# import IPython

# app = IPython.Application.instance()
# app.kernel.do_shutdown(True)