# YouTube Video Analysis with Gemini

steps to complete:

* Use the [YouTube Data API](https://developers.google.com/youtube/v3/getting-started#before-you-start) to find videos of interest, including by search query and channel.
* Summarize YouTube videos from a specific query and channel using Gemini
* Use batch prediction to extract a specific set of structured outputs from a larger set of YouTube videos
* Get information about and extract insights from those videos by aggregating Gemini's extracted results in BigQuery

### env config

In [90]:
import os
import sys
import ipykernel

# from dotenv import load_dotenv
# load_dotenv()  # this loads the .env script for use below

PROJECT_ID = os.getenv("PROJECT_ID")
LOCATION = os.getenv("LOCATION")
PREFIX = os.getenv("PREFIX")

print(f"PROJECT_ID: {PROJECT_ID}")
print(f"LOCATION: {LOCATION}")
print(f"PREFIX: {PREFIX}")

PROJECT_ID: hybrid-vertex
LOCATION: us-central1
PREFIX: zghost_v1


### imports

In [None]:
import json
import time
import pandas as pd
from pprint import pprint

from IPython.display import HTML, Markdown, display
from google.cloud import bigquery, secretmanager
import googleapiclient.discovery
import googleapiclient.errors

import vertexai
from vertexai.batch_prediction import BatchPredictionJob
from vertexai.generative_models import GenerativeModel, Part

print(f"Vertex AI SDK version = {vertexai.__version__}")

Vertex AI SDK version = 1.85.0


### Config Gemini models

In [None]:
# Set Gemini Flash and Pro models to be used in this notebook
GEMINI_FLASH_MODEL_ID =  "gemini-1.5-flash-002" # "gemini-2.0-flash-001"
GEMINI_PRO_MODEL_ID = "gemini-1.5-pro-002"

gemini_flash_model = GenerativeModel(GEMINI_FLASH_MODEL_ID)
gemini_pro_model = GenerativeModel(GEMINI_PRO_MODEL_ID)

### YouTube Data API key

In [None]:
from google.cloud import secretmanager

# secret manager client
sm_client = secretmanager.SecretManagerServiceClient()

_SECRET_ID = 'projects/934903580331/secrets/yt-data-api'
_SECRET_VERSION = '{}/versions/1'.format(_SECRET_ID)
_SECRET_NAME = sm_client.secret_path(PROJECT_ID, _SECRET_ID)
print(f"_SECRET_NAME: {_SECRET_NAME}\n")

response = sm_client.access_secret_version(request={"name": _SECRET_VERSION})
# print(f"response: {response}")

YOUTUBE_DATA_API_KEY = response.payload.data.decode("UTF-8")
# print(f"YOUTUBE_DATA_API_KEY: {YOUTUBE_DATA_API_KEY}")

_SECRET_NAME: projects/hybrid-vertex/secrets/projects/934903580331/secrets/yt-data-api



### Setup BigQuery client, dataset, and tables

In [None]:
# Create BQ client
BQ_CLIENT = bigquery.Client(project=PROJECT_ID)

# Function to run BQ query and return results as data frame
def get_bq_query_results_as_df(query_text):
    bq_results_table = BQ_CLIENT.query(query_text).to_dataframe()
    return bq_results_table

# Names of BQ dataset and tables to be created/used
BQ_DATASET = "youtube_video_analysis"
BATCH_PREDICTION_REQUESTS_TABLE = (
    "video_analysis_batch_requests"
)
BATCH_PREDICTION_RESULTS_TABLE = (
    "video_analysis_batch_results"
)

# Create BQ dataset if it doesn't already exist
create_dataset_if_nec_query = f"""
    CREATE SCHEMA IF NOT EXISTS `{BQ_DATASET}`
    OPTIONS(
      location='{LOCATION}'
    );
    """

get_bq_query_results_as_df(create_dataset_if_nec_query)

# YouTube Data API 

* [API Reference](https://developers.google.com/youtube/v3/docs)

## find videos by query

In [None]:
search_query = "trailer park boys"

video_duration_type = (
    "short"  # ['any', 'long', 'medium', 'short']
)

# To get newer/fresher videos, modify to lower # of days
published_within_last_X_days = 30 

# Different ways to order results
order_criteria = "relevance"  # ['date', 'rating', 'relevance', 'title', 'viewCount']

# of results to be returned - max is 50 results on 1 API call
num_results = 3

print(f"search_query: {search_query}")
print(f"video_duration_type: {video_duration_type}")
print(f"published_within_last_X_days: {published_within_last_X_days}")
print(f"order_criteria: {order_criteria}")
print(f"num_results: {num_results}")

search_query: trailer park boys
video_duration_type: short
published_within_last_X_days: 30
order_criteria: relevance
num_results: 4


In [None]:
def get_yt_data_api_response_for_search_query(
    query, 
    video_duration, 
    max_num_days_ago,
    channel_id, 
    video_order, 
    num_video_results
):
    api_service_name = "youtube"
    api_version = "v3"
    developer_key = YOUTUBE_DATA_API_KEY
    youtube = googleapiclient.discovery.build(
        api_service_name, api_version, developerKey=developer_key
    )
    
    published_after_timestamp = (
        (pd.Timestamp.now() - pd.DateOffset(days=max_num_days_ago))
        .tz_localize("UTC")
        .isoformat()
    )

    # Using Search:list - https://developers.google.com/youtube/v3/docs/search/list
    yt_data_api_request = youtube.search().list(
        part="id,snippet",
        type="video",
        q=query,
        videoDuration=video_duration,
        maxResults=num_video_results,
        publishedAfter=published_after_timestamp,
        channelId=channel_id,
        order=video_order,
    )
    yt_data_api_response = yt_data_api_request.execute()

    return yt_data_api_response

In [23]:
yt_data_api_results = get_yt_data_api_response_for_search_query(
    query=search_query,
    video_duration=video_duration_type,
    max_num_days_ago=published_within_last_X_days,
    channel_id=None,
    video_order=order_criteria,
    num_video_results=num_results,
)

print(yt_data_api_results)

{'kind': 'youtube#searchListResponse', 'etag': 'pdonbvkjq3OfwogTrzdXnEQsRg8', 'nextPageToken': 'CAQQAA', 'regionCode': 'ZZ', 'pageInfo': {'totalResults': 5381, 'resultsPerPage': 4}, 'items': [{'kind': 'youtube#searchResult', 'etag': 'Q48eg7EnewvJJe_vEz33obHOivU', 'id': {'kind': 'youtube#video', 'videoId': 'o8iYmeqXU20'}, 'snippet': {'publishedAt': '2025-03-18T16:00:06Z', 'channelId': 'UCvW9uSNy6Lytcnib1CdXrow', 'title': 'Gettin&#39; Cooked With Ricky - Sneak Preview!', 'description': "Where the best setting is BAKED... Gettin' Cooked With Ricky launches on SwearNet Friday, March 28! #trailerparkboys ...", 'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/o8iYmeqXU20/default.jpg', 'width': 120, 'height': 90}, 'medium': {'url': 'https://i.ytimg.com/vi/o8iYmeqXU20/mqdefault.jpg', 'width': 320, 'height': 180}, 'high': {'url': 'https://i.ytimg.com/vi/o8iYmeqXU20/hqdefault.jpg', 'width': 480, 'height': 360}}, 'channelTitle': 'Trailer Park Boys', 'liveBroadcastContent': 'none', 'publi

In [86]:
yt_data_api_results

{'kind': 'youtube#searchListResponse',
 'etag': 'pdonbvkjq3OfwogTrzdXnEQsRg8',
 'nextPageToken': 'CAQQAA',
 'regionCode': 'ZZ',
 'pageInfo': {'totalResults': 5381, 'resultsPerPage': 4},
 'items': [{'kind': 'youtube#searchResult',
   'etag': 'Q48eg7EnewvJJe_vEz33obHOivU',
   'id': {'kind': 'youtube#video', 'videoId': 'o8iYmeqXU20'},
   'snippet': {'publishedAt': '2025-03-18T16:00:06Z',
    'channelId': 'UCvW9uSNy6Lytcnib1CdXrow',
    'title': 'Gettin&#39; Cooked With Ricky - Sneak Preview!',
    'description': "Where the best setting is BAKED... Gettin' Cooked With Ricky launches on SwearNet Friday, March 28! #trailerparkboys ...",
    'thumbnails': {'default': {'url': 'https://i.ytimg.com/vi/o8iYmeqXU20/default.jpg',
      'width': 120,
      'height': 90},
     'medium': {'url': 'https://i.ytimg.com/vi/o8iYmeqXU20/mqdefault.jpg',
      'width': 320,
      'height': 180},
     'high': {'url': 'https://i.ytimg.com/vi/o8iYmeqXU20/hqdefault.jpg',
      'width': 480,
      'height': 360}},

## Convert response to dataframe

In [24]:
def convert_yt_data_api_response_to_df(yt_data_api_response):

    # Convert API response into data frame for further analysis
    yt_data_api_response_items_df = pd.json_normalize(yt_data_api_response["items"])

    yt_data_api_response_df = yt_data_api_response_items_df.assign(
        videoURL="https://www.youtube.com/watch?v="
        + yt_data_api_response_items_df["id.videoId"]
    )[
        [
            "id.videoId",
            "videoURL",
            "snippet.title",
            "snippet.description",
            "snippet.channelId",
            "snippet.channelTitle",
            "snippet.publishedAt",
            "snippet.thumbnails.default.url",
        ]
    ].rename(
        columns={
            "id.videoId": "videoId",
            "snippet.title": "videoTitle",
            "snippet.description": "videoDescription",
            "snippet.channelId": "channelId",
            "snippet.channelTitle": "channelTitle",
            "snippet.publishedAt": "publishedAt",
            "snippet.thumbnails.default.url": "thumbnailURL",
        }
    )

    return yt_data_api_response_df

In [25]:
yt_data_api_results_df = convert_yt_data_api_response_to_df(yt_data_api_results)

display(yt_data_api_results_df.head())

Unnamed: 0,videoId,videoURL,videoTitle,videoDescription,channelId,channelTitle,publishedAt,thumbnailURL
0,o8iYmeqXU20,https://www.youtube.com/watch?v=o8iYmeqXU20,Gettin&#39; Cooked With Ricky - Sneak Preview!,Where the best setting is BAKED... Gettin' Coo...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-03-18T16:00:06Z,https://i.ytimg.com/vi/o8iYmeqXU20/default.jpg
1,Aei-BO5SmLM,https://www.youtube.com/watch?v=Aei-BO5SmLM,Park After Dark S6E43 - Theory Of Fuckativity,Now streaming at https://bit.ly/PAD6-ep43 and ...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-03-14T11:30:06Z,https://i.ytimg.com/vi/Aei-BO5SmLM/default.jpg
2,5tZkwb3bsmw,https://www.youtube.com/watch?v=5tZkwb3bsmw,Park After Dark S6E41 - The Fuck You Stick,Now streaming at https://bit.ly/PAD6-ep41 and ...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-02-28T12:19:32Z,https://i.ytimg.com/vi/5tZkwb3bsmw/default.jpg
3,E55Pw55MvGA,https://www.youtube.com/watch?v=E55Pw55MvGA,Park After Dark S6E44 - Gimme The F**king Liquor!,Now streaming at https://bit.ly/PAD6-ep44 and ...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-03-25T01:15:01Z,https://i.ytimg.com/vi/E55Pw55MvGA/default.jpg


## Get summary from Gemini for each video

In [None]:
def get_gemini_summary_from_youtube_video_url(video_url):
    video_summary_prompt = "Summarize this video."

    # Gemini Pro for highest quality (change to Flash if latency/cost are of concern)
    video_summary_response = gemini_pro_model.generate_content(
        [
            video_summary_prompt, 
            Part.from_uri(mime_type="video/webm", uri=video_url)
        ]
    )

    summary_text = video_summary_response.text

    return summary_text

In [27]:
yt_data_api_results_df["geminiVideoSummary"] = yt_data_api_results_df["videoURL"].apply(
    get_gemini_summary_from_youtube_video_url
)

yt_data_api_results_df

Unnamed: 0,videoId,videoURL,videoTitle,videoDescription,channelId,channelTitle,publishedAt,thumbnailURL,geminiVideoSummary
0,o8iYmeqXU20,https://www.youtube.com/watch?v=o8iYmeqXU20,Gettin&#39; Cooked With Ricky - Sneak Preview!,Where the best setting is BAKED... Gettin' Coo...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-03-18T16:00:06Z,https://i.ytimg.com/vi/o8iYmeqXU20/default.jpg,This trailer introduces “Gettin’ Cooked with R...
1,Aei-BO5SmLM,https://www.youtube.com/watch?v=Aei-BO5SmLM,Park After Dark S6E43 - Theory Of Fuckativity,Now streaming at https://bit.ly/PAD6-ep43 and ...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-03-14T11:30:06Z,https://i.ytimg.com/vi/Aei-BO5SmLM/default.jpg,"In this clip from Trailer Park Boys, the chara..."
2,5tZkwb3bsmw,https://www.youtube.com/watch?v=5tZkwb3bsmw,Park After Dark S6E41 - The Fuck You Stick,Now streaming at https://bit.ly/PAD6-ep41 and ...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-02-28T12:19:32Z,https://i.ytimg.com/vi/5tZkwb3bsmw/default.jpg,Three men discuss using a Pringle’s can as a s...
3,E55Pw55MvGA,https://www.youtube.com/watch?v=E55Pw55MvGA,Park After Dark S6E44 - Gimme The F**king Liquor!,Now streaming at https://bit.ly/PAD6-ep44 and ...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-03-25T01:15:01Z,https://i.ytimg.com/vi/E55Pw55MvGA/default.jpg,This Trailer Park Boys ad features a three-pac...


In [None]:
pprint(yt_data_api_results_df['geminiVideoSummary'].iloc[0])


('This trailer introduces “Gettin’ Cooked with Ricky” and Randy. The show is '
 'about cooking when you’re high. They make several dishes, including donair '
 'spaghetti, cheeseburger meatloaf, and pickle pizza. The cooking is often '
 'haphazard and involves unconventional kitchen implements, like a drill as a '
 'mixer, and a reciprocating saw to cut cheese. The trailer contains much '
 'profanity. The show is slated to start on March 28th.')


In [32]:
# Pick 1 video above to display video and its summary together
sample_video = yt_data_api_results_df.sample(1).iloc[0].to_dict()
sample_video_embed_url = sample_video["videoURL"].replace("/watch?v=", "/embed/")

# Create HTML code to directly embed video
sample_video_embed_html_code = f"""


"""

# Display embedded YouTube video
display(HTML(sample_video_embed_html_code))

display(
    Markdown(
        f"Summary of Video from Gemini:{sample_video['geminiVideoSummary']}"
    )
)

Summary of Video from Gemini:This trailer introduces “Gettin’ Cooked with Ricky” and Randy. The show is about cooking when you’re high. They make several dishes, including donair spaghetti, cheeseburger meatloaf, and pickle pizza. The cooking is often haphazard and involves unconventional kitchen implements, like a drill as a mixer, and a reciprocating saw to cut cheese. The trailer contains much profanity. The show is slated to start on March 28th.

# Analyze larger set of video in batch

return top 50 videos from a given channel ID (e.g., `UCvW9uSNy6Lytcnib1CdXrow`)

to get channel ID (manually):
* Browse to the channel page
* Press Ctrl-U to view source
* Search for `<link rel="canonical" href="https://www.youtube.com/channel/UC` (external IDs for normal channels always start with UC). Or simply `/UC`, but there could be multiple values; the one in `<link rel="canonical" href="https://www.youtube.com/channel/UC[...]">` should reliably give the unique ID.

In [40]:
# Intentionally leaving default empty to search for all videos w/in a channel
search_query = ""

video_duration_type = (
    "any" # ['any', 'long', 'medium', 'short']
)

published_within_last_X_days = 365

# for [Trailer Park Boys](https://www.youtube.com/@trailerparkboys)
channel_id = "UCvW9uSNy6Lytcnib1CdXrow"

order_criteria = "viewCount" # ['date', 'rating', 'relevance', 'title', 'viewCount']

# Max is 50 results on 1 API call
num_results = 50

print(f"search_query: {search_query}")
print(f"video_duration_type: {video_duration_type}")
print(f"published_within_last_X_days: {published_within_last_X_days}")
print(f"order_criteria: {order_criteria}")
print(f"num_results: {num_results}")

search_query: 
video_duration_type: any
published_within_last_X_days: 365
order_criteria: viewCount
num_results: 50


In [43]:
yt_data_api_channel_results = get_yt_data_api_response_for_search_query(
    query=search_query,
    video_duration=video_duration_type,
    max_num_days_ago=published_within_last_X_days,
    channel_id=channel_id,
    video_order=order_criteria,
    num_video_results=num_results,
)

yt_data_api_channel_results_df = convert_yt_data_api_response_to_df(
    yt_data_api_channel_results
)

print(f"dataframe shape: {yt_data_api_channel_results_df.shape}")
display(yt_data_api_channel_results_df.head())

dataframe shape: (50, 8)


Unnamed: 0,videoId,videoURL,videoTitle,videoDescription,channelId,channelTitle,publishedAt,thumbnailURL
0,OnMX4fIgSno,https://www.youtube.com/watch?v=OnMX4fIgSno,Trailer Park Boys Podcast Episode 55 - Ricky I...,Another vintage Trailer Park Boys Podcast - wi...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2024-07-04T17:04:40Z,https://i.ytimg.com/vi/OnMX4fIgSno/default.jpg
1,PhIzGATStls,https://www.youtube.com/watch?v=PhIzGATStls,Bubbles And The Shitrockers - I Only Got Eyes ...,Country music just got more DECENT!! Bubbles a...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2024-09-27T16:00:33Z,https://i.ytimg.com/vi/PhIzGATStls/default.jpg
2,mK5oSQObfSg,https://www.youtube.com/watch?v=mK5oSQObfSg,Standing On The Shoulders Of Kitties - Now ava...,Standing On The Shoulders Of Kitties now avail...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2024-10-17T15:49:31Z,https://i.ytimg.com/vi/mK5oSQObfSg/default.jpg
3,VuAICCz2iBU,https://www.youtube.com/watch?v=VuAICCz2iBU,Trailer Park Boys Chips at Giant Tiger!,The Boys got their faces - and chips - on the ...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2024-06-14T14:11:36Z,https://i.ytimg.com/vi/VuAICCz2iBU/default.jpg
4,o8iYmeqXU20,https://www.youtube.com/watch?v=o8iYmeqXU20,Gettin&#39; Cooked With Ricky - Sneak Preview!,Where the best setting is BAKED... Gettin' Coo...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-03-18T16:00:06Z,https://i.ytimg.com/vi/o8iYmeqXU20/default.jpg


##  Gemini video extraction task

Specify `system instruction`, `prompt`, and `response schema` for [controlled generation](https://cloud.google.com/vertex-ai/generative-ai/docs/multimodal/control-generated-output) (i.e. creating structured outputs for further analysis)

Create single Gemini cURL request per row - 1 for each YouTube video - in order to set up for using batch prediction.

In [None]:
# Set up pieces (system instruction, prompt, response schema, config) for Gemini video extraction API calls

video_extraction_system_instruction = """You are a video analyst that carefully looks 
    through all frames of provided videos, extracting out the pieces necessary to respond to
    user prompts. Make sure to look through and listen to the whole video, start to finish.
    Only reference information in the video itself in your response."""

video_extraction_prompt = """Provide a 2-3 sentence summary of the key themes from this video,
    and also provide a list of each character, brand, and location that is referenced or shown.
    Make sure to count only those involved in the actual video, and output only 1 entity per row."""


video_extraction_response_schema = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "summary": {"type": "string"},
            "references": {
                "type": "array",
                "items": {
                    "type": "object",
                    "properties": {
                        "entity_name": {"type": "string"},
                        "entity_type": {
                            "type": "string",
                            "enum": ["character", "brand", "location"],
                        },
                    },
                },
            },
        },
    },
}

video_extraction_generation_config = {
    "temperature": 0.0,
    "max_output_tokens": 8192,
    "response_mime_type": "application/json",
    "response_schema": video_extraction_response_schema,
}

# Function to build CURL request for given YT link, using pieces above
def get_video_extraction_curl_request_for_yt_video_link(youtube_video_link):
    video_extraction_curl_request_dict = {
        "system_instruction": {
            "parts": [{"text": video_extraction_system_instruction}]
        },
        "contents": [
            {
                "role": "user",
                "parts": [
                    {"text": video_extraction_prompt},
                    {
                        "file_data": {
                            "mimeType": "video/*",
                            "fileUri": youtube_video_link,
                        }
                    },
                ],
            }
        ],
        "generation_config": video_extraction_generation_config,
    }

    video_extraction_curl_request = json.dumps(video_extraction_curl_request_dict)

    return video_extraction_curl_request

### Create Gemini API CURL request for each YT video

In [45]:
yt_data_api_channel_results_df["request"] = yt_data_api_channel_results_df.apply(
    lambda row: get_video_extraction_curl_request_for_yt_video_link(row["videoURL"]),
    axis=1,
)

display(yt_data_api_channel_results_df.head())

Unnamed: 0,videoId,videoURL,videoTitle,videoDescription,channelId,channelTitle,publishedAt,thumbnailURL,request
0,OnMX4fIgSno,https://www.youtube.com/watch?v=OnMX4fIgSno,Trailer Park Boys Podcast Episode 55 - Ricky I...,Another vintage Trailer Park Boys Podcast - wi...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2024-07-04T17:04:40Z,https://i.ytimg.com/vi/OnMX4fIgSno/default.jpg,"{""system_instruction"": {""parts"": [{""text"": ""Yo..."
1,PhIzGATStls,https://www.youtube.com/watch?v=PhIzGATStls,Bubbles And The Shitrockers - I Only Got Eyes ...,Country music just got more DECENT!! Bubbles a...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2024-09-27T16:00:33Z,https://i.ytimg.com/vi/PhIzGATStls/default.jpg,"{""system_instruction"": {""parts"": [{""text"": ""Yo..."
2,mK5oSQObfSg,https://www.youtube.com/watch?v=mK5oSQObfSg,Standing On The Shoulders Of Kitties - Now ava...,Standing On The Shoulders Of Kitties now avail...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2024-10-17T15:49:31Z,https://i.ytimg.com/vi/mK5oSQObfSg/default.jpg,"{""system_instruction"": {""parts"": [{""text"": ""Yo..."
3,VuAICCz2iBU,https://www.youtube.com/watch?v=VuAICCz2iBU,Trailer Park Boys Chips at Giant Tiger!,The Boys got their faces - and chips - on the ...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2024-06-14T14:11:36Z,https://i.ytimg.com/vi/VuAICCz2iBU/default.jpg,"{""system_instruction"": {""parts"": [{""text"": ""Yo..."
4,o8iYmeqXU20,https://www.youtube.com/watch?v=o8iYmeqXU20,Gettin&#39; Cooked With Ricky - Sneak Preview!,Where the best setting is BAKED... Gettin' Coo...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-03-18T16:00:06Z,https://i.ytimg.com/vi/o8iYmeqXU20/default.jpg,"{""system_instruction"": {""parts"": [{""text"": ""Yo..."


In [91]:
pprint(yt_data_api_channel_results_df['request'].iloc[0])
# yt_data_api_channel_results_df['request'].iloc[0]

('{"system_instruction": {"parts": [{"text": "You are a video analyst that '
 'carefully looks \\n    through all frames of provided videos, extracting out '
 'the pieces necessary to respond to\\n    user prompts. Make sure to look '
 'through and listen to the whole video, start to finish.\\n    Only reference '
 'information in the video itself in your response."}]}, "contents": [{"role": '
 '"user", "parts": [{"text": "Provide a 2-3 sentence summary of the key themes '
 'from this video,\\n    and also provide a list of each character, celebrity, '
 'and brand that is referenced or\\n    shown. Refer to people by first name '
 '(e.g., \\"Ricky\\"  instead of \\"Ricky LaFleur\\") unless their last name '
 'is provided. \\n    If a character or celebrity are referred to by first and '
 'last name, please use both of these.\\n    Make sure to count only those '
 'involved in the actual video, and output only 1 entity per row."}, '
 '{"file_data": {"mimeType": "video/*", "fileUri": '
 

### Load API responses to BigQuery

> Output table with YouTube API results and corresponding Gemini requests to BigQuery

In [57]:
yt_api_results_with_bp_requests_table_load_job = BQ_CLIENT.load_table_from_dataframe(
    yt_data_api_channel_results_df,
    f"{BQ_DATASET}.{BATCH_PREDICTION_REQUESTS_TABLE}",
    job_config=bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE"),
)

# Wait for the load job to complete
yt_api_results_with_bp_requests_table_load_job.result()



LoadJob<project=hybrid-vertex, location=us-central1, id=bd2de606-d60e-4733-bc93-27fab9afad6e>

## Submit batch prediction job to analyze multiple YouTube videos at once

You create a batch prediction job using the `BatchPredictionJob.submit()` method ([src](https://github.com/googleapis/python-aiplatform/blob/main/google/cloud/aiplatform/jobs.py#L321)), and specifying the `source model ID`, `input source`, and `output location` - either Cloud Storage or BigQuery. To learn more, see the [batch prediction API page](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/batch-prediction-api).

Below, we'll use the BigQuery table with requests created in the previous section as input, and output results to another BigQuery table for further analysis.

In [58]:
# BQ URI of input table in form bq://PROJECT_ID.DATASET.TABLE
# or Cloud Storage bucket URI
INPUT_URI = f"bq://{PROJECT_ID}.{BQ_DATASET}.{BATCH_PREDICTION_REQUESTS_TABLE}"

# BQ URI of target output table in form bq://PROJECT_ID.DATASET.TABLE
# If the table doesn't already exist, then it is created for you
OUTPUT_URI = f"bq://{PROJECT_ID}.{BQ_DATASET}.{BATCH_PREDICTION_RESULTS_TABLE}"

# Pick which Gemini model to use here (default Flash)
MODEL_ID = GEMINI_FLASH_MODEL_ID  # ['GEMINI_FLASH_MODEL_ID', 'GEMINI_PRO_MODEL_ID']

print(f"INPUT_URI: {INPUT_URI}")
print(f"OUTPUT_URI: {OUTPUT_URI}")
print(f"MODEL_ID: {MODEL_ID}")

INPUT_URI: bq://hybrid-vertex.youtube_video_analysis.video_analysis_batch_requests
OUTPUT_URI: bq://hybrid-vertex.youtube_video_analysis.video_analysis_batch_results
MODEL_ID: gemini-1.5-flash-002


### Submit batch prediction request using Vertex AI SDK

If the batch prediction job goes through, the output above should contain a link you can use to monitor the job in the [Vertex AI Batch predictions page](https://console.cloud.google.com/vertex-ai/batch-predictions) in the Google Cloud console.

In [59]:
# Submit batch prediction request using Vertex AI SDK
batch_prediction_job = BatchPredictionJob.submit(
    source_model=MODEL_ID, 
    input_dataset=INPUT_URI, 
    output_uri_prefix=OUTPUT_URI
)

BatchPredictionJob created. Resource name: projects/934903580331/locations/us-central1/batchPredictionJobs/3824643792695197696
To use this BatchPredictionJob in another session:
job = batch_prediction.BatchPredictionJob('projects/934903580331/locations/us-central1/batchPredictionJobs/3824643792695197696')
View Batch Prediction Job:
https://console.cloud.google.com/ai/platform/locations/us-central1/batch-predictions/3824643792695197696?project=934903580331


You can also print out the job status and other properties:

In [69]:
print(f"Job resource name: {batch_prediction_job.resource_name}")
print(f"Model resource name: {batch_prediction_job.model_name}")
print(f"Job state: {batch_prediction_job.state.name}")

Job resource name: projects/934903580331/locations/us-central1/batchPredictionJobs/3824643792695197696
Model resource name: publishers/google/models/gemini-1.5-flash-002
Job state: JOB_STATE_RUNNING


### check job status

In [70]:
# Refresh batch prediction job until complete
while not batch_prediction_job.has_ended:
    time.sleep(5)
    batch_prediction_job.refresh()

# Check if the job succeeds
if batch_prediction_job.has_succeeded:
    print("Job succeeded!")
else:
    print(f"Job failed: {batch_prediction_job.error}")

Job succeeded!


## Check sample of results in BigQuery

> Once the batch prediction job has finished successfully, we can run the following cell to check a sample of our results

In [71]:
# Pick sampling % and # of results for check of BQ results table - can leave
# 100% & total # of results for big tables, likely sample down for larger ones

sampling_percentage = 100

num_results = 50

batch_prediction_results_sample_query = f"""
    SELECT * 
    FROM `{BQ_DATASET}.{BATCH_PREDICTION_RESULTS_TABLE}`
    TABLESAMPLE SYSTEM ({sampling_percentage} PERCENT)
    LIMIT {num_results}
    """

bq_results_table = get_bq_query_results_as_df(batch_prediction_results_sample_query)

display(Markdown("Batch Prediction BigQuery Results Table"))

display(bq_results_table.head())



Batch Prediction BigQuery Results Table

Unnamed: 0,videoId,videoURL,videoTitle,videoDescription,channelId,channelTitle,publishedAt,thumbnailURL,status,processed_time,request,response
0,o8iYmeqXU20,https://www.youtube.com/watch?v=o8iYmeqXU20,Gettin&#39; Cooked With Ricky - Sneak Preview!,Where the best setting is BAKED... Gettin' Coo...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-03-18T16:00:06Z,https://i.ytimg.com/vi/o8iYmeqXU20/default.jpg,,2025-03-26 12:42:34.528000+00:00,"{""contents"":[{""parts"":[{""text"":""Provide a 2-3 ...","{""candidates"":[{""avgLogprobs"":-0.0648541268848..."
1,39f-BtsFffc,https://www.youtube.com/watch?v=39f-BtsFffc,Bubbles&#39; Lost Pet Appeal - Who&#39;s Getti...,Bubbles needs your help! Send a photo or video...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2024-10-25T13:00:04Z,https://i.ytimg.com/vi/39f-BtsFffc/default.jpg,,2025-03-26 12:42:38.573000+00:00,"{""contents"":[{""parts"":[{""text"":""Provide a 2-3 ...","{""candidates"":[{""avgLogprobs"":-0.0811032310861..."
2,RXr4SAfR8pQ,https://www.youtube.com/watch?v=RXr4SAfR8pQ,Bubbles has a message for ya! #bubblesandthesh...,'I Only Got Eyes For You' now streaming on you...,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2024-10-03T17:00:32Z,https://i.ytimg.com/vi/RXr4SAfR8pQ/default.jpg,,2025-03-26 12:42:38.495000+00:00,"{""contents"":[{""parts"":[{""text"":""Provide a 2-3 ...","{""candidates"":[{""avgLogprobs"":-0.0277483045718..."
3,6NQCPCsO4g0,https://www.youtube.com/watch?v=6NQCPCsO4g0,Watch a Special Park After Dark on Christmas Day!,"Unwrap your presents, get your morning liquor ...",UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2024-12-24T18:45:00Z,https://i.ytimg.com/vi/6NQCPCsO4g0/default.jpg,,2025-03-26 12:42:34.486000+00:00,"{""contents"":[{""parts"":[{""text"":""Provide a 2-3 ...","{""candidates"":[{""avgLogprobs"":-0.0996898842728..."
4,XpqlE5BDFUU,https://www.youtube.com/watch?v=XpqlE5BDFUU,Grocery Gank Gunfight #trailerparkboys #jroc,Just another day in Sunnyvale Trailer Park!,UCvW9uSNy6Lytcnib1CdXrow,Trailer Park Boys,2025-01-30T16:00:38Z,https://i.ytimg.com/vi/XpqlE5BDFUU/default.jpg,,2025-03-26 12:42:36.252000+00:00,"{""contents"":[{""parts"":[{""text"":""Provide a 2-3 ...","{""candidates"":[{""avgLogprobs"":-0.1039502750743..."


## Further analysis of Gemini video extraction results

With our results from Gemini video extraction in BigQuery, we can pull out various pieces that might interest us. It's possible to do this further analysis in Python or directly in BigQuery - we'll choose the latter here since the results are already there, and [BigQuery's native JSON functionality](https://cloud.google.com/bigquery/docs/reference/standard-sql/json_functions) provides convenient ways to pull out relevant outputs at scale.

### Extract summaries for each YouTube video

In [72]:

# Query to extract summary for each video from JSON Gemini API response
video_summaries_query = f"""
    SELECT
      videoUrl AS url,
      videoTitle AS title,
      videoDescription AS description,

      JSON_EXTRACT_SCALAR(
        JSON_EXTRACT_ARRAY(
          JSON_VALUE(response, '$.candidates[0].content.parts[0].text')
          )[OFFSET(0)],
        '$.summary'
        ) AS geminiSummary,

      publishedAt

    FROM
      `{BQ_DATASET}.{BATCH_PREDICTION_RESULTS_TABLE}`

    ORDER BY
      publishedAt DESC
    """

video_summaries = get_bq_query_results_as_df(video_summaries_query)

# Change column width to be able to read all summary text for each row
pd.set_option("display.max_colwidth", 500)

# Display results
display(Markdown("Batch YouTube Video Analysis Summary Results"))

display(video_summaries.head())



Batch YouTube Video Analysis Summary Results

Unnamed: 0,url,title,description,geminiSummary,publishedAt
0,https://www.youtube.com/watch?v=o8iYmeqXU20,Gettin&#39; Cooked With Ricky - Sneak Preview!,"Where the best setting is BAKED... Gettin' Cooked With Ricky launches on SwearNet Friday, March 28! #trailerparkboys ...","Ricky and Randy are co-hosting a cooking show called ""Gettin' Cooked with Ricky."" The show features the two of them making various dishes while intoxicated. The dishes include donair sauce, cheeseburger meatloaf, pickle pizza, cinnamon roll nachos, and chicken finger tacos.",2025-03-18T16:00:06Z
1,https://www.youtube.com/watch?v=zV1g0Ny7Gto,Get Ready For The Trades S2 - Todd&#39;s Recap,"You have one week to catch up #TheTrades ‍♂️ From the producers of #TrailerParkBoys, stream Season 1 of the Crave ...","This video is a recap of the last season of The Trades. Todd's sister joins the trades, and Todd tries to improve refinery productivity through automation. After a protest and a pigeon incident, Todd and Chelsea work together to meet with corporate.",2025-03-07T23:15:04Z
2,https://www.youtube.com/watch?v=e5QY44t4sxc,Scrump-dilly! #trailerparkboys #maplesyrup #canadalife,Bubbles spills his maple syrup production secrets... holy frig that looks decent.,Ricky from the Trailer Park Boys filters out bugs and other debris from a container of maple syrup. He expresses frustration with the process and the camera filming him.,2025-03-06T13:00:54Z
3,https://www.youtube.com/watch?v=Fbqk9xgOsug,Has the Green Bastard met his match?! #trailerparkboys #outofthepark #copenhagen,"Stream Out Of The Park, Trailer Park Boys Seasons 1-12 and more for just one friggin' payment of $19.99 at ...","Ricky and Brian are discussing a fight plan for the Green Bastard. The Green Bastard will fight Brian, who is a much larger opponent. The plan involves two uppercuts to the balls within 30 seconds.",2025-02-25T14:00:16Z
4,https://www.youtube.com/watch?v=utt8PCqnMnk,"Thank you, your Majesty 🚬 🤬 #trailerparkboys #canadalife",If I can't smoke and swear I'm f*****d!,This video is a clip from the Canadian television series Trailer Park Boys. Ricky is in court and is arguing with the judge about his right to smoke and swear. He is frustrated because he feels that he cannot properly defend himself without these things.,2025-02-20T14:15:06Z


In [84]:
pprint(video_summaries['geminiSummary'].iloc[0])

('Ricky and Randy are co-hosting a cooking show called "Gettin\' Cooked with '
 'Ricky."  The show features the two of them making various dishes while '
 'intoxicated.  The dishes include donair sauce, cheeseburger meatloaf, pickle '
 'pizza, cinnamon roll nachos, and chicken finger tacos.')


### Find most frequently appearing entities across videos

> In the final step of our process of going from unstructured videos to structured data results from analyzing all those videos, we'll use BigQuery to count up the number of references to each entity across videos, and return those that appear most frequently

In [85]:
# Query to extract entity references from Gemini results, count most frequently appearing
most_referenced_entities_query = f"""
    WITH
    ExtractedText AS
    (
      SELECT
        *,
        JSON_EXTRACT_ARRAY(JSON_VALUE(response, '$.candidates[0].content.parts[0].text'))[OFFSET(0)]
          AS extracted_text

      FROM
        `youtube_video_analysis.video_analysis_batch_results`
    ),

    ExtractedRows AS
    (
      SELECT
        ARRAY(
          SELECT AS STRUCT 
            JSON_EXTRACT_SCALAR(references, '$.entity_name') AS entity_name,
            JSON_EXTRACT_SCALAR(references, '$.entity_type') AS entity_type

          FROM 
            UNNEST(JSON_EXTRACT_ARRAY(extracted_text, '$.references')) AS references
          ) AS reference,

      FROM
        ExtractedText
    )

    SELECT
      References.entity_name AS name,
      LOWER(References.entity_type) AS type,
      COUNT(*) AS num_videos

    FROM
      ExtractedRows,
      UNNEST(ExtractedRows.reference) AS References

    GROUP BY
      entity_name, entity_type

    ORDER BY
      num_videos DESC,
      name
    """

most_referenced_entities = get_bq_query_results_as_df(most_referenced_entities_query)

# Display results
display(Markdown("Most Referenced Entities in Videos Analyzed"))

display(most_referenced_entities.head(25))
     



Most Referenced Entities in Videos Analyzed

Unnamed: 0,name,type,num_videos
0,Bubbles,character,24
1,Julian,character,19
2,Ricky,character,18
3,Ricky,celebrity,13
4,Randy,character,11
5,Bubbles,celebrity,10
6,Julian,celebrity,8
7,Swearnet,brand,7
8,Trailer Park Boys,brand,7
9,Billy Bob Thornton,celebrity,6
