In [11]:
from google import genai
import pandas as pd
from pathlib import Path
from collections import OrderedDict
import re
import datetime
import time
import os

## Define parameters

### Define output path details

In [2]:
# Directory that receives the saved model responses and the metadata tables.
output_directory_path = Path('./output')
# Create it up front so that later writes do not fail on a fresh checkout;
# exist_ok=True keeps the cell safe to re-run.
output_directory_path.mkdir(parents=True, exist_ok=True)

In [3]:
# File extension (without the leading dot) appended to each saved model response.
output_extension = 'md'

### Define model list

In [4]:
# Gemini model identifiers to compare; the analysis loop runs every
# video/prompt pair against each entry, in this order.
models = [
    'gemini-2.5-pro-preview-05-06',
    'gemini-2.5-flash-preview-05-20',
    'gemini-2.0-flash',
    'gemini-2.0-flash-lite',
]

### Define prompts

In [5]:
# Text prompts keyed by identifier. The identifier is reused in the output file
# names and in the 'prompt' column of the metadata; the text is sent to the
# model verbatim (including the leading and trailing newlines).
prompts = {
    'lh_prompt': """
This is a video of a student working on an 8th grade math problem. (1) Create a table summarizing each move the student makes,
the skill needed to make that move, whether the student is successful, and what that tells us about possible misconceptions or
bugs the student reveals. (2) Summarize the student’s understanding and misconceptions and what they may need more help with,
if anything.
""",
    'tq_prompt': """
This video is a screen capture of a student taking a math lesson. Based on the student's interaction with this lesson,
what should the teacher do to help this student learn?

Consider things like:
* How is the student interacting with the lesson? Do they seem engaged? How do they react when they face difficulty?
* What concepts and procedures does the student appear to have mastered?
* What concepts and procedures does the student struggle with? What appears to be the nature of their misunderstanding?
* What appears to be their state of engagement and understanding by the end of this interaction?
* Given all of the above, what should the teacher do to help accelerate this student's learning (not just give them the answer)?

Please give your response in Markdown format.
"""
}

### List input videos

In [6]:
# Input videos keyed by identifier. 'path' is the local file that gets uploaded;
# 'duration_minutes' and 'size_mb' are hand-entered values that are stored in the
# metadata next to the corresponding values read back from the uploaded file.
video_info = {
    'lh_video': {
        'path': Path('../videos/Zearn Screen Recording 2025-04-25 at 7.55.09 AM.mp4'),
        'duration_minutes': 3.0 + 53/60,
        'size_mb': 23.3,
    },
    'tq_video':  {
        'path': Path('../videos/zearn_g8m2l4_brief_confusion.mp4'),
        'duration_minutes': 2.0 + 19/60,
        'size_mb': 5.1
    }
}

## Initialize Gemini client

In [7]:
# Read the API key from the environment so that it is never stored in the notebook.
api_key = os.getenv('GEMINI_API_KEY')
# Treat an unset and an empty variable the same way: stop here with a clear
# message instead of failing later inside the client with a less obvious error.
if not api_key:
    raise ValueError('Gemini API key not found in environment variables')

In [8]:
# Single client instance shared by the file uploads, the token counting and the
# content generation calls in the cells below.
client = genai.Client(api_key=api_key)

## Upload videos to Gemini

In [9]:
# Upload every input video and block until its reported state is ACTIVE.
# Polling stops with an error if processing fails or takes too long, so a
# broken upload cannot leave this cell spinning forever.
upload_poll_interval_seconds = 2
upload_timeout_seconds = 600

videos = {}
for video_identifier, video_metadata in video_info.items():
    uploaded_file = client.files.upload(file=video_metadata['path'])
    deadline = time.monotonic() + upload_timeout_seconds
    while True:
        # Re-fetch the file record to get its current processing state.
        uploaded_file = client.files.get(name=uploaded_file.name)
        # Use the enum member name so the printed text does not depend on how
        # the enum formats itself under the running Python version.
        state_name = uploaded_file.state.name
        print(f"{video_identifier}: {state_name}")
        if state_name == "ACTIVE":
            break
        if state_name == "FAILED":
            raise RuntimeError(
                f"Processing of '{video_identifier}' ({video_metadata['path']}) failed: "
                f"{getattr(uploaded_file, 'error', None)}"
            )
        if time.monotonic() >= deadline:
            raise TimeoutError(
                f"'{video_identifier}' was still {state_name} after "
                f"{upload_timeout_seconds} seconds"
            )
        time.sleep(upload_poll_interval_seconds)
    # Only files that reached ACTIVE are made available to the analysis loop.
    videos[video_identifier] = uploaded_file


lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: PROCESSING
lh_video: ACTIVE
tq_video: PROCESSING
tq_video: PROCESSING
tq_video: PROCESSING
tq_video: ACTIVE


## Analyze videos

In [10]:
# Run every (video, prompt, model) combination, write each response to its own
# file and collect one metadata record per run in `metadata_list`.
metadata_list = []
for video_identifier, video in videos.items():
    print(f"Video: {video_identifier}")
    local_video_info = video_info[video_identifier]
    # NOTE(review): 'videoDuration' is assumed to be a seconds string with a
    # one-character unit suffix (e.g. '233s') -- confirm against the API.
    # float() instead of int() so that a fractional duration does not raise.
    # Parsed once per video, before any request for that video is sent.
    video_duration_minutes_returned = (
        float(video.video_metadata['videoDuration'][:-1]) / 60.0
    )
    for prompt_identifier, prompt in prompts.items():
        print(f" Prompt: {prompt_identifier}")
        for model in models:
            print(f"  Model: {model}")
            # Count the tokens of the text and of the video separately so they
            # can be compared with the usage reported by the generation call.
            text_prompt_count_tokens_response = client.models.count_tokens(
                model=model,
                contents=prompt,
            )
            video_prompt_count_tokens_response = client.models.count_tokens(
                model=model,
                contents=video,
            )
            generate_content_response = client.models.generate_content(
                model=model,
                contents=[video, prompt],
            )

            # Save the response text; the file name encodes the combination.
            output_path = (
                output_directory_path /
                f"{video_identifier}_{prompt_identifier}_{model}.{output_extension}"
            )
            response_text = generate_content_response.text
            if response_text is None:
                # Keep the remaining runs going instead of failing in write().
                print("   Warning: response contained no text; writing an empty file")
                response_text = ''
            # Explicit UTF-8 so the result does not depend on the platform default.
            with open(output_path, 'w', encoding='utf-8') as fp:
                fp.write(response_text)

            usage_metadata = generate_content_response.usage_metadata
            metadata = OrderedDict([
                ('video', video_identifier),
                ('prompt', prompt_identifier),
                ('model', model),
                ('video_local_path', str(local_video_info['path'])),
                ('video_remote_path', video.name),
                ('video_duration_minutes', local_video_info['duration_minutes']),
                ('video_duration_minutes_returned', video_duration_minutes_returned),
                ('video_size_mb', local_video_info['size_mb']),
                ('video_size_mb_returned', video.size_bytes/1024**2),
                ('prompt_tokens_precalculated', text_prompt_count_tokens_response.total_tokens),
                ('video_tokens_precalculated', video_prompt_count_tokens_response.total_tokens),
                ('prompt_token_count', usage_metadata.prompt_token_count),
                ('thoughts_token_count', usage_metadata.thoughts_token_count),
                ('candidates_token_count', usage_metadata.candidates_token_count),
                ('total_token_count', usage_metadata.total_token_count),
            ])
            # Per-modality breakdown of the prompt tokens; modalities other than
            # text, video and audio are ignored, and a missing breakdown is
            # treated as empty.
            for prompt_tokens_detail in usage_metadata.prompt_tokens_details or []:
                if prompt_tokens_detail.modality == 'TEXT':
                    metadata['prompt_tokens_detail_text'] = prompt_tokens_detail.token_count
                elif prompt_tokens_detail.modality == 'VIDEO':
                    metadata['prompt_tokens_detail_video'] = prompt_tokens_detail.token_count
                elif prompt_tokens_detail.modality == 'AUDIO':
                    metadata['prompt_tokens_detail_audio'] = prompt_tokens_detail.token_count
            metadata_list.append(metadata)

Video: lh_video
 Prompt: lh_prompt
  Model: gemini-2.5-pro-preview-05-06
  Model: gemini-2.5-flash-preview-05-20
  Model: gemini-2.0-flash
  Model: gemini-2.0-flash-lite
 Prompt: tq_prompt
  Model: gemini-2.5-pro-preview-05-06
  Model: gemini-2.5-flash-preview-05-20
  Model: gemini-2.0-flash
  Model: gemini-2.0-flash-lite
Video: tq_video
 Prompt: lh_prompt
  Model: gemini-2.5-pro-preview-05-06
  Model: gemini-2.5-flash-preview-05-20
  Model: gemini-2.0-flash
  Model: gemini-2.0-flash-lite
 Prompt: tq_prompt
  Model: gemini-2.5-pro-preview-05-06
  Model: gemini-2.5-flash-preview-05-20
  Model: gemini-2.0-flash
  Model: gemini-2.0-flash-lite


## Save metadata

In [12]:
# One row per (video, prompt, model) run collected by the analysis loop.
metadata_table = pd.DataFrame(metadata_list)

In [14]:
# Current UTC time rendered as YYYYMMDD_HHMMSS; used in the metadata file names.
timestamp = f"{datetime.datetime.now(tz=datetime.timezone.utc):%Y%m%d_%H%M%S}"

In [15]:
# Pickle copy of the metadata table; the timestamp in the file name tells runs apart.
metadata_table.to_pickle(output_directory_path / f"gemini_analysis_metadata_{timestamp}.pkl")

In [16]:
# CSV copy of the same table under the same timestamped base name.
# No `index` argument is passed, so pandas' default index handling applies.
metadata_table.to_csv(output_directory_path / f"gemini_analysis_metadata_{timestamp}.csv")