In [57]:
# Install necessary libraries using pip.
# google-cloud-videointelligence: Library for interacting with Google Cloud's Video Intelligence API.
# This API is used to analyze videos and extract information like labels, objects, and text.
# requests: Library for making HTTP requests (used for communication with web services).
# This library is used to send data to and receive data from the Video Intelligence API.
!pip install google-cloud-videointelligence requests


# Import the os module for working with environment variables.
import os

# Set the GOOGLE_APPLICATION_CREDENTIALS environment variable.
# This variable points to the JSON file containing your Google Cloud credentials.
# These credentials are required for authenticating your requests to Google Cloud services.
# Replace '/content/munich-hackathon-2025-6e308fb8c00c.json' with the actual path to your credentials file.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '/content/munich-hackathon-2025-6e308fb8c00c.json'  # Replace with the actual path



# Label Detection with Google Video Intelligence API and Confidence by Time Segment


In [58]:
# Import the Google Video Intelligence API client
from google.cloud import videointelligence

# Create a Video Intelligence client to communicate with the API
video_client = videointelligence.VideoIntelligenceServiceClient()

# Define what we want the API to detect: LABEL_DETECTION (objects/concepts)
features = [videointelligence.Feature.LABEL_DETECTION]

# Start an asynchronous request to analyze the video for labels
operation = video_client.annotate_video(
    request={
        "features": features,
        # Video location in Google Cloud Storage (GCS)
        # Only one 'input_uri' should be active at a time
        "input_uri": "gs://pjs-munich-datathon-bucket/input/dreamfall.mov",
    }
)

print("\nProcessing video for label annotations:")

# Wait (up to 180 seconds) for the video analysis to complete
result = operation.result(timeout=180)
print("\nFinished processing.")

# === Extract and Print Results ===

# Access the first annotation result (because only one video was analyzed)
segment_labels = result.annotation_results[0].segment_label_annotations

# Loop over all detected segment labels
for i, segment_label in enumerate(segment_labels):
    # Print the label's description (e.g., "dog", "concert", "Italy")
    print("Video label description: {}".format(segment_label.entity.description))

    # Print higher-level category descriptions if available (e.g., "animal", "event")
    for category_entity in segment_label.category_entities:
        print("\tLabel category description: {}".format(category_entity.description))

    # Loop through all time segments where this label was detected
    for i, segment in enumerate(segment_label.segments):
        # Calculate segment start and end times (in seconds)
        start_time = (
            segment.segment.start_time_offset.seconds
            + segment.segment.start_time_offset.microseconds / 1e6
        )
        end_time = (
            segment.segment.end_time_offset.seconds
            + segment.segment.end_time_offset.microseconds / 1e6
        )
        positions = "{}s to {}s".format(start_time, end_time)

        # Retrieve the confidence score for this segment detection
        confidence = segment.confidence

        # Print the time range and the confidence level
        print("\tSegment {}: {}".format(i, positions))
        print("\tConfidence: {}".format(confidence))
    print("\n")


Processing video for label annotations:

Finished processing.
Video label description: games
	Segment 0: 0.0s to 265.2s
	Confidence: 0.6059064865112305


Video label description: video game
	Segment 0: 0.0s to 265.2s
	Confidence: 0.48314234614372253




#  Label and Segment Extraction

In [59]:
# Import the Google Video Intelligence client library
from google.cloud import videointelligence

# Create a client to interact with the Video Intelligence API
client = videointelligence.VideoIntelligenceServiceClient()

# Define the feature we want to use: LABEL_DETECTION
features = [videointelligence.Feature.LABEL_DETECTION]

# URI of the video stored in Google Cloud Storage (GCS)
gcs_uri = "gs://pjs-munich-datathon-bucket/input/news.mp4"

# Start an asynchronous request to analyze the video
operation = client.annotate_video(
    features=features,
    input_uri=gcs_uri  # Provide the GCS URI of the video (instead of uploading raw content)
)

# Wait up to 180 seconds for the operation to complete and get the result
result = operation.result(timeout=180)



In [60]:
# Import the Google Video Intelligence client library
from google.cloud import videointelligence

# Create a client to interact with the Video Intelligence API
client = videointelligence.VideoIntelligenceServiceClient()

# Define the feature we want to use: LABEL_DETECTION
features = [videointelligence.Feature.LABEL_DETECTION]

# URI of the video stored in Google Cloud Storage (GCS)
gcs_uri = "gs://pjs-munich-datathon-bucket/input/news.mp4"

# Start an asynchronous request to analyze the video
operation = client.annotate_video(
    features=features,
    input_uri=gcs_uri  # Provide the GCS URI of the video (instead of uploading raw content)
)

# Wait up to 180 seconds for the operation to complete and get the result
result = operation.result(timeout=180)


In [61]:
# Prepare empty lists to store shot segments and entity descriptions
segments = []
entities = []
confidences = []

# Extract the annotation results (only one result usually, since one video is processed)
annotation_results = result.annotation_results

# Loop over each label annotation in the results
for label in annotation_results:
    # Go through each detected shot label
    for shot_label_annotation in label.shot_label_annotations:
        # Save all the time segments where the label was detected
        segments.append(shot_label_annotation.segments)

        # Save the description of the entity (the label name, e.g., "Italy", "Team sport")
        entities.append(shot_label_annotation.entity.description)

        # Access confidence from the segment instead of entity
        for segment in shot_label_annotation.segments:
            confidences.append(segment.confidence)

# Output the list of entity descriptions detected in the video
# Convert the zip object to a list and then slice it
for e, s, c in list(zip(entities, segments, confidences))[:10]:
    # Format the output for better readability
    print(f"Entity: {e}")
    print(f"Confidence: {c}")
    for segment in s:  # Iterate through the segments in 's'
        start_time = segment.segment.start_time_offset.seconds + segment.segment.start_time_offset.microseconds / 1e6
        end_time = segment.segment.end_time_offset.seconds + segment.segment.end_time_offset.microseconds / 1e6
        print(f"  - {start_time:.3f}s to {end_time:.3f}s")  # Format with 3 decimal places

    print("-" * 20)  # Add a separator between entries

Entity: traffic
Confidence: 0.8570924401283264
  - 253.100s to 258.560s
  - 262.260s to 263.100s
  - 263.920s to 264.300s
  - 264.720s to 265.480s
  - 270.780s to 273.800s
  - 275.580s to 279.720s
  - 279.740s to 284.000s
--------------------
Entity: eyewear
Confidence: 0.9705125093460083
  - 95.380s to 107.080s
  - 355.620s to 360.160s
--------------------
Entity: wall
Confidence: 0.9454449415206909
  - 113.980s to 116.200s
--------------------
Entity: structure
Confidence: 0.922359049320221
  - 113.980s to 116.200s
  - 546.180s to 548.240s
--------------------
Entity: sports
Confidence: 0.8570924401283264
  - 284.020s to 291.440s
  - 758.380s to 763.840s
  - 763.860s to 772.520s
  - 772.540s to 774.400s
  - 774.420s to 782.560s
  - 782.580s to 784.880s
  - 784.900s to 793.160s
  - 793.180s to 796.160s
  - 796.180s to 804.720s
  - 804.740s to 808.040s
  - 808.060s to 813.080s
  - 816.260s to 817.980s
  - 818.000s to 820.400s
--------------------
Entity: disaster
Confidence: 0.91349476