### Grab influx of Comments

In [None]:
import pandas as pd
import numpy as np
import re
from datetime import timedelta
# TODO: store the number of comments in each detected spike.
# TODO: store the text of the comments that fall inside each spike.


# --- Parameters ---
# Tunables for the spike detector; later statements in this cell read these names directly.
csv_file = "USCvsNois_video.csv"  # input CSV; must contain a "timestamp" column
window_minutes = 2  # length of each sliding window, and the divisor that turns a count into comments/minute
threshold_multiplier = 2.0  # a window is a spike when its rate exceeds this multiple of the average rate
slide_seconds = 30          # spacing between consecutive window start times, in seconds

# --- Load ---
# Read the whole CSV into memory; the "timestamp" column is parsed further down.
df = pd.read_csv(csv_file)

# --- Robust timestamp parsing (supports mm:ss, hh:mm:ss, fractional seconds, large mins/hours) ---
# The patterns are compiled once up front because the parser runs once per CSV row.
_WHITESPACE_RE = re.compile(r"\s+")
_AMPM_RE = re.compile(r"\b(am|pm)\b", re.IGNORECASE)
_HMS_RE = re.compile(r"(?P<h>\d+):(?P<m>\d{1,2}):(?P<s>\d{1,2}(?:\.\d+)?)")
_MS_RE = re.compile(r"(?P<m>\d+):(?P<s>\d{1,2}(?:\.\d+)?)")
_NUMBER_RE = re.compile(r"\d+(?:\.\d+)?")


def _parse_to_seconds(val):
    """Convert one raw timestamp value into a number of seconds.

    Formats are tried in this order:

    1. ``h:mm:ss`` with optional fractional seconds (hours may be any size).
    2. ``m:ss`` with optional fractional seconds (minutes may exceed 59).
       Every two-field value is read as minutes:seconds, never as
       hours:minutes.
    3. A plain non-negative number, taken as seconds.
    4. Anything else is handed to ``pd.to_datetime`` and its time-of-day is
       returned in seconds.

    Args:
        val: The raw cell value; it is converted with ``str()`` first, so
            numbers and missing values are accepted as well as strings.

    Returns:
        Seconds as an ``int`` or ``float``, or ``np.nan`` when the value is
        empty, is a NaN/NaT marker, or cannot be parsed by any rule above.
    """
    text = str(val).strip()
    if not text or text.lower() in {"nan", "nat"}:
        return np.nan

    # Drop all whitespace, then any standalone AM/PM token.
    # NOTE: once whitespace is gone, a marker glued to a digit ("1:30PM") is
    # no longer at a word boundary and is NOT stripped here; such values fall
    # through to the pandas fallback below.
    text = _WHITESPACE_RE.sub("", text)
    text = _AMPM_RE.sub("", text)

    # h:mm:ss(.frac)
    hms = _HMS_RE.fullmatch(text)
    if hms:
        return int(hms.group("h")) * 3600 + int(hms.group("m")) * 60 + float(hms.group("s"))

    # m:ss(.frac). This also consumes every "a:b" value, which is why there is
    # no separate hours:minutes rule: it could never be reached.
    ms = _MS_RE.fullmatch(text)
    if ms:
        return int(ms.group("m")) * 60 + float(ms.group("s"))

    # Plain number -> seconds
    if _NUMBER_RE.fullmatch(text):
        return float(text)

    # Fallback: let pandas parse it as a datetime and keep only the time-of-day.
    # Deliberately best-effort: any parsing failure means "unparseable".
    try:
        dt = pd.to_datetime(text, errors="raise")
        return (
            dt.hour * 3600
            + dt.minute * 60
            + dt.second
            + dt.microsecond / 1e6
        )
    except Exception:
        return np.nan

# Parse every raw timestamp into seconds (row by row; the parser is plain Python)
secs = pd.Series(
    [_parse_to_seconds(raw) for raw in df["timestamp"]],
    index=df.index,
    dtype="float64",
)
failed_mask = secs.isna()
parsed_fail = failed_mask.sum()
parsed_ok = (~failed_mask).sum()

# Report timestamps that could not be parsed (first 10 only)
if parsed_fail:
    print(f"Warning: {parsed_fail} timestamp(s) could not be parsed. Showing up to 10 examples:")
    bad_idx = secs.index[failed_mask.to_numpy()][:10]
    print(df.loc[bad_idx, "timestamp"].to_string(index=False))

# Replace the raw column with Timedeltas, then drop the failures and order by time
df["timestamp"] = pd.to_timedelta(secs, unit="s", errors="coerce")
df = df.dropna(subset=["timestamp"]).sort_values("timestamp")

# Stop here if no row survived parsing
if df.empty:
    raise ValueError("No valid timestamps left after parsing. Please inspect the CSV format.")

# --- Baseline engagement ---
# Duration runs from the first comment to the last one (max - min), not from zero
time_span = df["timestamp"].max() - df["timestamp"].min()
elapsed = time_span.total_seconds() / 60.0
if elapsed <= 0:
    raise ValueError("Non-positive elapsed time computed. Are all timestamps the same?")
avg_comments_per_minute = len(df) / elapsed
print(f"Parsed timestamps: {parsed_ok}, Dropped: {parsed_fail}")
print(f"Average comments per minute: {avg_comments_per_minute:.2f}")

# --- Sliding window spike detection (fast) ---
window = timedelta(minutes=window_minutes)
window_sec = window.total_seconds()

# Comment times as a plain array of seconds, used for binary search below
tsec = df["timestamp"].dt.total_seconds().to_numpy()

# Window start times in seconds, one every slide_seconds
start_min, end_max = tsec.min(), tsec.max()
if end_max - start_min >= window_sec:
    start_grid = np.arange(start_min, end_max - window_sec + 1, slide_seconds, dtype=float)
else:
    # The stream is shorter than a single window: evaluate just one window
    start_grid = np.array([start_min], dtype=float)

# Comments per window = (first index at/after the window end) - (first index
# at/after the window start), i.e. each window is the half-open [start, start + window)
counts = (
    np.searchsorted(tsec, start_grid + window_sec, side="left")
    - np.searchsorted(tsec, start_grid, side="left")
)
rates = counts / window_minutes

# A window is a spike when its comments/minute exceed the scaled baseline
threshold = threshold_multiplier * avg_comments_per_minute
spike_mask = rates > threshold

# Build segments: runs of spike windows that overlap or touch are fused into one
segments = []
open_segment = None  # [start, end] of the segment being grown, or None

for start_sec, flagged in zip(start_grid, spike_mask):
    if not flagged:
        # A quiet window closes whatever segment was in progress
        if open_segment is not None:
            segments.append(tuple(open_segment))
            open_segment = None
        continue

    win_start = pd.to_timedelta(start_sec, unit="s")
    win_end = win_start + window
    if open_segment is None:
        open_segment = [win_start, win_end]
    elif win_start <= open_segment[1]:
        # Overlaps or touches the running segment: push its end out
        open_segment[1] = max(open_segment[1], win_end)
    else:
        # Gap between spike windows: close the old segment and start a new one
        segments.append(tuple(open_segment))
        open_segment = [win_start, win_end]

# Emit the segment still open when the grid ran out
if open_segment is not None:
    segments.append(tuple(open_segment))

# --- Output ---
print("Highlight Segments:")
for segment in segments:
    print(segment)

In [None]:
from google import genai
from google.genai import types
import os
import pandas as pd
import time #time.sleep(60) between requests if you hit rate limits

def gen_caption(video_file_name="USC_NOIS_CLIPS/clip1.mp4", model="models/gemini-2.5-flash-lite", max_comments=2, comment="its dabover"):
    """Ask a Gemini model to commentate a football clip and read out a fan comment.

    The clip's bytes are embedded directly in the request as inline
    ``video/mp4`` data, followed by a text prompt that includes ``comment``.

    Args:
        video_file_name: Path to the video clip to send.
        model: Model identifier passed to ``generate_content``.
        max_comments: Currently unused; kept so existing callers that pass it
            keep working.
        comment: Fan comment text interpolated into the prompt.

    Returns:
        The response object from ``client.models.generate_content``, unmodified.

    Raises:
        OSError: If the clip cannot be opened or read.
    """
    # Only for videos of size <20Mb (author's note; the whole clip travels inline)
    # Read inside a context manager so the file handle is closed promptly.
    with open(video_file_name, 'rb') as video_file:
        video_bytes = video_file.read()  # the sports clip

    client = genai.Client() #set conda env variable GOOGLE_API_KEY to your API key
    response = client.models.generate_content(
        model=model,
        contents=types.Content(
            parts=[
                types.Part(
                    inline_data=types.Blob(data=video_bytes, mime_type='video/mp4')
                ),
                types.Part(text=f'Ignore all non sports knowledge Act like a really funny sports analyst. Be Concise. Begin by stating the game time and quarter then. Analyze this college football clip: Describe the key plays, player actions, and which team gained momentum. Note any turnovers or big gains. Finally announce this {comment} sent by anonomous fan in a humorous way.')
            ]
        )
    )
    return response

In [11]:
# Caption the default clip once and dump the raw SDK response object
sample_response = gen_caption(model="models/gemini-2.5-flash-lite", max_comments=2)
print(sample_response)

sdk_http_response=HttpResponse(
  headers=<dict len=11>
) candidates=[Candidate(
  content=Content(
    parts=[
      Part(
        text="""Alright, it's the second quarter with :49 seconds left on the clock, and we've got a classic Big Noon Saturday matchup brewing! USC's Trojans are trailing the Illinois Fighting Illini 14-7.

It looks like Illinois tried to punch it in from the one-yard line, but a flag comes out. It appears to be a holding penalty, which is going to push them back. That's a real bummer for the Illini, who were trying to extend their lead.

Then, we see a scramble for the ball, and it looks like USC might have recovered! It's chaos out there, folks! The USC players are fired up, and you can see them celebrating. This turnover has definitely shifted the momentum in favor of the Trojans. They've got a chance to tie this thing up before the half.

And that's a wrap on this clip, sent in by an anonymous fan. Looks like this one's a DABOVER!"""
      ),
    ],
    role='

In [None]:
# Caption every clip in the folder, pairing each clip with one fan comment.
clips_dir = 'USC_NOIS_CLIPS'
request_gap_seconds = 60  # pause between requests to avoid rate limits

# os.listdir() returns entries in arbitrary order, and comments are paired with
# clips by position, so sort the names to make that pairing reproducible.
# Sub-directories (e.g. notebook checkpoint folders) are skipped because
# gen_caption opens every path as a file.
video_paths = [
    f'{clips_dir}/{filename}'
    for filename in sorted(os.listdir(clips_dir))
    if os.path.isfile(os.path.join(clips_dir, filename))
]

# Fill in one comment per clip, in the same (sorted) order as video_paths.
comments = []

# Fail early with a clear message instead of an opaque pandas length error.
if len(comments) != len(video_paths):
    raise ValueError(
        f'Expected one comment per clip, got {len(comments)} comment(s) '
        f'for {len(video_paths)} clip(s).'
    )

df = pd.DataFrame(video_paths, columns=['video_path'])
df['comment'] = comments
df['response'] = None  # filled in as each caption comes back

for position, (index, row) in enumerate(df.iterrows(), start=1):
    video_path = row['video_path']
    comment = row['comment']
    print(f'Generating caption for {video_path} with comment {comment}')
    response = gen_caption(video_file_name=video_path, comment=comment)
    print(response)
    # Store and save BEFORE pausing, so an interrupt during the wait cannot
    # lose a caption that has already been generated.
    df.at[index, 'response'] = str(response)
    df.to_csv('video_comments_responses.csv', index=False) #save after each iteration
    # No pause is needed after the final clip.
    if position < len(df):
        time.sleep(request_gap_seconds)


