In [1]:
!apt-get -qq update
!apt-get -qq install -y ffmpeg

!pip install --quiet torch
!pip install --quiet torchvision
!pip install --quiet transformers
!pip install --quiet accelerate
!pip install --quiet bitsandbytes
!pip install --quiet opencv-python
!pip install --quiet pandas
!pip install --quiet matplotlib

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m117.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m95.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m60.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.9 MB

In [3]:
# Downloading UMN video & splitting into 4 small clips
!wget -q http://mha.cs.umn.edu/Movies/Crowd-Activity-All.avi -O full_crowd.avi
!mkdir -p data/videos

# two calm video segments
!ffmpeg -y -i full_crowd.avi -ss 0    -t 15 -c copy data/videos/calm_1.avi
!ffmpeg -y -i full_crowd.avi -ss 15   -t 15 -c copy data/videos/calm_2.avi

# two panic video segments
!ffmpeg -y -i full_crowd.avi -ss 60   -t 15 -c copy data/videos/panic_1.avi
!ffmpeg -y -i full_crowd.avi -ss 75   -t 15 -c copy data/videos/panic_2.avi

ffmpeg version 4.4.2-0ubuntu0.22.04.1 Copyright (c) 2000-2021 the FFmpeg developers
  built with gcc 11 (Ubuntu 11.2.0-19ubuntu1)
  configuration: --prefix=/usr --extra-version=0ubuntu0.22.04.1 --toolchain=hardened --libdir=/usr/lib/x86_64-linux-gnu --incdir=/usr/include/x86_64-linux-gnu --arch=amd64 --enable-gpl --disable-stripping --enable-gnutls --enable-ladspa --enable-libaom --enable-libass --enable-libbluray --enable-libbs2b --enable-libcaca --enable-libcdio --enable-libcodec2 --enable-libdav1d --enable-libflite --enable-libfontconfig --enable-libfreetype --enable-libfribidi --enable-libgme --enable-libgsm --enable-libjack --enable-libmp3lame --enable-libmysofa --enable-libopenjpeg --enable-libopenmpt --enable-libopus --enable-libpulse --enable-librabbitmq --enable-librubberband --enable-libshine --enable-libsnappy --enable-libsoxr --enable-libspeex --enable-libsrt --enable-libssh --enable-libtheora --enable-libtwolame --enable-libvidstab --enable-libvorbis --enable-libvpx --enab

In [4]:
# Prepare the folder for export
import os

# Every entry in data/videos gets its own frames/ and captions/ sub-folder,
# named after the entry with its extension stripped.
stems = [os.path.splitext(entry)[0] for entry in os.listdir("data/videos")]
for stem in stems:
    for clip_dir in (f"frames/{stem}", f"captions/{stem}"):
        os.makedirs(clip_dir, exist_ok=True)

# One score folder per LLM, plus a shared folder for the figures.
for out_dir in [f"scores/{model}" for model in ("flan", "dolly")] + ["plots"]:
    os.makedirs(out_dir, exist_ok=True)

In [5]:
# Extracting frames from all the clips
import glob
import os

import cv2

# Sorted so the clips are processed (and reported) in a stable order.
for clip_path in sorted(glob.glob("data/videos/*.avi")):
    clip_name = os.path.splitext(os.path.basename(clip_path))[0]

    # cv2.imwrite reports failure through its return value instead of raising,
    # so a missing target folder would otherwise go unnoticed.
    os.makedirs(f"frames/{clip_name}", exist_ok=True)

    cap = cv2.VideoCapture(clip_path)
    if not cap.isOpened():
        cap.release()
        print(f"⚠ {clip_name}: could not open {clip_path}, skipping")
        continue

    idx = 0
    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # No more frames (or a read error): stop decoding this clip.
                break
            out_path = f"frames/{clip_name}/{clip_name}_frame_{idx:04d}.jpg"
            if not cv2.imwrite(out_path, frame):
                raise RuntimeError(f"Failed to write frame: {out_path}")
            idx += 1
    finally:
        # Always free the decoder, even if writing a frame failed.
        cap.release()
    print(f"→ {clip_name}: {idx} frames extracted")

→ panic_1: 248 frames extracted
→ calm_1: 450 frames extracted
→ panic_2: 398 frames extracted
→ calm_2: 300 frames extracted


In [6]:
# Load captioner and LLMs
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# Device index handed to the pipelines: first CUDA device when available, else -1.
device = -1 if not torch.cuda.is_available() else 0

# Image-to-text captioner
captioner = pipeline(
    task="image-to-text",
    model="nlpconnect/vit-gpt2-image-captioning",
    device=device,
)

# FLAN-T5-small model (replies capped at 16 new tokens)
flan = pipeline(
    task="text2text-generation",
    model="google/flan-t5-small",
    max_new_tokens=16,
    device=device,
)

# Dolly-v2-3b model with 8-bit+offload
dolly_repo = "databricks/dolly-v2-3b"
bnb = BitsAndBytesConfig(
    llm_int8_enable_fp32_cpu_offload=True,
    llm_int8_threshold=6.0,
    load_in_8bit=True,
)
dolly_tok = AutoTokenizer.from_pretrained(dolly_repo, use_fast=True)
# device_map="auto" leaves weight placement to the library, so no explicit
# `device` is passed to the Dolly pipeline below.
dolly_mod = AutoModelForCausalLM.from_pretrained(
    dolly_repo,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=bnb,
)
# Greedy decoding (do_sample=False), replies capped at 16 new tokens.
dolly = pipeline(
    task="text-generation",
    tokenizer=dolly_tok,
    model=dolly_mod,
    do_sample=False,
    max_new_tokens=16,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/982M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda:0


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cuda:0


tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


In [7]:
# Generating captions for all the frames
import glob
import os

for clip_name in os.listdir("frames"):
    # Frames are visited in lexicographic (zero-padded index) order.
    frame_files = glob.glob(f"frames/{clip_name}/*.jpg")
    frame_files.sort()

    for img_path in frame_files:
        # The caption file shares the frame's stem: <stem>.jpg -> <stem>.txt
        base, _ext = os.path.splitext(os.path.basename(img_path))
        predictions = captioner(img_path)
        text = predictions[0]["generated_text"]
        with open(f"captions/{clip_name}/{base}.txt", "w") as f:
            f.write(text)

    print(f"→ {clip_name}: {len(frame_files)} captions generated")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.53.0. You should pass an instance of `Cache` instead, e.g. `past_key_values=DynamicCache.from_legacy_cache(past_key_values)`.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


→ calm_1: 450 captions generated
→ calm_2: 300 captions generated
→ panic_2: 398 captions generated
→ panic_1: 248 captions generated


In [8]:
# Scoring every 5th frame
import glob
import os
import re

import pandas as pd

step = 5
models = [("flan", flan), ("dolly", dolly)]

# A standalone integer or decimal number: not glued to letters ("v2", "3b")
# and not part of a longer dotted token ("1.2.3"). Trailing punctuation such
# as the final period in "0.5." is tolerated.
_NUMBER_RE = re.compile(r"(?<![\w.])\d+(?:\.\d+)?(?!\w|\.\d)")


def parse_score(reply, default=0.0):
    """Extract an anomaly score from a model reply.

    Commas are treated as separators and the last standalone number in
    `reply` is used. The value is clamped to [0.0, 1.0], the range the
    prompts ask for. Returns `default` when the reply contains no number.
    """
    numbers = _NUMBER_RE.findall(reply.replace(",", " "))
    if not numbers:
        return default
    return min(1.0, max(0.0, float(numbers[-1])))


for clip_name in sorted(os.listdir("captions")):
    clip_dir = f"captions/{clip_name}"
    if not os.path.isdir(clip_dir):
        continue

    # Read the sampled captions once; both models score the same texts.
    captions = []
    for txt_path in sorted(glob.glob(f"{clip_dir}/*.txt"))[::step]:
        with open(txt_path) as fh:
            captions.append((os.path.basename(txt_path), fh.read()))

    for model_name, pipe in models:
        recs = []
        for txt_file, txt in captions:
            if model_name == "flan":
                out = pipe(f"Rate anomaly 0.0–1.0: {txt}")[0]["generated_text"]
            else:
                prompt = (
                    "Rate anomaly from 0.0 (normal) to 1.0 (highly anomalous):\n\n"
                    f"{txt}\n\nAnswer with a single number."
                )
                # A text-generation pipeline returns prompt + completion by
                # default. The prompt itself contains "0.0" and "1.0", so
                # without return_full_text=False a reply with no number would
                # be scored from the prompt instead of falling back to the
                # default.
                out = pipe(prompt, return_full_text=False)[0]["generated_text"]

            frame_name = os.path.splitext(txt_file)[0] + ".jpg"
            recs.append((frame_name, parse_score(out)))

        df = pd.DataFrame(recs, columns=["frame", "score"])
        os.makedirs(f"scores/{model_name}", exist_ok=True)
        df.to_csv(f"scores/{model_name}/{clip_name}_scores.csv", index=False)
        print(f"✔ {model_name.upper()} → {clip_name}: {len(recs)} scores")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✔ FLAN → calm_1: 90 scores


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore

✔ DOLLY → calm_1: 90 scores


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✔ FLAN → calm_2: 60 scores


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore

✔ DOLLY → calm_2: 60 scores


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✔ FLAN → panic_2: 80 scores


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore

✔ DOLLY → panic_2: 80 scores


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


✔ FLAN → panic_1: 50 scores


The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The following generation flags are not valid and may be ignore

✔ DOLLY → panic_1: 50 scores


In [20]:
# ROC & PR curves with per-model labels
import glob
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

def label_from_name(fname):
    """Return 1 when `fname` contains "panic" (case-insensitive), otherwise 0."""
    return int("panic" in fname.lower())

# Build per-model score & label lists
# results[model] = (labels, scores): one entry per scored frame, pooled over
# every score CSV found for that model.
results = {}
for model in ("flan", "dolly"):
    y_true_all, y_score_all = [], []
    for csv_path in glob.glob(f"scores/{model}/*_scores.csv"):
        clip_df = pd.read_csv(csv_path)
        # All frames of a clip share the label derived from the CSV file name.
        clip_label = label_from_name(os.path.basename(csv_path))
        y_score_all += clip_df["score"].tolist()
        y_true_all += [clip_label] * len(clip_df)
    results[model] = (np.array(y_true_all), np.array(y_score_all))

# Plot ROC curves
# One curve per model on a shared axis, with the diagonal as the chance reference.
fig, ax = plt.subplots(figsize=(6, 5))
for name, (labels_arr, scores_arr) in results.items():
    false_pos, true_pos, _ = roc_curve(labels_arr, scores_arr)
    area = auc(false_pos, true_pos)
    ax.plot(false_pos, true_pos, label=f"{name.upper()} (AUC = {area:.2f})")
ax.plot([0, 1], [0, 1], "k--", label="Chance")
ax.set_xlabel("False Positive Rate")
ax.set_ylabel("True Positive Rate")
ax.set_title("ROC: Calm vs. Panic")
ax.legend(loc="lower right")
fig.tight_layout()
fig.savefig("plots/roc_curve.png")
# The figure is only written to disk; close it so it is not kept open.
plt.close(fig)

✅ Saved: plots/roc_curve.png, plots/pr_curve.png


In [23]:
# Plot Precision–Recall curves
# One curve per model, annotated with its average precision.
fig, ax = plt.subplots(figsize=(6, 5))
for name, (labels_arr, scores_arr) in results.items():
    prec, rec, _ = precision_recall_curve(labels_arr, scores_arr)
    avg_prec = average_precision_score(labels_arr, scores_arr)
    ax.plot(rec, prec, label=f"{name.upper()} (AP = {avg_prec:.2f})")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("PR Curve: Calm vs. Panic")
ax.legend(loc="lower left")
fig.tight_layout()
fig.savefig("plots/pr_curve.png")
# The figure is only written to disk; close it so it is not kept open.
plt.close(fig)

✅ Saved: plots/roc_curve.png, plots/pr_curve.png


In [21]:
# ── Cell 10: F1 Score vs. Decision Threshold ──
import glob, os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score

# Load all scores & labels
# all_labels[m] / all_scores[m]: one entry per scored frame, pooled over every
# score CSV of model m. A frame is labelled 1 when its CSV path contains
# "panic" (case-insensitive) and 0 otherwise.
models = ["flan", "dolly"]
all_labels, all_scores = {}, {}
for m in models:
    pooled_labels, pooled_scores = [], []
    for csv_path in glob.glob(f"scores/{m}/*_scores.csv"):
        clip_label = int("panic" in csv_path.lower())
        clip_df = pd.read_csv(csv_path)
        pooled_labels += [clip_label] * len(clip_df)
        pooled_scores += clip_df["score"].tolist()
    all_labels[m] = np.array(pooled_labels)
    all_scores[m] = np.array(pooled_scores)

# Evaluate F1 at thresholds from 0.0 to 1.0
# A frame is predicted anomalous when its score is >= the threshold.
thresholds = np.linspace(0, 1, 50)
fig, ax = plt.subplots(figsize=(6, 4))
for m in models:
    f1s = [
        f1_score(all_labels[m], (all_scores[m] >= cutoff).astype(int))
        for cutoff in thresholds
    ]
    ax.plot(thresholds, f1s, label=m.upper())
ax.set_xlabel("Anomaly Score Threshold")
ax.set_ylabel("F1 Score")
ax.set_title("F1 Score vs. Decision Threshold")
ax.legend()
fig.tight_layout()
fig.savefig("plots/f1_vs_threshold.png")
# The figure is only written to disk; close it so it is not kept open.
plt.close(fig)

✅ Saved plots/f1_vs_threshold.png


In [22]:
# ── Cell 11: Calibration (Reliability) Curve ──
import numpy as np
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve

models = ["flan", "dolly"]

# Reliability diagram: per-bin fraction of panic frames vs. mean predicted score.
fig, ax = plt.subplots(figsize=(6, 4))
for m in models:
    y_true = all_labels[m]
    # The scores are numbers parsed from free-form LLM text, so nothing
    # guarantees they lie in [0, 1]; calibration_curve rejects values outside
    # that range, hence the clip.
    y_prob = np.clip(all_scores[m], 0.0, 1.0)
    prob_true, prob_pred = calibration_curve(y_true, y_prob, n_bins=10)
    ax.plot(prob_pred, prob_true, marker="o", label=m.upper())
# Perfectly calibrated line
ax.plot([0, 1], [0, 1], "k--", label="Perfectly Calibrated")
ax.set_xlabel("Mean Predicted Anomaly Probability")
ax.set_ylabel("Fraction of True Positives")
ax.set_title("Calibration Curve")
ax.legend()
fig.tight_layout()
fig.savefig("plots/calibration_curve.png")
# The figure is only written to disk; close it so it is not kept open.
plt.close(fig)

✅ Saved plots/calibration_curve.png
