In [1]:
import os
import json
import random
import shutil
from collections import defaultdict

# === Sampling size ===
# Upper bound on how many videos are drawn from each category folder.
SAMPLES_PER_CATEGORY = 25

# === Target Categories (explicit)
# Sub-folder names under VIDEO_DIR that are sampled, one per line.
TARGET_CATEGORIES = [
    "Abuse",
    "Arrest",
    "Arson",
    "Assault",
    "Burglary",
    "Explosion",
    "Fighting",
    "RoadAccidents",
    "Robbery",
    "Shooting",
    "Shoplifting",
    "Stealing",
    "Vandalism",
    "z_Normal_Videos_event",
]

# === Paths ===
# Inputs: root folder holding one sub-folder per category, and the annotation JSON.
VIDEO_DIR = "/kaggle/input/ucaucf-crime-annotation-dataset/UCF_Crimes/UCF_Crimes/Videos"
ANNOTATION_JSON_PATH = "/kaggle/input/ucaucf-crime-annotation-dataset/UCFCrime_Train.json"

# Outputs: the copied videos and the filtered annotation file both live under OUTPUT_DIR.
OUTPUT_DIR = "/kaggle/working/sampled_ucf_videos"
OUTPUT_JSON_PATH = os.path.join(OUTPUT_DIR, "sampled_annotations.json")
FLAT_VIDEO_DIR = os.path.join(OUTPUT_DIR, "videos")

# === Load Annotations ===
# Parse the JSON file at ANNOTATION_JSON_PATH; the loop below looks videos up
# in it by file name without extension.
with open(ANNOTATION_JSON_PATH, "r") as annotation_file:
    annotations = json.load(annotation_file)

# === Prepare output folders ===
# makedirs also creates the parent OUTPUT_DIR; an existing folder is reused as-is.
os.makedirs(FLAT_VIDEO_DIR, exist_ok=True)

# === Initialize sampled dataset
# sampled_data:            video name (no extension) -> annotation entry
# category_sample_counts:  category name -> number of videos sampled from it
sampled_data = {}
category_sample_counts = {}

# === Process real categories
# For each target category: draw up to SAMPLES_PER_CATEGORY videos, copy them
# into the flat output folder, and collect their annotation entries.
#
# Reproducibility: a dedicated, seeded generator plus a sorted file listing
# make every "Restart & Run All" pick the same videos. Without this, each
# re-run would copy a different sample on top of the previous one and leave
# stale videos in FLAT_VIDEO_DIR that have no entry in the saved JSON.
SAMPLING_SEED = 42
NORMAL_CATEGORY = "z_Normal_Videos_event"
VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mkv')
sampling_rng = random.Random(SAMPLING_SEED)

for category in TARGET_CATEGORIES:
    category_path = os.path.join(VIDEO_DIR, category)
    if not os.path.isdir(category_path):
        print(f"❌ Skipping missing category folder: {category}")
        continue

    # Get video files. os.listdir() returns entries in arbitrary order, so the
    # list is sorted to make the seeded draw below deterministic.
    all_files = sorted(
        f for f in os.listdir(category_path)
        if f.endswith(VIDEO_EXTENSIONS)
    )

    if not all_files:
        print(f"⚠️ No video files found in: {category}")
        continue

    # Take the whole folder when it holds fewer videos than requested.
    sample_size = min(SAMPLES_PER_CATEGORY, len(all_files))
    sampled_files = sampling_rng.sample(all_files, sample_size)
    category_sample_counts[category] = sample_size

    print(f"📦 Sampling {sample_size} videos from: {category}")

    unannotated_count = 0
    for filename in sampled_files:
        src = os.path.join(category_path, filename)
        dst = os.path.join(FLAT_VIDEO_DIR, filename)
        shutil.copy2(src, dst)

        # Annotations are keyed by the file name without its extension.
        video_name = os.path.splitext(filename)[0]
        if video_name in annotations:
            sampled_data[video_name] = annotations[video_name]
            continue

        # Not annotated: insert a placeholder so every copied video still has
        # an entry. Only the normal category may be described as
        # "Normal activity"; labelling an unannotated crime video that way
        # would be a false caption, so those get an empty sentence list.
        if category == NORMAL_CATEGORY:
            placeholder_sentences = ["Normal activity"]
        else:
            placeholder_sentences = []
            unannotated_count += 1

        sampled_data[video_name] = {
            "duration": 0,
            "timestamps": [],
            "sentences": placeholder_sentences
        }

    # One warning per category instead of one per video keeps the output short.
    if unannotated_count:
        print(
            f"⚠️ {unannotated_count} sampled videos in {category} have no "
            f"annotation entry; stored with empty sentences"
        )

# === Save new annotations file
# Write the collected per-video entries as pretty-printed JSON.
with open(OUTPUT_JSON_PATH, "w") as annotation_out:
    json.dump(sampled_data, annotation_out, indent=4)

# === Summary
# Report the output locations, then the per-category counts recorded during sampling.
print("\n✅ Sampling complete!")
print(f"📂 All videos saved to: {FLAT_VIDEO_DIR}")
print(f"📝 Annotations saved to: {OUTPUT_JSON_PATH}")
print("\n📊 Category sampling summary:")
for category_name in category_sample_counts:
    videos_in_category = category_sample_counts[category_name]
    print(f" - {category_name}: {videos_in_category} videos")

# The total is the number of distinct video names stored in sampled_data.
total_sampled = len(sampled_data)
print(f"\n📦 Total videos sampled: {total_sampled}")


📦 Sampling 25 videos from: Abuse
📦 Sampling 25 videos from: Arrest
📦 Sampling 25 videos from: Arson
📦 Sampling 25 videos from: Assault
📦 Sampling 25 videos from: Burglary
📦 Sampling 25 videos from: Explosion
📦 Sampling 25 videos from: Fighting
📦 Sampling 25 videos from: RoadAccidents
📦 Sampling 25 videos from: Robbery
📦 Sampling 25 videos from: Shooting
📦 Sampling 25 videos from: Shoplifting
📦 Sampling 25 videos from: Stealing
📦 Sampling 25 videos from: Vandalism
📦 Sampling 25 videos from: z_Normal_Videos_event

✅ Sampling complete!
📂 All videos saved to: /kaggle/working/sampled_ucf_videos/videos
📝 Annotations saved to: /kaggle/working/sampled_ucf_videos/sampled_annotations.json

📊 Category sampling summary:
 - Abuse: 25 videos
 - Arrest: 25 videos
 - Arson: 25 videos
 - Assault: 25 videos
 - Burglary: 25 videos
 - Explosion: 25 videos
 - Fighting: 25 videos
 - RoadAccidents: 25 videos
 - Robbery: 25 videos
 - Shooting: 25 videos
 - Shoplifting: 25 videos
 - Stealing: 25 videos
 - Vand

In [2]:
import shutil

# Zip the contents of the sampled-videos folder. make_archive appends ".zip"
# to base_name, so the archive is written next to the folder rather than
# inside it. The call is the cell's last expression, so the archive path is
# displayed as the cell output.
shutil.make_archive(
    base_name='/kaggle/working/sampled_ucf_videos',
    format='zip',
    root_dir='/kaggle/working/sampled_ucf_videos',
)


'/kaggle/working/sampled_ucf_videos.zip'