In [1]:
import os
import json
import random
import shutil
from collections import defaultdict

# === Paths ===
# Inputs: the per-category video folders and the training annotation file.
VIDEO_DIR = "/kaggle/input/ucaucf-crime-annotation-dataset/UCF_Crimes/UCF_Crimes/Videos"
ANNOTATION_JSON_PATH = "/kaggle/input/ucaucf-crime-annotation-dataset/UCFCrime_Train.json"

# Outputs: one flat folder holding every sampled video, plus the annotation
# subset that goes with it, both under OUTPUT_DIR.
OUTPUT_DIR = "/kaggle/working/sampled_ucf_videos"
FLAT_VIDEO_DIR = os.path.join(OUTPUT_DIR, "videos")
OUTPUT_JSON_PATH = os.path.join(OUTPUT_DIR, "sampled_annotations.json")

# === Target Categories (explicit)
# Each name is joined onto VIDEO_DIR to locate that category's folder.
TARGET_CATEGORIES = [
    "Abuse",
    "Arrest",
    "Arson",
    "Assault",
    "Burglary",
    "Explosion",
    "Fighting",
    "RoadAccidents",
    "Robbery",
    "Shooting",
    "Shoplifting",
    "Stealing",
    "Vandalism",
    "z_Normal_Videos_event",
]

# Upper bound on how many videos are drawn from a single category.
SAMPLES_PER_CATEGORY = 32

# === Load Annotations ===
# Parse the whole annotation file up front; entries are looked up by video
# name (file name without extension) while sampling.
with open(ANNOTATION_JSON_PATH, "r") as annotation_file:
    annotations = json.load(annotation_file)

# === Prepare output folders ===
# Creates OUTPUT_DIR as well, since FLAT_VIDEO_DIR lives inside it.
os.makedirs(FLAT_VIDEO_DIR, exist_ok=True)

# === Initialize sampled dataset
# sampled_data:            video name -> annotation entry kept for the subset
# category_sample_counts:  category   -> number of videos drawn from it
sampled_data, category_sample_counts = {}, {}

# === Process real categories
# For every target category: list its video files, draw up to
# SAMPLES_PER_CATEGORY of them, copy the drawn files into FLAT_VIDEO_DIR and
# record an annotation entry for each one in sampled_data.
#
# A dedicated, seeded generator keeps the drawn subset identical across
# "Restart & Run All" without touching the global `random` state.
SAMPLING_SEED = 42
sampling_rng = random.Random(SAMPLING_SEED)

# Extensions are compared case-insensitively so files such as "clip.MP4" are
# not silently ignored.
VIDEO_EXTENSIONS = ('.mp4', '.avi', '.mkv')

for category in TARGET_CATEGORIES:
    category_path = os.path.join(VIDEO_DIR, category)
    if not os.path.isdir(category_path):
        print(f" Skipping missing category folder: {category}")
        continue

    # Get video files. os.listdir() returns entries in arbitrary order, so the
    # listing is sorted to give the seeded sampler a stable population.
    all_files = sorted(
        entry for entry in os.listdir(category_path)
        if entry.lower().endswith(VIDEO_EXTENSIONS)
    )

    if not all_files:
        print(f" No video files found in: {category}")
        continue

    # Never ask for more files than the folder actually contains.
    sample_size = min(SAMPLES_PER_CATEGORY, len(all_files))
    sampled_files = sampling_rng.sample(all_files, sample_size)
    category_sample_counts[category] = sample_size

    print(f" Sampling {sample_size} videos from: {category}")

    # Number of sampled videos in this category that had no annotation entry.
    placeholder_count = 0

    for filename in sampled_files:
        src = os.path.join(category_path, filename)
        dst = os.path.join(FLAT_VIDEO_DIR, filename)
        shutil.copy2(src, dst)

        # Annotation entries are keyed by the file name without its extension.
        video_name = os.path.splitext(filename)[0]
        if video_name in annotations:
            sampled_data[video_name] = annotations[video_name]
        else:
            # If not annotated (e.g., normal), insert placeholder
            sampled_data[video_name] = {
                "duration": 0,
                "timestamps": [],
                "sentences": ["Normal activity"]
            }
            placeholder_count += 1

    # The placeholder is labelled "Normal activity" whatever the category is,
    # so report how often it was used instead of inserting it silently.
    if placeholder_count:
        print(
            f" Note: {placeholder_count} of {sample_size} videos in {category} "
            f"had no annotation entry; placeholder inserted"
        )

# === Save new annotations file
# Persist only the entries that belong to the sampled videos.
with open(OUTPUT_JSON_PATH, "w") as annotation_out:
    json.dump(sampled_data, annotation_out, indent=4)

# === Summary
# Report where the outputs went, then the per-category and overall counts.
total_sampled = len(sampled_data)

print("\n Sampling complete!")
print("All videos saved to:", FLAT_VIDEO_DIR)
print("Annotations saved to:", OUTPUT_JSON_PATH)
print("\n Category sampling summary:")
for category_name, n_videos in category_sample_counts.items():
    print(f" - {category_name}: {n_videos} videos")
print(f"\n Total videos sampled: {total_sampled}")


 Sampling 32 videos from: Abuse
 Sampling 32 videos from: Arrest
 Sampling 32 videos from: Arson
 Sampling 32 videos from: Assault
 Sampling 32 videos from: Burglary
 Sampling 32 videos from: Explosion
 Sampling 32 videos from: Fighting
 Sampling 32 videos from: RoadAccidents
 Sampling 32 videos from: Robbery
 Sampling 32 videos from: Shooting
 Sampling 32 videos from: Shoplifting
 Sampling 32 videos from: Stealing
 Sampling 32 videos from: Vandalism
 Sampling 32 videos from: z_Normal_Videos_event

 Sampling complete!
All videos saved to: /kaggle/working/sampled_ucf_videos/videos
Annotations saved to: /kaggle/working/sampled_ucf_videos/sampled_annotations.json

 Category sampling summary:
 - Abuse: 32 videos
 - Arrest: 32 videos
 - Arson: 32 videos
 - Assault: 32 videos
 - Burglary: 32 videos
 - Explosion: 32 videos
 - Fighting: 32 videos
 - RoadAccidents: 32 videos
 - Robbery: 32 videos
 - Shooting: 32 videos
 - Shoplifting: 32 videos
 - Stealing: 32 videos
 - Vandalism: 32 videos
 - 

In [2]:
# import os
# import json
# import random
# import shutil
# from collections import defaultdict

# # === Paths ===
# VIDEO_DIR = "/kaggle/input/ucaucf-crime-annotation-dataset/UCF_Crimes/UCF_Crimes/Videos"
# ANNOTATION_JSON_PATH = "/kaggle/input/ucaucf-crime-annotation-dataset/UCFCrime_Train.json"
# OUTPUT_DIR = "/kaggle/working/sampled_ucf_videos"
# FLAT_VIDEO_DIR = os.path.join(OUTPUT_DIR, "videos")
# OUTPUT_JSON_PATH = os.path.join(OUTPUT_DIR, "sampled_annotations.json")

# # === Load Annotations ===
# with open(ANNOTATION_JSON_PATH, "r") as f:
#     annotations = json.load(f)

# # === Real categories based on folder names ===
# # Ignoring testing/normal/random categories
# ignored_folders = {
#     "Testing_Normal_Videos_Anomaly",
#     "Training_Normal_Videos_Anomaly",
#     "z_Normal_Videos_event"
# }

# all_folders = os.listdir(VIDEO_DIR)
# real_categories = sorted([f for f in all_folders if f not in ignored_folders and os.path.isdir(os.path.join(VIDEO_DIR, f))])

# # === Organize Videos by Category from annotations ===
# category_videos = defaultdict(list)

# for video_name in annotations:
#     for category in real_categories:
#         if video_name.startswith(category):
#             category_videos[category].append(video_name)
#             break

# # === Prepare output dirs ===
# os.makedirs(FLAT_VIDEO_DIR, exist_ok=True)

# # === Sampling ===
# sampled_data = {}
# SAMPLE_LIMIT = 10

# for category in real_categories:
#     videos = category_videos.get(category, [])
#     sample_size = min(SAMPLE_LIMIT, len(videos))

#     print(f"📦 Sampling {sample_size:2} from category: {category}")

#     sampled = random.sample(videos, sample_size)

#     for video_name in sampled:
#         video_found = False
#         for ext in ['.mp4', '.avi', '.mkv']:
#             video_file = video_name + ext
#             src_video_path = os.path.join(VIDEO_DIR, category, video_file)
#             dst_video_path = os.path.join(FLAT_VIDEO_DIR, video_file)
#             if os.path.exists(src_video_path):
#                 shutil.copy2(src_video_path, dst_video_path)
#                 sampled_data[video_name] = annotations[video_name]
#                 video_found = True
#                 break
#         if not video_found:
#             print(f"⚠️ File not found for: {video_name} in {category}")

# # === Save new annotations ===
# with open(OUTPUT_JSON_PATH, "w") as out_json:
#     json.dump(sampled_data, out_json, indent=4)

# print(f"\n✅ Sampling complete! Videos saved in: {FLAT_VIDEO_DIR}")
# print(f"✅ Sampled annotations saved to: {OUTPUT_JSON_PATH}")
