In [1]:
import os
# Base directory for every dataset path built in the cells below. os.getcwd()
# is evaluated once, when this cell runs, so the value reflects the directory
# the kernel process is in at that moment.
current_working_directory = os.getcwd()

# Hockey dataset:

In [None]:
import os
import shutil

def restructure_videos(input_path):
    """
    Sort the loose files in ``input_path`` into 'Fight' and 'NonFight' folders.

    A file whose lower-cased name starts with "fi" is moved into
    ``<input_path>/Fight``; one starting with "no" is moved into
    ``<input_path>/NonFight``. Directories are never touched, and any other
    file stays where it is (a "Skipping" line is printed for it).

    Parameters:
        input_path (str): Directory that holds the unsorted video files.
    """
    fight_dir, nonfight_dir = (
        os.path.join(input_path, folder) for folder in ('Fight', 'NonFight')
    )

    # Both destinations must exist before anything is moved into them.
    for destination in (fight_dir, nonfight_dir):
        if not os.path.exists(destination):
            os.makedirs(destination)

    for file_name in os.listdir(input_path):
        file_path = os.path.join(input_path, file_name)

        # Directories (including the two destinations created above) are left alone.
        if os.path.isdir(file_path):
            continue

        # Matching is case-insensitive, so "FI1.avi" is routed like "fi1.avi".
        lowered_name = file_name.lower()

        if lowered_name.startswith('fi'):
            shutil.move(file_path, os.path.join(fight_dir, file_name))
            print(f"Moved {file_name} to 'fight' directory")
            continue

        if lowered_name.startswith('no'):
            shutil.move(file_path, os.path.join(nonfight_dir, file_name))
            print(f"Moved {file_name} to 'nonfight' directory")
            continue

        print(f"Skipping {file_name} (does not match 'fi' or 'no')")

# Example usage
# input_path = os.path.join(current_working_directory, "archive", "hockey")
# restructure_videos(input_path)


# Airtby dataset

In [None]:
import os
import shutil
import uuid

def restructure_dataset(input_dir, output_dir):
    """
    Flatten a "violent"/"non-violent" dataset into "Fight"/"NonFight" folders.

    Every regular file located exactly one level below a category folder
    (for example ``violent/cam1/clip.mp4``) is copied into the matching
    output folder. The copy gets a freshly generated UUID4 as its name and
    keeps the original file extension. Source files are not modified.

    Parameters:
        input_dir (str): Path to the input dataset directory.
        output_dir (str): Path to the output dataset directory.
    """
    # Source category name -> destination folder inside output_dir.
    destinations = {
        "violent": os.path.join(output_dir, "Fight"),
        "non-violent": os.path.join(output_dir, "NonFight"),
    }

    os.makedirs(output_dir, exist_ok=True)
    for destination in destinations.values():
        os.makedirs(destination, exist_ok=True)

    for category, target_dir in destinations.items():
        category_path = os.path.join(input_dir, category)

        # A missing category is reported and skipped rather than treated as fatal.
        if not os.path.exists(category_path):
            print(f"Category directory not found: {category_path}")
            continue

        # One sub-folder per recording source (e.g., "cam1", "cam2").
        for group_name in os.listdir(category_path):
            group_path = os.path.join(category_path, group_name)
            if not os.path.isdir(group_path):
                continue

            for video_name in os.listdir(group_path):
                video_path = os.path.join(group_path, video_name)
                if not os.path.isfile(video_path):
                    continue

                # Files from different sub-folders may share a name, so each copy
                # is stored under a random UUID plus the original extension.
                extension = os.path.splitext(video_name)[1]
                new_name = f"{uuid.uuid4()}{extension}"
                shutil.copy(video_path, os.path.join(target_dir, new_name))

    print(f"Dataset restructured successfully to: {output_dir}")


# # Example usage
# input_dataset_path = os.path.join(current_working_directory, "archive", "violence-detection-dataset")
# output_dataset_path = os.path.join(current_working_directory, "archive", "airtby")
# restructure_dataset(input_dataset_path, output_dataset_path)


# Split into train and val sets

## raw

In [None]:
import os
import shutil
import random

def split_dataset(input_dir, output_dir, train_ratio=0.8, seed=None):
    """
    Split dataset into training and validation sets.

    For each label ("Fight", "NonFight") the regular files directly inside
    ``input_dir/<label>`` are shuffled and copied to
    ``output_dir/train/<label>`` or ``output_dir/val/<label>``. Source files
    are left in place.

    Copies accumulate: nothing already present in ``output_dir`` is removed.
    Running the split twice into the same ``output_dir`` with different
    shuffles can therefore put the same video in both train and val. Pass a
    fixed ``seed`` to make repeated runs produce the same split.

    Parameters:
        input_dir (str): Path to the input dataset directory (with "Fight" and "NonFight").
        output_dir (str): Path to the output dataset directory.
        train_ratio (float): Ratio of data to use for training (default is 80%).
        seed (int | None): Seed for a private random generator. With the
            default ``None`` the module-level ``random`` state is used, as before.

    Raises:
        ValueError: If ``train_ratio`` is not between 0 and 1 (inclusive).
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    # A private generator keeps a seeded split independent of (and harmless to)
    # the global random state; without a seed the global state is used unchanged.
    rng = random if seed is None else random.Random(seed)

    # Ensure the full train/val layout exists, even for labels with no input.
    train_dir = os.path.join(output_dir, "train")
    val_dir = os.path.join(output_dir, "val")

    for dir_path in [train_dir, val_dir]:
        os.makedirs(os.path.join(dir_path, "Fight"), exist_ok=True)
        os.makedirs(os.path.join(dir_path, "NonFight"), exist_ok=True)

    # Process each category (Fight and NonFight)
    for label in ["Fight", "NonFight"]:
        input_label_dir = os.path.join(input_dir, label)
        train_label_dir = os.path.join(train_dir, label)
        val_label_dir = os.path.join(val_dir, label)

        # A missing category is reported and skipped instead of aborting the split.
        if not os.path.isdir(input_label_dir):
            print(f"Category directory {input_label_dir} does not exist. Skipping.")
            continue

        # os.listdir returns entries in arbitrary order; sorting first makes the
        # shuffle depend only on the seed, not on the filesystem.
        videos = sorted(
            f for f in os.listdir(input_label_dir)
            if os.path.isfile(os.path.join(input_label_dir, f))
        )
        rng.shuffle(videos)

        # The first train_ratio share (rounded down) goes to train, the rest to val.
        split_index = int(len(videos) * train_ratio)
        train_videos = videos[:split_index]
        val_videos = videos[split_index:]

        # Copy files to the respective directories
        for video in train_videos:
            shutil.copy(os.path.join(input_label_dir, video), os.path.join(train_label_dir, video))

        for video in val_videos:
            shutil.copy(os.path.join(input_label_dir, video), os.path.join(val_label_dir, video))

    print(f"Dataset split successfully into training and validation sets at: {output_dir}")


# Example usage: split the "airtby" dataset in place. Input and output are the
# same folder, so "train" and "val" are created next to "Fight" and "NonFight".
input_dataset_path = output_dataset_path = os.path.join(
    current_working_directory, "archive", "airtby"
)
split_dataset(input_dataset_path, output_dataset_path)


## processed

In [None]:
import os
import shutil
import random
from tqdm import tqdm

def split_dataset_into_train_val(input_dir, output_dir, train_ratio=0.8, seed=None):
    """
    Split the dataset into train and val directories, maintaining structure.

    For each category ("Fight", "NonFight") every sub-directory of
    ``input_dir/<category>`` is copied, together with its contents, into
    ``output_dir/train/<category>`` or ``output_dir/val/<category>``.
    Categories missing from ``input_dir`` are reported and skipped.

    Copies accumulate: nothing already present in ``output_dir`` is removed.
    Running the split twice into the same ``output_dir`` with different
    shuffles can therefore put the same folder in both train and val. Pass a
    fixed ``seed`` to make repeated runs produce the same split.

    Args:
        input_dir (str): Path to the input dataset directory.
        output_dir (str): Path to the output directory for the split dataset.
        train_ratio (float): Proportion of data to use for training.
        seed (int | None): Seed for a private random generator. With the
            default ``None`` the module-level ``random`` state is used, as before.

    Raises:
        ValueError: If ``train_ratio`` is not between 0 and 1 (inclusive).
    """
    if not 0.0 <= train_ratio <= 1.0:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    # A private generator keeps a seeded split independent of (and harmless to)
    # the global random state; without a seed the global state is used unchanged.
    rng = random if seed is None else random.Random(seed)

    # Categories (Fight and NonFight)
    categories = ["Fight", "NonFight"]

    for category in categories:
        input_category_dir = os.path.join(input_dir, category)
        if not os.path.exists(input_category_dir):
            print(f"Category directory {input_category_dir} does not exist. Skipping.")
            continue

        # os.listdir returns entries in arbitrary order; sorting first makes the
        # shuffle depend only on the seed, not on the filesystem.
        video_folders = sorted(
            f for f in os.listdir(input_category_dir)
            if os.path.isdir(os.path.join(input_category_dir, f))
        )
        rng.shuffle(video_folders)

        # The first train_ratio share (rounded down) goes to train, the rest to val.
        split_index = int(len(video_folders) * train_ratio)
        train_folders = video_folders[:split_index]
        val_folders = video_folders[split_index:]

        # Copy folders to the new structure
        for split, folders in [("train", train_folders), ("val", val_folders)]:
            split_category_dir = os.path.join(output_dir, split, category)
            os.makedirs(split_category_dir, exist_ok=True)

            for folder in tqdm(folders, desc=f"Processing {category} for {split}"):
                src_folder = os.path.join(input_category_dir, folder)
                dest_folder = os.path.join(split_category_dir, folder)
                # dirs_exist_ok lets a re-run overwrite an earlier copy of the same folder.
                shutil.copytree(src_folder, dest_folder, dirs_exist_ok=True)

# Split the "Keypoints-hokey" dataset in place: input and output are the same
# folder, so "train" and "val" are created next to its category folders.
input_dataset_dir = output_dataset_dir = os.path.join(
    current_working_directory, "archive", "Keypoints-hokey"
)
split_dataset_into_train_val(input_dataset_dir, output_dataset_dir)


# Merge datasets

In [2]:
import os
import shutil
from tqdm import tqdm

def merge_datasets(dataset_paths, output_dir):
    """
    Combine several train/val datasets into one directory tree.

    For every split ("train", "val") and category ("Fight", "NonFight"),
    each sub-directory found under ``<dataset>/<split>/<category>`` is copied
    to ``output_dir/<split>/<category>``. If a folder of the same name is
    already present there, the copy is stored under the original name plus an
    underscore and 8 random hex characters. Entries that are not directories
    are reported and skipped; datasets lacking a split/category are ignored.

    Args:
        dataset_paths (list): List of dataset paths to merge.
        output_dir (str): Path to the output unified dataset.
    """
    split_category_pairs = [
        (split, category)
        for split in ("train", "val")
        for category in ("Fight", "NonFight")
    ]

    for split, category in split_category_pairs:
        # The destination is created even when no dataset contributes to it.
        unified_dir = os.path.join(output_dir, split, category)
        os.makedirs(unified_dir, exist_ok=True)

        for dataset_path in dataset_paths:
            source_dir = os.path.join(dataset_path, split, category)
            if not os.path.exists(source_dir):
                continue

            entries = tqdm(
                os.listdir(source_dir),
                desc=f"Merging {split}/{category} from {dataset_path}",
            )
            for video_folder in entries:
                source_folder_path = os.path.join(source_dir, video_folder)

                if not os.path.isdir(source_folder_path):
                    print(f"Skipping non-directory: {source_folder_path}")
                    continue

                dest_folder_path = os.path.join(unified_dir, video_folder)
                if os.path.exists(dest_folder_path):
                    # Name already taken (by another dataset or an earlier run):
                    # store this copy under a randomly suffixed name instead.
                    base_name = os.path.basename(video_folder)
                    dest_folder_path = os.path.join(
                        unified_dir, f"{base_name}_{os.urandom(4).hex()}"
                    )

                shutil.copytree(source_folder_path, dest_folder_path, dirs_exist_ok=True)

if __name__ == "__main__":
    # Keypoint datasets to combine; merge_datasets looks for
    # <dataset>/<split>/<category> folders inside each of them.
    dataset_paths = [
        os.path.join(current_working_directory, "archive", dataset_name)
        for dataset_name in (
            "Keypoints-airtby",
            "Keypoints-hokey",
            "Keypoints-rwf-2000",
        )
    ]
    # Destination of the unified dataset.
    output_dataset_dir = os.path.join(current_working_directory, "archive", "Keypoints-total")
    merge_datasets(dataset_paths, output_dataset_dir)


Merging train/Fight from c:\Users\gorme\projects\godseye\apps\backend\dataset_processing\archive\Keypoints-airtby: 100%|██████████| 184/184 [00:14<00:00, 12.82it/s]
Merging train/Fight from c:\Users\gorme\projects\godseye\apps\backend\dataset_processing\archive\Keypoints-hokey: 100%|██████████| 400/400 [00:17<00:00, 22.32it/s]
Merging train/Fight from c:\Users\gorme\projects\godseye\apps\backend\dataset_processing\archive\Keypoints-rwf-2000: 100%|██████████| 789/789 [00:46<00:00, 17.01it/s]
Merging train/NonFight from c:\Users\gorme\projects\godseye\apps\backend\dataset_processing\archive\Keypoints-airtby: 100%|██████████| 96/96 [00:06<00:00, 13.98it/s]
Merging train/NonFight from c:\Users\gorme\projects\godseye\apps\backend\dataset_processing\archive\Keypoints-hokey: 100%|██████████| 400/400 [00:18<00:00, 21.83it/s]
Merging train/NonFight from c:\Users\gorme\projects\godseye\apps\backend\dataset_processing\archive\Keypoints-rwf-2000: 100%|██████████| 802/802 [00:52<00:00, 15.15it/s]
M