In [1]:
import os
import random
import json

In [2]:
# Directory containing the per-label keypoints JSON files (one "<label>.json" per label).
# Both paths are relative, so they resolve against the kernel's working directory.
KEYPOINTS_PATH = "../dataset/keypoints"
# Directory that receives the combined train/val/test JSON files.
OUTPUT_DIR = "../dataset/data"

In [3]:
def create_dir(directory_path):
    """
    Ensures that a directory exists.

    Missing parent directories are created as well; calling this on a
    directory that is already present does nothing.

    Args:
        directory_path (str): Path of the directory to create.
    """
    os.makedirs(name=directory_path, exist_ok=True)

In [4]:
def _is_zero_keypoint(kp):
    """Returns True when the keypoint's x, y and z coordinates are all zero."""
    return kp["x"] == 0.0 and kp["y"] == 0.0 and kp["z"] == 0.0


def load_keypoints_with_label(keypoints_path, label):
    """
    Loads keypoints for a given label, filters out invalid data, and adds the label to each entry.

    The file "<label>.json" inside ``keypoints_path`` is expected to hold a list
    of entries, each with an "image_name" and a "keypoints" mapping that has
    "body" and "hands" lists of {"x", "y", "z"} points. An entry is dropped when
    every body keypoint AND every hand keypoint is (0, 0, 0); because ``all()``
    of an empty sequence is True, an entry whose two lists are both empty is
    dropped as well.

    Args:
        keypoints_path (str): Path to the folder containing JSON files for keypoints.
        label (str): The label of the data (e.g., 'A', '1').

    Returns:
        list: Cleaned list of keypoints data with the label included. Empty when
            the label's JSON file does not exist (a message is printed).

    Raises:
        KeyError: If an entry lacks one of the expected keys.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    label_file = os.path.join(keypoints_path, f"{label}.json")

    if not os.path.exists(label_file):
        print(f"JSON file not found: {label_file}")
        return []

    # JSON is UTF-8 by specification; being explicit avoids depending on the
    # platform's default text encoding (e.g. cp1252 on Windows).
    with open(label_file, 'r', encoding='utf-8') as json_file:
        data = json.load(json_file)

    keypoints_data = []
    for entry in data:
        body_keypoints = entry["keypoints"]["body"]
        hand_keypoints = entry["keypoints"]["hands"]

        # Skip entries that carry no signal at all: every keypoint is zero.
        body_is_blank = all(_is_zero_keypoint(kp) for kp in body_keypoints)
        if body_is_blank and all(_is_zero_keypoint(kp) for kp in hand_keypoints):
            continue

        keypoints_data.append({
            "label": label,
            "image_name": entry["image_name"],
            "keypoints": {
                "body": body_keypoints,
                "hands": hand_keypoints
            }
        })

    return keypoints_data

In [5]:
def split_data(data, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1, seed=None):
    """
    Splits the data into train, validation, and test sets.

    The input list is left untouched: a shuffled copy is sliced instead, so the
    caller's ordering is preserved and re-running a cell on the same list does
    not compound shuffles.

    Args:
        data (list): List of keypoints data.
        train_ratio (float): Fraction of data for the train set.
        val_ratio (float): Fraction of data for the validation set.
        test_ratio (float): Fraction of data for the test set. Kept for
            interface symmetry; it is not used in the computation. The test
            set receives whatever remains after the train and validation
            slices, which also absorbs the items lost to integer truncation.
        seed (int | None): When given, the shuffle uses a dedicated
            ``random.Random(seed)`` so the split is reproducible. When None
            (default), the module-level ``random`` state is used.

    Returns:
        tuple: train, val, test lists of keypoints data.
    """
    shuffled = list(data)  # copy so the caller's list is not reordered
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)

    total = len(shuffled)
    train_end = int(train_ratio * total)
    val_end = train_end + int(val_ratio * total)

    train_data = shuffled[:train_end]
    val_data = shuffled[train_end:val_end]
    test_data = shuffled[val_end:]

    return train_data, val_data, test_data

In [6]:
def save_split_data(train_data, val_data, test_data, output_path):
    """
    Saves all split data (train, val, test) into single JSON files for each split.

    Writes "train_data.json", "val_data.json" and "test_data.json" into
    ``output_path``, creating that directory first if it does not exist.
    Existing files with those names are overwritten.

    Args:
        train_data (list): List of keypoints data for the training set.
        val_data (list): List of keypoints data for the validation set.
        test_data (list): List of keypoints data for the test set.
        output_path (str): Path to save the split JSON files.
    """
    all_splits = {
        "train": train_data,
        "val": val_data,
        "test": test_data
    }

    # An empty path means "current directory"; os.makedirs("") would raise.
    if output_path:
        os.makedirs(output_path, exist_ok=True)

    # `records` rather than `split_data`, so the loop variable does not shadow
    # the split_data() function defined in this notebook.
    for split_type, records in all_splits.items():
        json_path = os.path.join(output_path, f"{split_type}_data.json")
        with open(json_path, 'w', encoding='utf-8') as json_file:
            json.dump(records, json_file, indent=4)

In [7]:
def main():
    """
    Builds the combined train/val/test JSON files from the per-label keypoints.

    Each label is loaded and split on its own, and the per-label parts are then
    concatenated, so every label contributes to each split according to the
    ratios used by split_data. Labels that yield no data are skipped.
    """
    create_dir(OUTPUT_DIR)

    # Accumulators, kept in the same order split_data returns its parts.
    train_pool, val_pool, test_pool = [], [], []
    pools = (train_pool, val_pool, test_pool)

    # Labels to process: digits 1-9 followed by the letters A-Z.
    for label in "123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ":
        samples = load_keypoints_with_label(KEYPOINTS_PATH, label)
        if not samples:
            continue  # nothing loaded for this label

        for pool, part in zip(pools, split_data(samples)):
            pool.extend(part)

    save_split_data(train_pool, val_pool, test_pool, OUTPUT_DIR)

    print("Keypoints data split into train, validation, and test sets complete.")

In [8]:
# Entry point: runs the full load -> split -> save pipeline defined by main().
# The guard keeps main() from running if this code is imported as a module.
if __name__ == "__main__":
    main()

Keypoints data split into train, validation, and test sets complete.
