In [15]:
import os
from mlu_tools.validation import validate_file
from sklearn.model_selection import train_test_split
import mlu_tools.utils as mlutils
import shutil

In [7]:
# Directory (relative to the notebook's working directory) that holds the collected data.
collected_data_dir = "collected_data"
# Bare expression: the cell displays whether that directory exists.
os.path.exists(collected_data_dir)

True

In [8]:
def get_filepath_list(directory, for_class, type="vid"):
    """Collect validated files from every folder named ``for_class`` below ``directory``.

    The tree under ``directory`` is walked recursively. Files are taken only
    from folders whose own name equals ``for_class``, whatever their depth or
    parent folders.

    Args:
        directory: Root directory to search.
        for_class: Folder name to match (compared as a string).
        type: File type forwarded to ``validate_file`` as ``valid_types=[type]``.
            The name shadows the builtin but is kept for caller compatibility.

    Returns:
        A lexicographically sorted list of paths for which ``validate_file``
        returned a truthy value. Empty when nothing matches.
    """
    matching_paths = []
    for root, _dirs, files in os.walk(directory):
        # Only folders whose own name is the class label contribute files.
        if os.path.basename(root) != for_class:
            continue
        for file in files:
            filepath = os.path.join(root, file)
            if validate_file(filepath, raise_exception=False, valid_types=[type]):
                matching_paths.append(filepath)
    # os.walk yields entries in an arbitrary, filesystem-dependent order.
    # Sorting makes the result (and any seeded split built on it) reproducible.
    matching_paths.sort()
    return matching_paths

In [9]:
# Sanity check: gather the validated files from every folder named "0" under the collected data.
filepath_list = get_filepath_list(collected_data_dir, for_class="0")

In [10]:
# Number of files found for class "0".
len(filepath_list)

12

In [11]:
# Split the collected files of each class into train/val/test.
# collected_dataset[split][class_index] is the list of file paths for that class;
# the list position is the class label because classes are appended in order 0, 1, 2.
collected_dataset = {"train": [], "val": [], "test": []}
for i in range(3):
    filepath_list = get_filepath_list(collected_data_dir, for_class=str(i))
    # Carve off 30% as a hold-out pool, then halve that pool into val and test
    # (roughly 70/15/15). Both splits are seeded so the assignment is repeatable.
    X_train, X_holdout = train_test_split(filepath_list, random_state=42, test_size=.3)
    X_val, X_test = train_test_split(X_holdout, random_state=42, test_size=.5)
    collected_dataset["train"].append(X_train)
    collected_dataset["val"].append(X_val)
    collected_dataset["test"].append(X_test)

In [3]:
# Download the previous dataset archive from MEGA into the working directory.
# NOTE(review): force=True presumably re-downloads even if the archive is already
# present (about 340 MB per the progress bar) - confirm, and consider relaxing it for re-runs.
previous_dataset_url = "https://mega.nz/file/vzRgwI5b#MgofnS2OXEJGAJ3cXboGNqqhMOO97jAoXXlpt_EzuhU"
previous_dataset_path = "dataset_sm_manual_split.zip"
mlutils.download(previous_dataset_url, previous_dataset_path, download_from="mega", force=True)

100%|██████████| 340M/340M [04:00<00:00, 1.41MiB/s] 


File downloaded: dataset_sm_manual_split.zip


In [28]:
# Unpack the downloaded archive; the helper reports the extraction directory in its output.
mlutils.unpack_archive(previous_dataset_path)

Archive unpacked to ./dataset_sm_manual_split


In [4]:
# Directory the archive was unpacked to (the path reported by unpack_archive).
previous_dataset_extracted_path = "./dataset_sm_manual_split"

In [30]:
# Start the merged dataset as a full copy of the previous dataset.
# shutil.copytree raises FileExistsError when the destination already exists,
# so remove "dataset_merged" before re-running this cell.
merged_dataset_dir = "dataset_merged"
shutil.copytree(previous_dataset_extracted_path, merged_dataset_dir)

'dataset_merged'

In [31]:
# Show the directory layout of the previous dataset.
mlutils.tree(previous_dataset_extracted_path)

└── dataset_sm_manual_split
    └── videos
        ├── test
        │   ├── 0 - 6
        │   ├── 1 - 7
        │   └── 2 - 6
        ├── train
        │   ├── 0 - 12
        │   ├── 1 - 11
        │   └── 2 - 12
        └── val
            ├── 0 - 7
            ├── 1 - 7
            └── 2 - 7


In [34]:
# Show the directory layout of the collected data, for comparison with the previous dataset.
mlutils.tree(collected_data_dir)

└── collected_data
    ├── with object and with sleeve
    │   ├── with bucket
    │   │   ├── 0 - 2
    │   │   ├── 1 - 2
    │   │   └── 2 - 2
    │   └── without bucket
    │       ├── 0 - 2
    │       ├── 1 - 2
    │       └── 2 - 2
    ├── with object and without sleeve
    │   ├── with bucket
    │   │   ├── 0 - 2
    │   │   ├── 1 - 2
    │   │   └── 2 - 2
    │   └── without bucket
    │       ├── 0 - 2
    │       ├── 1 - 2
    │       └── 2 - 2
    ├── without object and with sleeve
    │   ├── 0 - 2
    │   ├── 1 - 2
    │   └── 2 - 2
    └── without object and without sleeve
        ├── 0 - 2
        ├── 1 - 2
        └── 2 - 2


In [32]:
# Copy every collected file into the matching split/class folder of the merged dataset.
# The position of each list inside collected_dataset[split_name] is the class label.
# NOTE: files are stored under their base name only, so two source files with the
# same name landing in the same split/class folder would overwrite each other.
for split_name in collected_dataset:  # one of "train", "val", "test"
    for class_label, filepaths in enumerate(collected_dataset[split_name]):
        target_dir = os.path.join(merged_dataset_dir, "videos", split_name, str(class_label))
        # If the folder were missing, shutil.copy would treat the path as a file
        # name and every copy would clobber that single file.
        os.makedirs(target_dir, exist_ok=True)
        for filepath in filepaths:
            filename = os.path.basename(filepath)
            shutil.copy(filepath, os.path.join(target_dir, filename))

In [33]:
# Show the merged dataset layout to check the result of the copy step.
mlutils.tree(merged_dataset_dir)

└── dataset_merged
    └── videos
        ├── test
        │   ├── 0 - 8
        │   ├── 1 - 9
        │   └── 2 - 8
        ├── train
        │   ├── 0 - 20
        │   ├── 1 - 19
        │   └── 2 - 20
        └── val
            ├── 0 - 9
            ├── 1 - 9
            └── 2 - 9


In [40]:
# Recursively archive the merged dataset directory as "<merged_dataset_dir>.zip" (requires the zip CLI).
!zip -r "{merged_dataset_dir}.zip" "{merged_dataset_dir}"

  adding: dataset_merged/ (stored 0%)
  adding: dataset_merged/videos/ (stored 0%)
  adding: dataset_merged/videos/val/ (stored 0%)
  adding: dataset_merged/videos/val/0/ (stored 0%)
  adding: dataset_merged/videos/val/0/HandWash_024_A_12_G_05.mp4 (deflated 0%)
  adding: dataset_merged/videos/val/0/20250125_144612.mp4 (deflated 1%)
  adding: dataset_merged/videos/val/0/HandWash_022_A_11_G_05.mp4 (deflated 0%)
  adding: dataset_merged/videos/val/0/20250125_141054.mp4 (deflated 1%)
  adding: dataset_merged/videos/val/0/HandWash_008_A_11_G_01.mp4 (deflated 0%)
  adding: dataset_merged/videos/val/0/HandWash_014_A_11_G_03.mp4 (deflated 0%)
  adding: dataset_merged/videos/val/0/HandWash_011_A_12_G_02.mp4 (deflated 0%)
  adding: dataset_merged/videos/val/0/HandWash_007_A_12_G_01.mp4 (deflated 0%)
  adding: dataset_merged/videos/val/0/HandWash_016_A_11_G_04.mp4 (deflated 0%)
  adding: dataset_merged/videos/val/1/ (stored 0%)
  adding: dataset_merged/videos/val/1/HandWash_010_A_01_G02.mp4 (defl