<a href="https://colab.research.google.com/github/vuong-viet-hung/Electrical-Component-Recognition/blob/main/Dataset%20Preparation/Dataset_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab VM so the project folder is reachable
# under /content/drive/.
from google.colab import drive

drive.mount(mountpoint='/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
# Work from the project folder on the mounted Drive; the relative paths used
# in the following cells (kaggle/, the dataset folders) resolve against it.
%cd '/content/drive/MyDrive/Mandevices/Machine Learning/Electronic Component Recognition'/

/content/drive/MyDrive/Mandevices/Machine Learning/Electronic Component Recognition


In [None]:
%mkdir kaggle/

In [None]:
# Step into kaggle/ so the uploaded file is saved there, then step back out
# to the project folder once the upload is done.
%cd kaggle/
from google.colab import files
# Opens Colab's interactive upload widget; the saved output of this cell
# shows kaggle.json being uploaded here.
files.upload()
%cd ../

/content/drive/My Drive/Mandevices/Machine Learning/Electronic Component Recognition/kaggle


Saving kaggle.json to kaggle.json
/content/drive/My Drive/Mandevices/Machine Learning/Electronic Component Recognition


In [None]:
import os

# Tell the Kaggle command-line tool which directory holds its configuration
# (the kaggle/ folder the kaggle.json file was uploaded into).
os.environ.update({
    'KAGGLE_CONFIG_DIR': '/content/drive/MyDrive/Mandevices/Machine Learning/Electronic Component Recognition/kaggle/',
})

In [None]:
%cd 'Orginal Dataset'/
!kaggle datasets download -d mrojer/electronic-components-for-automatic-detection
!unzip electronic-components-for-automatic-detection.zip
%cd ../

/content/drive/My Drive/Mandevices/Machine Learning/Electronic Component Recognition/Dataset
Downloading electronic-components-for-automatic-detection.zip to /content/drive/MyDrive/Mandevices/Machine Learning/Electronic Component Recognition/Dataset
100% 466M/468M [00:05<00:00, 113MB/s]
100% 468M/468M [00:05<00:00, 96.5MB/s]
/content/drive/My Drive/Mandevices/Machine Learning/Electronic Component Recognition


In [None]:
import shutil
import itertools
from collections import defaultdict
from pathlib import Path
from typing import Callable

In [None]:
# Input: folder the Kaggle archive is extracted into.
# Both paths are relative to the current working directory.
ORIGINAL_DATASET_PATH: Path = Path('Original Dataset/ECAD_dataset/')
# Output: root of the restructured dataset.
DATASET_PATH: Path = Path('Dataset/')

In [None]:
def get_train_valid_test(image_id: int) -> str:
    """
    Return the train set's, valid set's or test set's name in 0.8:0.1:0.1 ratio.

    The split is chosen from ``image_id`` modulo 10: remainder 0 maps to
    'Test', remainder 1 maps to 'Valid' and the other eight remainders map
    to 'Train'.

    Args:
        image_id: Integer index of the image (for example a running counter).
            It must be a number: with a ``str`` the ``%`` operator would
            perform string formatting instead of taking a remainder.

    Returns:
        One of 'Train', 'Valid' or 'Test'.
    """
    remainder = image_id % 10
    if remainder == 0:
        return 'Test'
    if remainder == 1:
        return 'Valid'
    return 'Train'

In [None]:
class Dataset:
    """
    Copy the ECAD dataset into a ``<split>/<class>/<n><suffix>`` layout.

    Images are numbered per class with a running counter, and the split
    ('Train', 'Valid' or 'Test') of each image is derived from that number.
    Directories and files are visited in sorted order so that the numbering,
    and therefore the split assignment, is the same on every run.
    """

    def __init__(self, original_dataset_path: Path, dataset_path: Path) -> None:
        """
        Args:
            original_dataset_path: Root of the original dataset; each
                sub-directory is treated as one class.
            dataset_path: Root directory the restructured dataset is
                written to.
        """
        self.__original_dataset_path: Path = original_dataset_path
        self.__dataset_path: Path = dataset_path
        # Number of images copied so far, per class name.
        self.__images_count: defaultdict[str, int] = defaultdict(int)

    @staticmethod
    def __subdirs(directory: Path) -> list:
        """
        Return the sub-directories of `directory`, sorted by path.

        Plain files (e.g. 'dimensions.yml') are left out, and sorting
        removes the dependence on the arbitrary order of `Path.iterdir()`.
        """
        return sorted(entry for entry in directory.iterdir() if entry.is_dir())

    def __create_dirs(self) -> None:
        """Create dataset directory and subdirectories."""
        train_val_test_dirs = ('Train', 'Valid', 'Test')
        # One class per sub-directory of the original dataset; stray files in
        # the dataset root must not become classes.
        class_names = [
            class_dir_path.name
            for class_dir_path in self.__subdirs(self.__original_dataset_path)
        ]
        class_names.append('background')
        for train_val_test_dir, class_name in itertools.product(
            train_val_test_dirs,
            class_names
        ):
            new_dir = (
                self.__dataset_path
                / train_val_test_dir
                / class_name
            )
            new_dir.mkdir(parents=True, exist_ok=True)

    def __copy_images(
        self, background_component_dir: Path,
        class_name: str, get_train_valid_test: Callable[[int], str]) -> None:
        """
        Copy images from the original to the restructured dataset.

        Args:
            background_component_dir: Directory whose files are copied.
            class_name: Class directory name the files are copied into.
            get_train_valid_test: Maps the running per-class image number to
                the split directory name ('Train', 'Valid' or 'Test').
        """
        # Sorted so the image numbering is reproducible between runs.
        for src_image in sorted(background_component_dir.iterdir()):
            # shutil.copy cannot copy directories; skip anything that is not
            # a file, without consuming an image number.
            if not src_image.is_file():
                continue
            self.__images_count[class_name] += 1
            image_number = self.__images_count[class_name]
            dest_image: Path = (
                self.__dataset_path
                / get_train_valid_test(image_number)
                / class_name
                / (str(image_number) + src_image.suffix)
            )
            shutil.copy(src_image, dest_image)

    def restructure(self) -> None:
        """
        Restructure the dataset.

        Original dataset:
            ECAD_dataset
            ├── dataset_1
            │   ├── Train
            │   │   ├── Background
            │   │   └── Component
            │   ├── Validation
            │   │   ├── Background
            │   │   └── Component
            │   ├── Test
            │   │   ├── Background
            │   │   └── Component
            │   └── dimensions.yml
            ├── dataset_2
            ├── dataset_3
            ├── ...
            └── dataset_9

        Restructured dataset:
            Dataset
            ├── Train
            │   ├── background
            │   ├── dataset_1
            │   ├── dataset_2
            │   ├── dataset_3
            │   ├── ...
            │   └── dataset_9
            ├── Valid
            └── Test

        The original Train/Validation/Test grouping is not kept: every image
        is re-assigned to a split by the module-level `get_train_valid_test`
        function, based on its running per-class number.
        """
        self.__create_dirs()
        # class_dir: e.g. 'ECAD_dataset/dataset_1'
        for class_dir in self.__subdirs(self.__original_dataset_path):
            # train_val_test_dir: e.g. 'ECAD_dataset/dataset_1/Train'
            # (files such as 'ECAD_dataset/dataset_1/dimensions.yml' are skipped)
            for train_val_test_dir in self.__subdirs(class_dir):
                # background_component_dir:
                # e.g. 'ECAD_dataset/dataset_1/Train/Background'
                for background_component_dir in self.__subdirs(train_val_test_dir):
                    if background_component_dir.name == "Background":
                        # Backgrounds of all classes share one 'background' class.
                        class_name = 'background'
                    else:  # background_component_dir.name == "Component"
                        class_name = class_dir.name
                    self.__copy_images(
                        background_component_dir,
                        class_name,
                        get_train_valid_test
                    )

    def remove_original_dataset(self) -> None:
        """Remove the original dataset directory."""
        shutil.rmtree(self.__original_dataset_path)

In [None]:
# Build the restructurer for the configured input/output folders and copy
# every image into the new <split>/<class> layout.
dataset = Dataset(
    original_dataset_path=ORIGINAL_DATASET_PATH,
    dataset_path=DATASET_PATH,
)
dataset.restructure()