# Dataset Preparation

In [None]:
import xml.etree.ElementTree as ET

from xml.dom.minidom import parse, parseString
from pathlib import Path
import lightning as L
import matplotlib.pyplot as plt
import pandas as pd
import rootutils
import torch
import torchvision
from hydra import compose, initialize


In [None]:
# Locate the project root (the directory holding ".project-root") and configure
# the session around it. The search starts at the current working directory and
# moves up through its parents, so it succeeds on the first run (cwd is the
# notebook's directory) and also on a re-run, when `cwd=True` has already moved
# the working directory to the root. Starting from `Path.cwd().parent` instead
# would begin *above* the root on a re-run and fail with
# "FileNotFoundError: Project root directory not found".
rootutils.setup_root(
    Path.cwd(),  # directory the upward search for the indicator starts from
    indicator=".project-root",
    project_root_env_var=True,  # set the PROJECT_ROOT environment variable to root directory
    dotenv=True,  # load environment variables from .env if exists in root directory
    pythonpath=True,  # add root directory to the PYTHONPATH (helps with imports)
    cwd=True,  # change current working directory to the root directory (helps with filepaths)
)


FileNotFoundError: Project root directory not found. Indicators: ['.project-root']

In [None]:
%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
# Build the "train" configuration from the ../configs directory using Hydra's
# compose API, then show its `paths` section. Printing does not resolve the
# interpolations, so it can happen after the Hydra context has been closed.
with initialize(config_path="../configs", version_base="1.2"):
    cfg = compose(config_name="train")
print(cfg.paths)


{'root_dir': '${oc.env:PROJECT_ROOT}', 'data_dir': '${paths.root_dir}/data/', 'log_dir': '${paths.root_dir}/logs/', 'output_dir': '${hydra:runtime.output_dir}', 'work_dir': '${hydra:runtime.cwd}'}


## Download the dataset

In [None]:
# Dataset location: <configured data directory> / <configured dataset directory name>.
datasets_dir = Path(cfg.paths.data_dir).joinpath(cfg.data.dataset_dir_name)
datasets_dir

PosixPath('/home/ultron/AI/practice-projects/CV/SmartWaste-Detect-E2E/data/Waste-Segregation-Image-Dataset-2')

In [None]:
# Download the dataset only when its target directory does not exist yet, so a
# re-run of the notebook skips the network round-trip.
if not datasets_dir.exists():
    # Imported lazily: roboflow is only needed on the download path.
    from roboflow import Roboflow

    # NOTE(review): no api_key is passed here; presumably the credentials come
    # from the environment / .env file — confirm.
    rf = Roboflow()

    project = rf.workspace("waste-segregation-image-dataset").project("waste-segregation-image-dataset")
    version = project.version(4)
    # Request the "voc" export and store it under datasets_dir.
    dataset = version.download("voc", str(datasets_dir))


In [None]:
# Per-split sub-directories of the downloaded dataset.
# NOTE(review): the validation folder is assumed to be named "val"; verify it
# matches the directory the export actually created on disk.
TRAIN_DIR, VAL_DIR = (datasets_dir / split_name for split_name in ("train", "val"))

In [None]:
def _voc_text(node, tag, source):
    """Return the text of child element ``tag`` of ``node``; raise ValueError naming ``source`` if it is absent."""
    child = node.find(tag)
    if child is None:
        raise ValueError(f"{source}: missing <{tag}> element")
    return child.text


def _voc_int(node, tag, source):
    """Return child element ``tag`` of ``node`` as an int, accepting decimal text such as ``"12.0"``."""
    raw = _voc_text(node, tag, source)
    try:
        # Plain integer text is the common case and is converted exactly.
        return int(raw)
    except (TypeError, ValueError):
        pass
    try:
        # Fall back to decimal text; the fractional part is truncated.
        return int(float(raw))
    except (TypeError, ValueError, OverflowError) as err:
        raise ValueError(f"{source}: <{tag}> is not a number: {raw!r}") from err


def prepare_datsets(ds_dir: Path) -> pd.DataFrame:
    """Collect bounding-box annotations from the XML files of a directory.

    Every file directly inside ``ds_dir`` whose suffix is ``.xml`` is parsed,
    and one row is produced per ``<object>`` element. A file without any
    ``<object>`` element contributes no rows.

    Args:
        ds_dir: Directory containing the XML annotation files.

    Returns:
        A DataFrame with the columns ``filename``, ``width``, ``height``,
        ``class_name``, ``xmin``, ``ymin``, ``xmax`` and ``ymax``. Rows are
        ordered by annotation file path (so the result is reproducible across
        runs and file systems) and, within a file, by object order. The frame
        is empty, but keeps these columns, when no object is found.

    Raises:
        ValueError: If a required element is missing or a numeric field cannot
            be parsed. The message names the offending file.
        xml.etree.ElementTree.ParseError: If a file is not well-formed XML.
    """
    columns = ["filename", "width", "height", "class_name", "xmin", "ymin", "xmax", "ymax"]
    records = []

    # iterdir() yields entries in arbitrary order; sort for a stable row order.
    xml_files = sorted(path for path in ds_dir.iterdir() if path.suffix == ".xml")
    for xml_file in xml_files:
        root = ET.parse(xml_file).getroot()
        filename = _voc_text(root, "filename", xml_file)

        size = root.find("size")
        if size is None:
            raise ValueError(f"{xml_file}: missing <size> element")
        width = _voc_int(size, "width", xml_file)
        height = _voc_int(size, "height", xml_file)

        for obj in root.findall("object"):
            bndbox = obj.find("bndbox")
            if bndbox is None:
                raise ValueError(f"{xml_file}: missing <bndbox> element")
            records.append(
                {
                    "filename": filename,
                    "width": width,
                    "height": height,
                    "class_name": _voc_text(obj, "name", xml_file),
                    "xmin": _voc_int(bndbox, "xmin", xml_file),
                    "ymin": _voc_int(bndbox, "ymin", xml_file),
                    "xmax": _voc_int(bndbox, "xmax", xml_file),
                    "ymax": _voc_int(bndbox, "ymax", xml_file),
                }
            )

    # Passing `columns` keeps the schema stable even when `records` is empty.
    return pd.DataFrame(records, columns=columns)


# Build the annotation table for the training split and preview its first rows.
df = prepare_datsets(TRAIN_DIR)
df.head()

Unnamed: 0,filename,width,height,class_name,xmin,ymin,xmax,ymax
0,metal_cans_27_jpg.rf.78a33b699c7b211a37c33995d...,640,640,metal_cans,260,1,591,504
1,plastic_bottles_30_jpg.rf.537b72469674378988c4...,640,640,plastic_bottles,46,15,640,641
2,food_waste_6733_jpg.rf.75ab9250c2085a128373624...,640,640,food_waste,55,1,592,641
3,leaf_waste_104_jpg.rf.9b126452f14c63a95bc8c16f...,640,640,leaf_waste,23,96,586,577
4,wood_waste_54_jpg.rf.dd07e7543e7b5746b6893f0f6...,640,640,wood_waste,280,1,641,517


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4017 entries, 0 to 4016
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   filename    4017 non-null   object
 1   width       4017 non-null   int64 
 2   height      4017 non-null   int64 
 3   class_name  4017 non-null   object
 4   xmin        4017 non-null   int64 
 5   ymin        4017 non-null   int64 
 6   xmax        4017 non-null   int64 
 7   ymax        4017 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 251.2+ KB
