# Exploratory Data Analysis

In [3]:
import xml.etree.ElementTree as ET

from xml.dom.minidom import parse, parseString
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import rootutils
import torch
import torchvision
from hydra import compose, initialize


In [4]:
rootutils.setup_root(
    Path.cwd().parent,  # path to the root directory,
    indicator=".project-root",
    project_root_env_var=True,  # set the PROJECT_ROOT environment variable to root directory
    dotenv=True,  # load environment variables from .env if exists in root directory
    pythonpath=True,  # add root directory to the PYTHONPATH (helps with imports)
    cwd=True,  # change current working directory to the root directory (helps with filepaths)
)


PosixPath('/home/ultron/AI/practice-projects/CV/SmartWaste-Detect-E2E')

In [5]:
%load_ext autoreload
%autoreload 2


In [6]:
with initialize(version_base="1.2", config_path="../configs"):
    cfg = compose(config_name="train")
    print(cfg.paths)


{'root_dir': '${oc.env:PROJECT_ROOT}', 'data_dir': '${paths.root_dir}/data/', 'log_dir': '${paths.root_dir}/logs/', 'output_dir': '${hydra:runtime.output_dir}', 'work_dir': '${hydra:runtime.cwd}'}


## Download the dataset

In [7]:
datasets_dir = Path(cfg.paths.data_dir) / cfg.data.dataset_dir_name
datasets_dir

PosixPath('/home/ultron/AI/practice-projects/CV/SmartWaste-Detect-E2E/data/Waste-Segregation-Image-Dataset-2')

In [8]:
if not datasets_dir.exists():
    from roboflow import Roboflow

    rf = Roboflow()

    project = rf.workspace("waste-segregation-image-dataset").project("waste-segregation-image-dataset")
    version = project.version(2)
    dataset = version.download("voc", str(datasets_dir))


In [9]:
TRAIN_DIR = datasets_dir / "train"
VAL_DIR = datasets_dir / "val"

In [14]:
def prepare_datsets(ds_dir: Path) -> pd.DataFrame:
    """
    This function will read image information from xml files and convert it to a pandas dataframe.
    """
    datasets_dict = {
        "filename": [],
        "width": [],
        "height": [],
        "class_name": [],
        "xmin": [],
        "ymin": [],
        "xmax": [],
        "ymax": [],
    }
    for xml_file in ds_dir.iterdir():
        if xml_file.suffix == ".xml":
            tree = ET.parse(xml_file)
            filename = tree.find("filename").text
            size = tree.find("size")
            width = int(size.find("width").text)
            height = int(size.find("height").text)

            for obj in tree.findall("object"):
                class_name = obj.find("name").text
                bndbox = obj.find("bndbox")
                xmin = int(bndbox.find("xmin").text)
                ymin = int(bndbox.find("ymin").text)
                xmax = int(bndbox.find("xmax").text)
                ymax = int(bndbox.find("ymax").text)
                datasets_dict["filename"].append(filename)
                datasets_dict["width"].append(width)
                datasets_dict["height"].append(height)
                datasets_dict["class_name"].append(class_name)
                datasets_dict["xmin"].append(xmin)
                datasets_dict["ymin"].append(ymin)
                datasets_dict["xmax"].append(xmax)
                datasets_dict["ymax"].append(ymax)

    return pd.DataFrame(datasets_dict)


df = prepare_datsets(TRAIN_DIR)
df.head()

Unnamed: 0,filename,width,height,class_name,xmin,ymin,xmax,ymax
0,metal_cans_147_jpg.rf.b76a5eec120f561afe4a82e8...,640,640,metal_cans,224,174,485,477
1,food_waste_6304_jpg.rf.fa839a07564577870e46656...,640,640,food_waste,1,119,235,345
2,food_waste_6304_jpg.rf.fa839a07564577870e46656...,640,640,food_waste,1,360,271,627
3,food_waste_6304_jpg.rf.fa839a07564577870e46656...,640,640,food_waste,225,95,450,254
4,food_waste_6304_jpg.rf.fa839a07564577870e46656...,640,640,food_waste,230,271,478,503


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1518 entries, 0 to 1517
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   filename    1518 non-null   object
 1   width       1518 non-null   int64 
 2   height      1518 non-null   int64 
 3   class_name  1518 non-null   object
 4   xmin        1518 non-null   int64 
 5   ymin        1518 non-null   int64 
 6   xmax        1518 non-null   int64 
 7   ymax        1518 non-null   int64 
dtypes: int64(6), object(2)
memory usage: 95.0+ KB


In [17]:
df.class_name.unique()

array(['metal_cans', 'food_waste', 'ewaste', 'plastic_bottles',
       'paper_waste', 'leaf_waste', 'wood_waste', 'plastic_bags'],
      dtype=object)