# EDA for Happy Wheel dataset

In [None]:
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm
import plotly.graph_objects as go
import cv2
import albumentations as A
import numpy as np
import pandas as pd
import plotly.io as pio
pio.renderers.default='iframe'

In [None]:
# Location of the training metadata table.
csv_path = Path("/home/kitamura/dataset/HappyWheel/train.csv")
# Load the whole table into memory; later cells read the `species` column
# and add per-image size columns.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [None]:
# Directory that holds the training images.
input_dir = Path("/home/kitamura/dataset/HappyWheel/train_images")
# File names (without directory part) of every .jpg found under input_dir,
# searched recursively.
images = [jpg_path.name for jpg_path in input_dir.rglob("*.jpg")]

In [None]:
# Distinct species labels in ascending order; the bare expression on the
# last line displays the array as the cell output.
species = np.sort(df["species"].unique())
species

## Check image size and aspect ratio

I found that the image size differs from image to image in the dataset.  
Also, the aspect ratio (width / height) varies drastically, from 1.0 to 15.0.  
This observation suggests that changing the aspect ratio when resizing images is a bad idea.  
So, instead of changing the aspect ratio, padding should be used.  
Image resizing vs. padding is also discussed on [Stack Overflow](https://stackoverflow.com/a/61930818/14199112).

### Hypothesis

- The mean aspect ratio may differ between species (classes).
  => Visualizing the aspect ratio of each species rejects this hypothesis: no obvious trend is found.

In [None]:
# NOTE(review): never filled or read in the visible cells; kept in case a
# later cell relies on the name.
aspect_ratios = []

def get_aspect_ratio(path):
    """Read one training image and report its aspect ratio and size.

    Args:
        path: Image file name, resolved relative to ``input_dir``.

    Returns:
        A tuple ``(aspect, width, height)`` where ``aspect`` is
        ``width / height`` as a float and the sizes are in pixels.

    Raises:
        OSError: If OpenCV cannot read or decode the file.
    """
    img_path = input_dir / path
    img = cv2.imread(str(img_path))
    if img is None:
        # cv2.imread reports failure by returning None instead of raising,
        # which would otherwise surface as an opaque AttributeError below.
        raise OSError(f"cv2.imread could not read image: {img_path}")
    # Only the first two axes (rows, columns) are needed, so do not depend
    # on the presence of a channel axis.
    h, w = img.shape[:2]
    aspect = float(w) / h
    return (aspect, w, h)


# Measure every image with a pool of 16 worker threads.
with ThreadPoolExecutor(max_workers=16) as executor:
    # Despite the name, executor.map yields the results themselves (not
    # Future objects), in the same order as `images`.
    futures = executor.map(get_aspect_ratio, images)
    res = list(futures)

In [None]:
# Per-file measurements, in the same order as `images` (directory scan order).
widths = [r[1] for r in res]
heights = [r[2] for r in res]
aspects = [r[0] for r in res]

# `images` comes from a directory scan, whose order is unrelated to the row
# order of the CSV. Assigning the lists to `df` by position would attach sizes
# to the wrong rows, so index the measurements by file name and look them up
# per row instead.
size_table = pd.DataFrame(
    {"width": widths, "height": heights, "aspect": aspects},
    index=images,
)
# The same file name can only appear twice if the recursive scan met it in two
# sub-directories; both entries were read from the same path, so keep one.
size_table = size_table[~size_table.index.duplicated()]

# Find the column of `df` that holds the image file names: the first column
# whose every value is a known file name.
file_col = next(
    (col for col in df.columns if df[col].isin(size_table.index).all()),
    None,
)
if file_col is None:
    raise ValueError(
        "no column of the CSV matches the image file names found in input_dir"
    )

for size_col in ("width", "height", "aspect"):
    df[size_col] = df[file_col].map(size_table[size_col])

In [None]:
import plotly.express as px

# One box per species showing the distribution of its aspect ratios.
fig = px.box(data_frame=df, x="species", y="aspect")
fig.show()

In [None]:
import plotly.express as px

# One width-vs-height scatter plot per species, all drawn on the same fixed
# 0-4000 axes so the plots can be compared with each other.
for spec in species:
    rows_of_species = df.loc[df["species"] == spec]
    fig = px.scatter(rows_of_species, x="width", y="height", color="species")
    fig.update_xaxes(range=[0, 4000])
    fig.update_yaxes(range=[0, 4000])
    fig.show()