In [None]:
from pathlib import Path
from collections import defaultdict, Counter

import pandas as pd
import seaborn as sns
from PIL import Image, UnidentifiedImageError

In [None]:
# Single place to change when the data lives elsewhere on another machine.
# Both dataset folders are derived from this root.
DATA_ROOT = Path("C:/data/nachtvlinderdata")

data_path_irecord = DATA_ROOT / "IRecord"
data_path_telmee = DATA_ROOT / "telmee"

# CSVs

In [None]:
# Load the original IRecord CSV export and preview its first rows.
original_csv_path = data_path_irecord / "mothsOriginal.csv"
irecord_df = pd.read_csv(original_csv_path)
irecord_df.head()

In [None]:
# Summary statistics for a selection of columns of the frame loaded above.
summary_columns = [
    "preferred_taxon",
    "default_common_name",
    "taxon_group",
    "order_taxon",
    "record_status",
]
irecord_df[summary_columns].describe()

In [None]:
# Load the CSV that sits inside the image folder and preview its first rows.
# NOTE: this rebinds `irecord_df`, replacing the frame read from
# mothsOriginal.csv; re-running earlier cells afterwards will see this frame.
images_csv_path = data_path_irecord / "moths_images" / "moths_images.csv"
irecord_df = pd.read_csv(images_csv_path)
irecord_df.head()

In [None]:
# Summary statistics for the same selection of columns, now on the
# most recently loaded `irecord_df`.
summary_columns = [
    "preferred_taxon",
    "default_common_name",
    "taxon_group",
    "order_taxon",
    "record_status",
]
irecord_df[summary_columns].describe()

In [None]:
# Load the Telmee CSV export and preview its first rows.
telmee_csv_path = data_path_telmee / "export_ndff3.csv"
telmee_df = pd.read_csv(telmee_csv_path)
telmee_df.head()

In [None]:
# Summary statistics of the Telmee export.
telmee_summary = telmee_df.describe()
telmee_summary

# Image folders

## IRecord

In [None]:
folder_path_irecord = data_path_irecord / "moths_images"
# Count sub-directories only: this folder also holds moths_images.csv
# (read in an earlier cell), which would otherwise be counted as an
# image folder.
sum(1 for entry in folder_path_irecord.iterdir() if entry.is_dir())

In [None]:
# Number of entries in each IRecord class folder, keyed by folder name.
# Non-directories and empty folders are left out.
num_images_irecord = Counter()
for class_dir in folder_path_irecord.iterdir():
    if not class_dir.is_dir():
        continue
    entry_count = sum(1 for _ in class_dir.iterdir())
    if entry_count > 0:
        num_images_irecord[class_dir.name] = entry_count
num_images_irecord.most_common(10)

In [None]:
# Distribution of the number of images per class folder (IRecord).
# A plain list is passed so seaborn receives one flat vector of counts
# instead of a dict view object.
ax = sns.histplot(list(num_images_irecord.values()))
# Label the figure so it can be read on its own; the trailing semicolon
# suppresses the repr of the returned artists.
ax.set(xlabel="num images", ylabel="number of classes", title="IRecord: images per class");

In [None]:
# Summary statistics of the per-class image counts (IRecord).
image_counts_irecord_df = pd.DataFrame(
    list(num_images_irecord.values()),
    columns=["num images"],
)
image_counts_irecord_df.describe()

In [None]:
# Total number of counted entries across all IRecord class folders.
total_images_irecord = sum(num_images_irecord.values())
total_images_irecord

In [None]:
# Takes long: every image file is opened once to read its dimensions.
resolutions_irecord = []
for class_path in folder_path_irecord.iterdir():

    # Skip non-directory entries of the image folder (e.g. moths_images.csv).
    if not class_path.is_dir():
        continue

    for image_path in class_path.iterdir():
        try:
            # Image.open keeps the underlying file open until the image is
            # loaded or closed; the context manager releases the handle
            # immediately so the loop does not accumulate open files.
            with Image.open(image_path) as img:
                # img.size is a (width, height) tuple.
                resolutions_irecord.append(img.size)
        except (UnidentifiedImageError, PermissionError):
            # Not a readable image (or not accessible): skip it, matching
            # the error handling of the Telmee loop.
            continue

In [None]:
# Turn the collected (width, height) tuples into a frame and summarise it.
resolution_columns = ["width", "height"]
resolutions_irecord_df = pd.DataFrame(
    data=resolutions_irecord,
    columns=resolution_columns,
)
resolutions_irecord_df.describe()

In [None]:
# Width against height for every IRecord image that could be opened.
sns.scatterplot(
    x="width",
    y="height",
    data=resolutions_irecord_df,
)

## Telmee

In [None]:
# Location of the Telmee photos and the number of entries directly inside it.
folder_path_telmee = data_path_telmee.joinpath("photos")
sum(1 for _ in folder_path_telmee.iterdir())

In [None]:
# Number of entries in each Telmee class folder, keyed by folder name.
# Non-directory entries are skipped, as in the IRecord cell, so a stray
# file in the photos folder cannot raise NotADirectoryError.
# Empty folders are kept with a count of 0 (unchanged behaviour).
num_images_telmee = Counter(
    {
        class_dir.name: sum(1 for _ in class_dir.iterdir())
        for class_dir in folder_path_telmee.iterdir()
        if class_dir.is_dir()
    }
)
num_images_telmee.most_common(10)

In [None]:
# Distribution of the number of images per class folder (Telmee).
# A plain list is passed so seaborn receives one flat vector of counts
# instead of a dict view object.
ax = sns.histplot(list(num_images_telmee.values()))
# Label the figure so it can be read on its own; the trailing semicolon
# suppresses the repr of the returned artists.
ax.set(xlabel="num images", ylabel="number of classes", title="Telmee: images per class");

In [None]:
# Summary statistics of the per-class image counts (Telmee).
image_counts_telmee_df = pd.DataFrame(
    list(num_images_telmee.values()),
    columns=["num images"],
)
image_counts_telmee_df.describe()

In [None]:
# Total number of counted entries across all Telmee class folders.
total_images_telmee = sum(num_images_telmee.values())
total_images_telmee

In [None]:
# Takes long: every image file is opened once to read its dimensions.
resolutions_telmee = []
for class_path in folder_path_telmee.iterdir():
    # Skip non-directory entries so a stray file in the photos folder does
    # not abort the scan (same guard as the IRecord loop).
    if not class_path.is_dir():
        continue

    for image_path in class_path.iterdir():
        try:
            # Image.open keeps the underlying file open until the image is
            # loaded or closed; the context manager releases the handle
            # immediately so the loop does not accumulate open files.
            with Image.open(image_path) as img:
                # img.size is a (width, height) tuple.
                resolutions_telmee.append(img.size)
        except (UnidentifiedImageError, PermissionError):
            # Not a readable image (or not accessible): skip it.
            continue

In [None]:
# Turn the collected (width, height) tuples into a frame and summarise it.
resolution_columns = ["width", "height"]
resolutions_telmee_df = pd.DataFrame(
    data=resolutions_telmee,
    columns=resolution_columns,
)
resolutions_telmee_df.describe()

In [None]:
# Width against height for every Telmee image that could be opened.
sns.scatterplot(
    x="width",
    y="height",
    data=resolutions_telmee_df,
)

## Merged

In [None]:
# Combined number of counted entries over both datasets.
total_images_telmee = sum(num_images_telmee.values())
total_images_irecord = sum(num_images_irecord.values())
total_images_telmee + total_images_irecord

In [None]:
# Class names are the names of the sub-directories of each image folder.
# Filtering on is_dir() keeps plain files out of the sets: the IRecord
# folder also contains moths_images.csv, which is not a class.
classes_irecord = {p.name for p in folder_path_irecord.iterdir() if p.is_dir()}
classes_telmee = {p.name for p in folder_path_telmee.iterdir() if p.is_dir()}

In [None]:
# Sizes of: IRecord classes, Telmee classes, their union, their
# intersection, IRecord-only classes, Telmee-only classes.
(
    len(classes_irecord),
    len(classes_telmee),
    len(classes_irecord | classes_telmee),
    len(classes_irecord & classes_telmee),
    len(classes_irecord - classes_telmee),
    len(classes_telmee - classes_irecord),
)

In [None]:
# Per-class counts summed over both datasets. Counter addition keeps only
# entries whose combined count is positive.
num_images_all = num_images_irecord + num_images_telmee
top_classes_all = num_images_all.most_common(10)
top_classes_all

In [None]:
# Distribution of the number of images per class over both datasets.
# A plain list is passed so seaborn receives one flat vector of counts
# instead of a dict view object.
ax = sns.histplot(list(num_images_all.values()))
# Label the figure so it can be read on its own; the trailing semicolon
# suppresses the repr of the returned artists.
ax.set(xlabel="num images", ylabel="number of classes", title="Merged: images per class");

In [None]:
# Summary statistics of the per-class image counts over both datasets.
image_counts_all_df = pd.DataFrame(
    list(num_images_all.values()),
    columns=["num images"],
)
image_counts_all_df.describe()