# <a id='toc1_'></a>[Dataset Analysis](#toc0_)

**Table of contents**<a id='toc0_'></a>    
- [Dataset Analysis](#toc1_)    
  - [Get number of samples in each class for each set](#toc1_1_)    
    - [Group by set](#toc1_1_1_)    
  - [Get changes in number of samples for noisy and clean synthetic set](#toc1_2_)    
    - [Breakdown by classes](#toc1_2_1_)    
    - [Group by Interpolation Steps](#toc1_2_2_)    

<!-- vscode-jupyter-toc-config
	numbering=false
	anchor=true
	flat=false
	minLevel=1
	maxLevel=6
	/vscode-jupyter-toc-config -->
<!-- THIS CELL WILL BE REPLACED ON TOC UPDATE. DO NOT WRITE YOUR TEXT IN THIS CELL -->

In [1]:
import os
import pandas as pd

from tools import analysis

## <a id='toc1_1_'></a>[Get number of samples in each class for each set](#toc0_)

In [2]:
ds = ("imagenette", "imagewoof", "stanford-dogs")
types = ("train", "val", "synthetic", "synthetic-cleaned")
# NOTE(review): the saved outputs further down list a dataset called "woof",
# not "imagewoof" — confirm the directory name under data/ before re-running.

# Column-oriented accumulator: one entry per (dataset, split, class) directory.
ds_classes = {"ds": [], "type": [], "class": [], "n_images": []}

for dataset_name in ds:
    for split_name in types:
        split_dir = f"data/{dataset_name}/{split_name}"
        # Skip dataset/split combinations that have no directory on disk.
        if not os.path.exists(split_dir):
            continue
        for class_name in os.listdir(split_dir):
            # Every directory entry is counted as one image.
            entries = os.listdir(f"{split_dir}/{class_name}")
            record = (dataset_name, split_name, class_name, len(entries))
            # Dict keys iterate in insertion order, matching the record layout.
            for column, value in zip(ds_classes, record):
                ds_classes[column].append(value)

df = pd.DataFrame(ds_classes)
df.head()

Unnamed: 0,ds,type,class,n_images
0,imagenette,train,n01440764,963
1,imagenette,train,n02102040,955
2,imagenette,train,n02979186,993
3,imagenette,train,n03000684,858
4,imagenette,train,n03028079,941


In [3]:
# Persist the per-class image counts. The output directory is created first so
# that a fresh checkout without a results/ folder still runs top to bottom
# (to_csv does not create missing parent directories).
os.makedirs("results", exist_ok=True)
df.to_csv("results/ds_classes.csv", index=False)

### <a id='toc1_1_1_'></a>[Group by set](#toc0_)

In [4]:
# Total number of images per (dataset, split), summed over all classes.
group_keys = ["ds", "type"]
overview = df.groupby(group_keys)["n_images"].sum()
overview

ds             type             
imagenette     synthetic             9495
               synthetic-cleaned     8053
               train                 9469
               val                   3925
stanford-dogs  synthetic            12120
               train                12000
               val                   8580
woof           synthetic             9046
               synthetic-cleaned     6911
               train                 9025
               val                   3929
Name: n_images, dtype: int64

In [5]:
# Write the per-dataset / per-split totals to disk, including a header row.
overview_csv = "results/ds_overview.csv"
overview.to_csv(overview_csv, header=True)

## <a id='toc1_2_'></a>[Get changes in number of samples for noisy and clean synthetic set](#toc0_)

In [6]:
# Compare the total image count of the noisy ("synthetic") set with the
# filtered ("synthetic-cleaned") set, per dataset.
synthetic = (
    df[df.type == "synthetic"]
    .groupby("ds").n_images.sum()
    .reset_index()
    .rename(columns={"n_images": "n_images_synthetic"})
)
synthetic_cleaned = (
    df[df.type == "synthetic-cleaned"]
    .groupby("ds").n_images.sum()
    .reset_index()
    .rename(columns={"n_images": "n_images_synthetic_cleaned"})
)

# Inner join on dataset: a dataset that lacks either set is left out.
change = pd.merge(synthetic, synthetic_cleaned, on="ds")
change["abs_change"] = change.n_images_synthetic_cleaned - change.n_images_synthetic
# Relative to the size of the uncleaned set (negative = images removed).
change["rel_change"] = change["abs_change"] / change.n_images_synthetic
# dropna() returns a new frame; the result must be assigned, otherwise the
# call has no effect on what is saved and displayed below.
change = change.dropna()
change.to_csv("results/synthetic_changes.csv", index=False)
change

Unnamed: 0,ds,n_images_synthetic,n_images_synthetic_cleaned,abs_change,rel_change
0,imagenette,9495,8053,-1442,-0.151869
1,woof,9046,6911,-2135,-0.236016


### <a id='toc1_2_1_'></a>[Breakdown by classes](#toc0_)

In [7]:
# Per-class breakdown of how many synthetic images the cleaning step removed.
cleaned_per_class = (
    df[df.type == "synthetic-cleaned"]
    .groupby(["ds", "class"]).n_images.sum()
    .reset_index()
    .rename(columns={"n_images": "n_images_synthetic_cleaned"})
)
# Inner join against the uncleaned rows. The right-hand frame still carries
# its `type` column (always "synthetic"), so that column shows up in the
# result and in the CSV; it is kept to leave the saved schema unchanged.
change_by_class = pd.merge(cleaned_per_class, df[df.type == "synthetic"], on=["ds", "class"])
change_by_class = change_by_class.rename(columns={"n_images": "n_images_synthetic"})
change_by_class["abs_change"] = change_by_class.n_images_synthetic_cleaned - change_by_class.n_images_synthetic
# Relative to the size of the uncleaned class (negative = images removed).
change_by_class["rel_change"] = change_by_class["abs_change"] / change_by_class.n_images_synthetic
# dropna() returns a new frame; the result must be assigned, otherwise the
# call has no effect on what is saved and displayed below.
change_by_class = change_by_class.dropna()
change_by_class.to_csv("results/synthetic_changes_by_class.csv", index=False)
change_by_class

Unnamed: 0,ds,class,n_images_synthetic_cleaned,type,n_images_synthetic,abs_change,rel_change
0,imagenette,n01440764,645,synthetic,963,-318,-0.330218
1,imagenette,n02102040,896,synthetic,960,-64,-0.066667
2,imagenette,n02979186,971,synthetic,993,-22,-0.022155
3,imagenette,n03000684,408,synthetic,863,-455,-0.527231
4,imagenette,n03028079,920,synthetic,942,-22,-0.023355
5,imagenette,n03394916,720,synthetic,959,-239,-0.249218
6,imagenette,n03417042,841,synthetic,968,-127,-0.131198
7,imagenette,n03425413,935,synthetic,935,0,0.0
8,imagenette,n03445777,872,synthetic,952,-80,-0.084034
9,imagenette,n03888257,845,synthetic,960,-115,-0.119792


In [8]:
# Enumerate every file under data/<ds>/<type>/<class>/, one row per file.
ds_files = {"ds": [], "type": [], "class": [], "file": []}

for d in ds:
    for t in types:
        path = f"data/{d}/{t}"
        # Skip dataset/split combinations that have no directory on disk.
        if not os.path.exists(path):
            continue
        for c in os.listdir(path):
            files = os.listdir(f"{path}/{c}")
            # Extend all four columns in lock-step: the labels are repeated
            # once for each file found in this class directory.
            n_files = len(files)
            ds_files["ds"] += [d] * n_files
            ds_files["type"] += [t] * n_files
            ds_files["class"] += [c] * n_files
            ds_files["file"] += files

df_files = pd.DataFrame(ds_files)
df_files.head()

Unnamed: 0,ds,type,class,file
0,imagenette,train,n01440764,ILSVRC2012_val_00000293.JPEG
1,imagenette,train,n01440764,ILSVRC2012_val_00002138.JPEG
2,imagenette,train,n01440764,ILSVRC2012_val_00003014.JPEG
3,imagenette,train,n01440764,ILSVRC2012_val_00006697.JPEG
4,imagenette,train,n01440764,ILSVRC2012_val_00007197.JPEG


### <a id='toc1_2_2_'></a>[Group by Interpolation Steps](#toc0_)

In [9]:
# Keep only the files of the two synthetic sets. .copy() gives an independent
# frame, so adding a column below does not write into a slice of df_files.
df_synthetics = df_files[df_files.type.isin(["synthetic", "synthetic-cleaned"])].copy()
# File names end in "_<interpolation_step>.<ext>" (e.g. "...-0_6.jpg" -> 6):
# take the text after the last "_" and before the first "." that follows it.
df_synthetics["interpolation_step"] = (
    df_synthetics["file"]
    .str.split("_").str[-1]
    .str.split(".").str[0]
    .astype(int)
)
df_synthetics.head()

Unnamed: 0,ds,type,class,file,interpolation_step
13394,imagenette,synthetic,n01440764,18316237598377439927-0_6.jpg,6
13395,imagenette,synthetic,n01440764,18316237598377439927-100_4.jpg,4
13396,imagenette,synthetic,n01440764,18316237598377439927-101_0.jpg,0
13397,imagenette,synthetic,n01440764,18316237598377439927-102_1.jpg,1
13398,imagenette,synthetic,n01440764,18316237598377439927-103_4.jpg,4


In [10]:
# Break the synthetic -> synthetic-cleaned change down by interpolation step.
step_keys = ["ds", "class", "interpolation_step"]

# Number of files per (dataset, class, step) in each of the two sets.
df_synth = (
    df_synthetics[df_synthetics.type == "synthetic"]
    .groupby(step_keys).file.count()
    .reset_index()
    .rename(columns={"file": "n_images_synthetic"})
)
df_synth_clean = (
    df_synthetics[df_synthetics.type == "synthetic-cleaned"]
    .groupby(step_keys).file.count()
    .reset_index()
    .rename(columns={"file": "n_images_synthetic_cleaned"})
)

# NOTE(review): inner join — a (ds, class, step) group with no remaining
# cleaned files is absent from the result instead of showing up as a full
# removal; confirm this is intended.
interpolation_change = pd.merge(df_synth, df_synth_clean, on=step_keys)
# Project helper from tools.analysis; its exact effect is not visible here.
interpolation_change = analysis.symmetrize_steps(interpolation_change)

n_before = interpolation_change.n_images_synthetic
n_after = interpolation_change.n_images_synthetic_cleaned
interpolation_change['abs_change'] = n_after - n_before
interpolation_change['rel_change'] = (n_after - n_before) / n_before
# Shift the step index by one for the saved / displayed table.
interpolation_change["interpolation_step"] = interpolation_change["interpolation_step"] + 1
interpolation_change.to_csv("results/synthetic_changes_by_interpolation_step.csv", index=False)
interpolation_change.head()

Unnamed: 0,ds,class,interpolation_step,n_images_synthetic,n_images_synthetic_cleaned,abs_change,rel_change
0,imagenette,n01440764,1,122,95,-27,-0.221311
1,imagenette,n01440764,2,119,99,-20,-0.168067
2,imagenette,n01440764,3,121,95,-26,-0.214876
3,imagenette,n01440764,4,118,88,-30,-0.254237
4,imagenette,n01440764,5,121,94,-27,-0.22314


In [11]:
# Aggregate the per-class step counts up to dataset level.
# Only the count columns are summed: adding up the string `class` labels or
# the per-class `rel_change` ratios would be meaningless, so they are left
# out up front instead of being summed and then dropped or overwritten.
count_cols = ["n_images_synthetic", "n_images_synthetic_cleaned", "abs_change"]
interpolation_change_by_ds = (
    interpolation_change
    .groupby(["ds", "interpolation_step"])[count_cols]
    .sum()
    .reset_index()
)
# Relative change is recomputed from the summed counts.
interpolation_change_by_ds['rel_change'] = interpolation_change_by_ds['abs_change'] / interpolation_change_by_ds['n_images_synthetic']
interpolation_change_by_ds.to_csv("results/synthetic_changes_by_interpolation_step_by_ds.csv", index=False)
interpolation_change_by_ds.head()

Unnamed: 0,ds,interpolation_step,n_images_synthetic,n_images_synthetic_cleaned,abs_change,rel_change
0,imagenette,1,1190,1093,-97,-0.081513
1,imagenette,2,1189,1087,-102,-0.085786
2,imagenette,3,1174,1069,-105,-0.089438
3,imagenette,4,1178,1032,-146,-0.123939
4,imagenette,5,1166,1014,-152,-0.13036
