# Exploratory Data Analysis (EDA)

In [None]:
from pathlib import Path
import csv
import pandas as pd

## Dataset

Manually create a `paths.csv` file in the `data/` folder (relative to this notebook). Each line contains a dataset name and the absolute path to that dataset, separated by a semicolon:\
DatasetName;PathToDataset

The CSV file is read into a dictionary that maps each dataset name to its path.

In [None]:
# Maps dataset name (first CSV column) to its location as a Path (second column).
paths_to_data = {}

with open('data/paths.csv', newline='') as csvfile:
    # The file is semicolon-separated: DatasetName;PathToDataset
    pathsreader = csv.reader(csvfile, delimiter=';')
    for row in pathsreader:
        if not row:
            # csv.reader yields an empty list for blank lines; skip them
            # instead of failing with an IndexError.
            continue
        if len(row) < 2:
            # Typically caused by using a separator other than ';'.
            raise ValueError(
                f"data/paths.csv, line {pathsreader.line_num}: expected "
                f"'DatasetName;PathToDataset', got {row!r}"
            )
        paths_to_data[row[0]] = Path(row[1])

Set paths

In [None]:
# Root folder of the Cityscapes dataset, looked up by name in paths_to_data.
path_cityscapes = paths_to_data["Cityscapes"]

# Folders of the leftImg8bit, rightImg8bit and disparity packages below the root.
path_cityscapes_leftImg8bit = path_cityscapes.joinpath(
    'leftImg8bit_trainvaltest (11GB)', 'leftImg8bit'
)
path_cityscapes_rightImg8bit = path_cityscapes.joinpath(
    'rightImg8bit_trainvaltest (11GB)', 'rightImg8bit'
)
path_cityscapes_disparity = path_cityscapes.joinpath('disparity_trainvaltest (3.5GB)')

# Split folders, all taken from the leftImg8bit package.
path_train, path_val, path_test = (
    path_cityscapes_leftImg8bit.joinpath(split_name)
    for split_name in ('train', 'val', 'test')
)

In [70]:
def explore_set(path_set):
    """
    Returns a dataframe with the cities and number of images for each city
    for the given dataset split.

    Every direct sub-directory of ``path_set`` is treated as one city, and
    the ``*.png`` files directly inside it are counted (no recursion).

    Args:
        path_set: pathlib.Path to the dataset split folder

    Returns:
        A pandas dataframe with one row per city, sorted by city name,
        and two columns:
        - City
        - Number of images
    """

    # Sort by name so the row order does not depend on the order in which
    # the filesystem happens to list the directories.
    city_dirs = sorted(
        (entry for entry in path_set.iterdir() if entry.is_dir()),
        key=lambda entry: entry.name,
    )

    # Use .name rather than .stem: .stem would cut a directory name at its
    # last dot and the truncated name would no longer match the folder.
    cities = [city_dir.name for city_dir in city_dirs]

    # Count lazily instead of materialising a list of paths per city.
    number_of_images = [
        sum(1 for _ in city_dir.glob('*.png')) for city_dir in city_dirs
    ]

    return pd.DataFrame({'City': cities, 'Number of images': number_of_images})

In [71]:
# Build the per-city image count table for each split (train, val, test).
df_num_cities_train, df_num_cities_val, df_num_cities_test = (
    explore_set(split_path)
    for split_path in (path_train, path_val, path_test)
)

In [73]:
# Sum the per-city counts to get the total number of images per split.
print('Total number of images:')
for split_label, split_df in (
    ("Training set: ", df_num_cities_train),
    ("Validation set: ", df_num_cities_val),
    ("Test set: ", df_num_cities_test),
):
    print(split_label, split_df['Number of images'].sum())

Total number of images:
Training set:  2975
Validation set:  500
Test set:  1525


In [77]:
# Non-null entries per column of the test-split table; with no missing values
# this equals the number of rows, i.e. the number of city folders found.
df_num_cities_test.count()

City                6
Number of images    6
dtype: int64

In [79]:
import tabulate

In [92]:
# Render the train, val and test tables as GitHub-flavoured markdown,
# separated by a blank line (DataFrame.to_markdown relies on tabulate).
print(
    *(
        split_df.to_markdown(index=False, tablefmt="github")
        for split_df in (df_num_cities_train, df_num_cities_val, df_num_cities_test)
    ),
    sep='\n\n',
)

| City            |   Number of images |
|-----------------|--------------------|
| aachen          |                174 |
| bochum          |                 96 |
| bremen          |                316 |
| cologne         |                154 |
| darmstadt       |                 85 |
| dusseldorf      |                221 |
| erfurt          |                109 |
| hamburg         |                248 |
| hanover         |                196 |
| jena            |                119 |
| krefeld         |                 99 |
| monchengladbach |                 94 |
| strasbourg      |                365 |
| stuttgart       |                196 |
| tubingen        |                144 |
| ulm             |                 95 |
| weimar          |                142 |
| zurich          |                122 |

| City      |   Number of images |
|-----------|--------------------|
| frankfurt |                267 |
| lindau    |                 59 |
| munster   |                174 |

| C