### Running on Python 3.X

In [None]:
from gtsrb_loader.get_folderpath import get_folderpath
from gtsrb_loader.load_data import load_data

import numpy as np
import pandas as pd
import os
from fnmatch import fnmatch

# from cs231n
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from __future__ import print_function

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

Download GTSRB and place it at the same level as your git repo; the loader will download it from the website if necessary.

In [None]:
# Ask the loader for the folder of the 'train' subset, requesting the original images.
# NOTE(review): the notebook text says the data is downloaded if necessary -
# presumably this call triggers that; confirm in gtsrb_loader.get_folderpath.
absolute_path = get_folderpath(subset='train', original_images=True)

In [None]:
# Load the training set only when it is not already in the notebook namespace,
# so that re-running this cell does not repeat the load.
# Both names are checked: a namespace holding only one of them is incomplete.
if 'X_train' not in locals() or 'y_train' not in locals():
    images, labels = load_data(absolute_path)
    # Convert through temporaries and bind both public names in one statement.
    # If a conversion fails or is interrupted, X_train is never left bound to
    # the raw list, which would make the guard above skip reloading next time.
    X_train, y_train = np.array(images), np.array(labels).astype(int)
    # Drop the temporaries so the raw containers are not kept alive.
    del images, labels

In [None]:
# Display the shapes of the image array and the label array as a tuple,
# as a quick sanity check of what was loaded.
X_train.shape, y_train.shape

Plot a high resolution image of each traffic sign class

In [None]:
# Number of distinct class labels present in y_train.
num_classes = np.unique(y_train).size

In [None]:
# Show one example image per traffic-sign class in a near-square grid.
# Iterate over the labels actually present in y_train, so a missing label
# cannot produce an empty selection (and an IndexError on [-1]).
class_labels = np.unique(y_train)
# Size the grid from the number of classes instead of hard-coding 7x7, which
# only fits up to 49 classes (e.g. 43 classes still gives a 7x7 grid).
n_cols = max(1, int(np.ceil(np.sqrt(class_labels.size))))
n_rows = max(1, int(np.ceil(class_labels.size / float(n_cols))))
for position, label in enumerate(class_labels, start=1):
    plt.subplot(n_rows, n_cols, position)
    plt.axis('off')
    pics_of_class = X_train[y_train == label]
    # Use the last image of the class. To show a random one instead:
    #   pic = pics_of_class[np.random.choice(range(pics_of_class.shape[0]), 1)[0]]
    pic = pics_of_class[-1]
    plt.imshow(pic.astype('uint8'))
plt.show()

Remember that the dataset includes 30 images of each real-life traffic sign, taken from approaching perspectives.
Each of the 30 pictures also has a different size in pixels; that is why the quality increases as we approach the last images.

In [None]:
# Read the semicolon-separated annotation file of class folder '00000' and
# derive the region-of-interest rectangle (top-left corner plus extent)
# for every row.
roi_csv_path = os.path.join(absolute_path, '00000', 'GT-00000.csv')
df = pd.read_csv(roi_csv_path, sep=';')
x, y = df['Roi.X1'], df['Roi.Y1']
width = df['Roi.X2'] - x
height = df['Roi.Y2'] - y

In [None]:
# Draw the first 30 training images in a 5x6 grid, each with its ROI outlined.
# NOTE(review): assumes X_train[k] corresponds to row k of the annotation
# frame read above - confirm against load_data's ordering.
for idx in range(30):
    ax = plt.subplot(5, 6, idx + 1)
    ax.axis('off')
    ax.imshow(X_train[idx].astype('uint8'))
    # Unfilled yellow rectangle marking the annotated region of interest.
    roi_box = patches.Rectangle(
        (x[idx], y[idx]),
        width[idx],
        height[idx],
        linewidth=1,
        edgecolor='y',
        facecolor='none',
    )
    ax.add_patch(roi_box)
plt.show()

We will resize all pictures to an appropriate uniform size to feed them to the CNN. Let's inspect the current size of our images.

In [None]:
# Read every '*.csv' file found anywhere below the training folder
# (semicolon-separated) and stack them into a single frame.
csv_list = [
    pd.read_csv(os.path.join(folder, name), sep=';')
    for folder, _subdirs, names in os.walk(absolute_path)
    for name in names
    if fnmatch(name, '*.csv')
]
df = pd.concat(csv_list)
# Preview the first four rows.
df.head(4)

In [None]:
# Histogram (100 bins) of the 'Width' and 'Height' columns, one subplot each.
ax_list = df[['Width', 'Height']].hist(bins=100)
left_ax, right_ax = ax_list[0, 0], ax_list[0, 1]
# Zoom both subplots in to the 20-70 range on the x axis.
zoom_range = (20, 70)
left_ax.set_xlim(*zoom_range)
right_ax.set_xlim(*zoom_range)

We must resize the images to a uniform size to push them through a CNN. Also, to preserve most of the information in our camera images when downsizing from their original size, we chose to resize to 64x64, whose side length is a power of two ($2^n$), which seems to be computationally beneficial.

64x64 means we are actually upsizing most of our training images from GTSRB, as you see the average size is around 40x40px.

### Plot a histogram of the number of images in each class

In [None]:
# Count the images per class once and reuse the result for the plot and the table.
class_counts = np.bincount(y_train)
# Derive the bar positions from the counts themselves so that x and height
# always have the same length (bincount returns max(label) + 1 entries).
plt.bar(np.arange(class_counts.size), height=class_counts)
plt.title('Images per class')
plt.xlabel('Class id')
plt.ylabel('Number of images')
plt.show()
# Also display the raw counts as the cell output.
class_counts

As we can see, some traffic signs are heavily overrepresented compared to others.