### Running on Python 3.X

In [None]:
from website_load_module.readTrafficSigns import readTrafficSigns
import numpy as np
import pandas as pd

# from cs231n
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from __future__ import print_function

# to import GTSRB data
import os
import math
import zipfile
import urllib.request
import sys

# This is a bit of magic to make matplotlib figures appear inline in the notebook
# rather than in a new window.
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # set default size of plots
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'

# Some more magic so that the notebook will reload external python modules;
# see http://stackoverflow.com/questions/1907993/autoreload-of-modules-in-ipython
%load_ext autoreload
%autoreload 2

Import the GTSRB dataset. It is expected to sit at the same directory level as your git repo; if it is not found there, it will be downloaded from the benchmark website.

In [None]:
# Convention: we store our GTSRB folder at the same height as the ML repo,
# i.e. two directory levels above the notebook's working directory.
absolute_path = os.path.abspath(os.path.join(os.path.realpath('.'), '..', '..', 'GTSRB', 'Final_Training', 'Images'))
if not os.path.isdir(absolute_path):
    print("You do not have the GTSRB dataset in the desired location, downloading if for you")
    folder_path = os.path.join(os.path.realpath('.'),'../../')
    filepath = os.path.join(folder_path,'GTSRB_Final_Training_Images.zip')
    urlfile = 'http://benchmark.ini.rub.de/Dataset/GTSRB_Final_Training_Images.zip'
    CHUNK = 256 * 10240  # number of bytes requested per read
    try:
        # The context managers close the HTTP response and the archive file
        # even if the transfer is interrupted.
        with urllib.request.urlopen(urlfile) as req, open(filepath, 'wb') as fp:
            # The server may omit Content-Length; then no percentage can be
            # computed and the raw byte count is reported instead.
            content_length = req.getheader('Content-Length')
            total_size = int(content_length.strip()) if content_length else 0
            downloaded = 0
            while True:
                chunk = req.read(CHUNK)
                if not chunk:
                    break
                fp.write(chunk)
                downloaded += len(chunk)
                if total_size > 0:
                    sys.stdout.write('\r'+'Downloaded '+str(math.floor( (downloaded / total_size) * 100 ))+'%')
                else:
                    sys.stdout.write('\r'+'Downloaded '+str(downloaded)+' bytes')
        sys.stdout.write('\n')
        # Unpack next to the repo; the with-block closes the archive even
        # when extraction fails.
        with zipfile.ZipFile(filepath, 'r') as zip_ref:
            zip_ref.extractall(folder_path)
    finally:
        # Remove the archive in every case so that a partial download is not
        # left behind on disk.
        if os.path.exists(filepath):
            os.remove(filepath)
    print("Extracted GTSRB dataset for you.")
else:
    print("GTSRB dataset is in place, you're fine")

In [None]:
# Read the training images and their class labels from the dataset folder.
# NOTE(review): assumes readTrafficSigns returns two equally long sequences
# (images, labels) - confirm against website_load_module.
images, labels = readTrafficSigns(absolute_path)

# The pictures differ in size, so they cannot form a regular N-d array.
# np.array() on such ragged input raises a ValueError on recent NumPy
# releases, therefore the 1-D object array is filled element by element.
X_train = np.empty(len(images), dtype=object)
for idx, image in enumerate(images):
    X_train[idx] = np.asarray(image)

# Class labels are converted to integers for indexing and comparisons.
y_train = np.array(labels).astype(int)

In [None]:
# Sanity check: display the shapes of the image array and the label array.
X_train.shape, y_train.shape

 Plot a random image of each traffic sign class

In [None]:
# Number of distinct class labels present in the training labels.
num_classes = np.unique(y_train).size

In [None]:
# Draw one randomly chosen example image per traffic sign class.
# The grid side is derived from the number of classes so that every class
# gets a subplot slot (for 43 classes this is the 7x7 grid used before).
grid_side = int(np.ceil(np.sqrt(num_classes)))
for i in range(num_classes):
    plt.subplot(grid_side, grid_side, i+1)
    plt.axis('off')
    # Pick a random position among the images labelled i instead of copying
    # all images of the class first.
    class_indices = np.flatnonzero(y_train == i)
    pic = X_train[np.random.choice(class_indices)]
    plt.imshow(pic.astype('uint8'))
plt.show()

Remember that the dataset includes 30 images of each real-life traffic sign, taken from successively closer perspectives as the sign is approached. 
Each of the 30 pictures also has a different size in pixels; notice that this is why the quality increases towards the last images.

In [None]:
# Annotation file of class folder '00000'; fields are separated by ';'.
annotation_csv = os.path.join(absolute_path, '00000', 'GT-00000.csv')
df = pd.read_csv(annotation_csv, sep=';')

# Upper-left corner of the region of interest (ROI) of every image ...
x, y = df['Roi.X1'], df['Roi.Y1']
# ... and the ROI extent, computed from the opposite corner.
width = df['Roi.X2'] - x
height = df['Roi.Y2'] - y

In [None]:
# Show the first 30 images in a 5x6 grid and outline each image's region of
# interest.
# NOTE(review): assumes the first 30 entries of X_train match the first 30
# rows of the annotation table loaded above - confirm the loading order.
for idx in range(30):
    axes = plt.subplot(5, 6, idx + 1)
    plt.axis('off')
    plt.imshow(X_train[idx].astype('uint8'))
    # Yellow, unfilled rectangle anchored at the ROI's upper-left corner.
    roi_box = patches.Rectangle(
        (x[idx], y[idx]),
        width[idx],
        height[idx],
        linewidth=1,
        edgecolor='y',
        facecolor='none',
    )
    axes.add_patch(roi_box)
plt.show()

We will resize all pictures to an appropriate uniform size to feed them to the CNN. Let's inspect the current size of our images. 

In [None]:
import pandas as pd

# Table read from a ';'-separated CSV in the working directory.
# NOTE(review): how 'generated_concat_csv.csv' is produced is not shown
# here - presumably a concatenation of the per-class annotation files.
df = pd.read_csv('generated_concat_csv.csv', sep=';')

# Preview the first four rows.
df.head(4)

In [None]:
# One 100-bin histogram per column for the image widths and heights.
size_columns = ['Width', 'Height']
ax_list = df[size_columns].hist(bins=100)

# Zoom both histograms in on the same pixel range.
zoom_range = (20, 70)
ax_list[0][0].set_xlim(*zoom_range)
ax_list[0][1].set_xlim(*zoom_range)

By resizing to 32x32 we would lie roughly in the middle of the size spectrum and keep a side length of the form $2^n$, which seems to be computationally beneficial.

### Plot a histogram of the number of images in each class

In [None]:
# Count the images of every class exactly and draw one bar per class.
# Automatically chosen histogram bins do not line up with the integer class
# ids, so a single bin could merge neighbouring classes and misstate the
# per-class counts.
class_ids, class_counts = np.unique(y_train, return_counts=True)
plt.bar(class_ids, class_counts)
plt.title('Images per class')
plt.xlabel('Class id')
plt.ylabel('Number of images')
plt.show()

As we see, some traffic signs are heavily overrepresented compared to others.