# Data Preparation
This notebook loads the data from the `datasets` folder. The datasets used for detection and recognition are the German Traffic Sign Detection Benchmark (GTSDB) and the German Traffic Sign Recognition Benchmark (GTSRB), both used in the online competition held by the IEEE International Joint Conference on Neural Networks (IJCNN).

## Recognition Dataset

In [7]:
import numpy
import pandas as pd
import cv2
from skimage import io,color, exposure, transform
import os
import random

### Helper Functions

#### Function for loading images from folder recursively
Images are cropped according to their bounding boxes and resized to 32x32 so that all images have a uniform size. The function is recursive, so it descends into every subdirectory and loads the images found there. The bounding boxes are provided by the dataset.

In [8]:
# Shared loader state. `images` accumulates the processed crops and `row` is the
# index of the next annotation row to consume; callers reset both between runs.
images = []
row=0

def load_images_from_folder(folder,df):
    """Recursively load, crop and resize every ``.ppm`` image under ``folder``.

    Each image is cropped with the bounding box found in row ``row`` of ``df``
    (``row`` is the module-level counter, advanced by one per image), resized
    to 32x32 and appended to the module-level ``images`` list.

    Parameters
    ----------
    folder : str
        Directory to walk; sub-directories are visited recursively.
    df : pandas.DataFrame
        Annotation table read positionally via ``crop_the_image``.
        Assumes its rows are listed in sorted directory/filename order --
        TODO confirm against the CSV files.

    Returns
    -------
    list
        The module-level ``images`` list itself (not a copy).
    """
    global row
    # os.listdir() returns entries in arbitrary order. Images are paired with
    # annotation rows purely by the running `row` counter, so the traversal
    # must be deterministic; sorting makes it so.
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder,filename)
        if os.path.isdir(path):
            load_images_from_folder(path,df)
        elif filename[-3:] == 'ppm':
            # NOTE(review): newer scikit-image releases spell this keyword
            # `as_gray` -- confirm against the installed version.
            img = io.imread(path,as_grey=False)
            if img is not None:
                cropped_image = crop_the_image(img,df,row)
                resized_cropped_image = cv2.resize(cropped_image,(32,32))
                images.append(resized_cropped_image)
                row=row+1

                # Report progress only right after an image was added, so the
                # same count is never printed again for non-image entries.
                if len(images)%1000==0:
                    print(str(len(images))+' images loaded.')

    return images

#### Function for cropping the images according to the bounding boxes

**Roi.X1** is the lower x coordinate.

**Roi.X2** is the higher x coordinate

**Roi.Y1** is the lower y coordinate

**Roi.Y2** is the higher y coordinate

In [9]:
def crop_the_image(image,df,i):
    """Return the part of ``image`` inside the bounding box stored in ``df``.

    Row ``i`` of ``df`` supplies the box: ``Roi.X1``/``Roi.X2`` bound the
    columns and ``Roi.Y1``/``Roi.Y2`` bound the rows. The upper bounds are
    exclusive, as in any slice.
    """
    box = df.loc[i]
    left, right = box['Roi.X1'], box['Roi.X2']
    top, bottom = box['Roi.Y1'], box['Roi.Y2']
    # Arrays are indexed [row, column], i.e. [y, x].
    return image[top:bottom, left:right]

#### Function for getting dataframe of bounding boxes

The bounding boxes for each sign class are specified in a separate CSV file. Hence, all of them are loaded and concatenated into a single, large dataframe called **df**.

In [10]:

def get_df_from_csv(train=True,detect=False):
    """Load the bounding-box annotations for one of the datasets.

    Parameters
    ----------
    train : bool
        Only used when ``detect`` is false. True loads the recognition
        training annotations, False loads the recognition test annotations.
    detect : bool
        True loads the detection annotations and ignores ``train``.

    Returns
    -------
    pandas.DataFrame
        For recognition training, the per-class CSV files of class folders
        00000 through 00042 concatenated in class order with a fresh
        0..n-1 index. Otherwise the single CSV file as read.
    """
    if detect:
        return pd.read_csv('datasets/FullIJCNN2013/gt_csv.csv')
    if not train:
        return pd.read_csv('datasets/GTSRB_Test/Final_Test/Images/GT-final_test.test.csv')

    # One annotation file per class folder. Collect every frame first and
    # concatenate once, instead of re-copying the growing frame per class.
    class_frames = []
    for class_id in range(43):
        class_dir = '%05d' % class_id
        class_frames.append(pd.read_csv(
            'datasets/GTSRB/Final_Training/Images/' + class_dir + '/GT-' + class_dir + '.csv'))
    return pd.concat(class_frames,ignore_index=True)

In [11]:
# Start from clean loader state so that re-running this cell (for example after
# an interrupted run) cannot pair leftover images or a stale row counter with
# the wrong annotation rows. A module-level `global` statement has no effect,
# so the shared names are simply reset here.
del images[:]
row = 0

bounding_box_df = get_df_from_csv()
print("Got CSV files")
d = load_images_from_folder('datasets/GTSRB',bounding_box_df)
# numpy.array copies the images, so clearing the shared list below is safe.
data = numpy.array(d)
labels = numpy.array(bounding_box_df['ClassId'])
# Images and labels are matched purely by position; fail loudly on a mismatch
# instead of silently saving misaligned training data.
assert len(data) == len(labels), \
    'loaded %d training images for %d annotation rows' % (len(data), len(labels))
print('')
print("Finished loading training data.")
print("==============================================")

# Reset the shared state again before the test set is loaded.
del images[:]
row = 0

test_bounding_box_df = get_df_from_csv(train=False)
test_d = load_images_from_folder('datasets/GTSRB_Test',test_bounding_box_df)
test_data = numpy.array(test_d)
assert len(test_data) == len(test_bounding_box_df), \
    'loaded %d test images for %d annotation rows' % (len(test_data), len(test_bounding_box_df))

print('')
print("Finished loading testing data.")


Got CSV files
1000 images loaded.
2000 images loaded.
3000 images loaded.
4000 images loaded.
5000 images loaded.
6000 images loaded.
7000 images loaded.
8000 images loaded.
9000 images loaded.
10000 images loaded.
11000 images loaded.
12000 images loaded.
13000 images loaded.
14000 images loaded.
15000 images loaded.
16000 images loaded.
17000 images loaded.
18000 images loaded.
19000 images loaded.
20000 images loaded.
21000 images loaded.
22000 images loaded.
23000 images loaded.
24000 images loaded.
25000 images loaded.
26000 images loaded.
27000 images loaded.
28000 images loaded.
29000 images loaded.
30000 images loaded.
31000 images loaded.
32000 images loaded.
33000 images loaded.
34000 images loaded.
35000 images loaded.
36000 images loaded.
37000 images loaded.
38000 images loaded.
39000 images loaded.

Finished loading training data.
1000 images loaded.
2000 images loaded.
3000 images loaded.
4000 images loaded.
5000 images loaded.
6000 images loaded.
7000 images loaded.
800

In [12]:
# Persist the recognition arrays, one .npy file each, under storage/.
numpy.save(file='storage/data_file.npy', arr=data)            # training images
numpy.save(file='storage/labels_file.npy', arr=labels)        # training class ids
numpy.save(file='storage/test_data_file.npy', arr=test_data)  # test images

print("Saved Data Successfully")

Saved Data Successfully


## Detection Dataset

### Helper Functions

#### Function for loading images from folder recursively

The images are loaded recursively from the dataset folder. They are also cropped, and positive and negative samples are extracted from them at the same time. As some images contain more than one traffic sign, a while loop is used to ensure that all the signs are cropped and saved as positive samples.

In [13]:
# Shared loader state for the detection set: `images` collects the samples,
# `sign_labels` the matching 1 (sign) / 0 (background) labels, and `row` is the
# index of the next annotation row to consume.
images = []
row=0
sign_labels = []

def load_detect_images_from_folder(folder,df):
    """Recursively extract positive and negative samples from ``folder``.

    For every ``.ppm`` file whose name equals the ``Filename`` of the current
    annotation row (module-level ``row``), all consecutive rows carrying that
    filename are consumed. Each row contributes one positive sample (the
    bounding-box crop resized to 32x32, label 1) followed by the patches
    returned by ``negative_examples`` (label 0 each).

    Parameters
    ----------
    folder : str
        Directory to walk; sub-directories are visited recursively.
    df : pandas.DataFrame
        Annotation table with a ``Filename`` column and the ``Roi.*`` columns.
        Assumes a default 0..n-1 index and rows grouped by filename in sorted
        filename order -- TODO confirm against the CSV.

    Returns
    -------
    tuple
        ``(images, sign_labels)``: the module-level lists themselves.
    """
    global row

    # Annotation rows are consumed strictly in sequence, so the files must be
    # visited in a deterministic order; os.listdir() alone guarantees none.
    for filename in sorted(os.listdir(folder)):
        path = os.path.join(folder,filename)
        if os.path.isdir(path):
            # Recurse with this loader. The recognition loader would append
            # images without labels and advance `row` on its own.
            # NOTE(review): files are matched by bare name, so a file in a
            # sub-folder sharing its name with an annotated image would be
            # treated as that image -- confirm the dataset layout.
            load_detect_images_from_folder(path,df)
            continue
        if filename[-3:] != 'ppm':
            continue
        # Skip files that are not the next annotated image (or all rows used).
        if row >= len(df) or filename != df.loc[row]['Filename']:
            continue

        img = io.imread(path,as_grey=False)
        if img is None:
            continue

        # One pass per annotated sign in this image. Bounding the loop by
        # len(df) also covers the final row and never reads past the table.
        while row < len(df) and df.loc[row]['Filename'] == filename:
            cropped_image = positive_example(img,df,row)
            resized_cropped_image = cv2.resize(cropped_image,(32,32))
            images.append(resized_cropped_image)
            sign_labels.append(1)

            negative_images = negative_examples(img,df,row)
            images.extend(negative_images)
            # One label per patch actually returned keeps both lists aligned.
            sign_labels.extend([0]*len(negative_images))
            row=row+1

    return (images,sign_labels)

#### Functions for generating positive and negative examples

Positive samples are extracted by simply cropping the image according to the bounding boxes. Negative samples are obtained differently: random numbers within the image size limits are generated by the **generate(limit,low,high)** function, taking care to avoid values that fall inside the bounding box's coordinate range. These random numbers are the coordinates at which random patches of the image are cropped to be used as negative samples.

In [14]:
def positive_example(image,df,i):
    """Crop the traffic sign described by row ``i`` of ``df`` out of ``image``.

    The ``Roi.Y1``/``Roi.Y2`` values select the rows and ``Roi.X1``/``Roi.X2``
    the columns; the upper bounds are exclusive.
    """
    roi = df.loc[i]
    x_lo, y_lo = roi['Roi.X1'], roi['Roi.Y1']
    x_hi, y_hi = roi['Roi.X2'], roi['Roi.Y2']
    rows = slice(y_lo, y_hi)
    cols = slice(x_lo, x_hi)
    return image[rows, cols]

def negative_examples(image,df,i):
    """Cut random background patches out of ``image`` for annotation row ``i``.

    Start coordinates come from ``generate``: x starts are drawn for the image
    width against ``Roi.X1``/``Roi.X2``, y starts for the image height against
    ``Roi.Y1``/``Roi.Y2``. Each (x, y) pair is the top-left corner of a slice
    spanning 32 rows and 32 columns.

    Returns a list with one patch per generated coordinate pair.

    NOTE(review): only the patch's start coordinates are kept outside the
    ROI's coordinate ranges; a patch starting just before the ROI can still
    overlap the sign.
    """
    roi = df.loc[i]
    left, top = roi['Roi.X1'], roi['Roi.Y1']
    right, bottom = roi['Roi.X2'], roi['Roi.Y2']
    height, width = image.shape[0], image.shape[1]
    # x starts are drawn before y starts, as the order fixes the random sequence.
    col_starts = generate(width, left, right)
    row_starts = generate(height, top, bottom)
    return [image[r:r+32, c:c+32] for c, r in zip(col_starts, row_starts)]

def generate(limit,low,high,count=6,patch=32):
    """Draw random patch start coordinates that avoid the range [low, high].

    Parameters
    ----------
    limit : int
        Size of the image along this axis.
    low, high : int
        Inclusive coordinate range to avoid (the bounding box on this axis).
    count : int
        How many coordinates to draw (default 6).
    patch : int
        Patch size; starts are drawn from ``[0, limit - patch]`` so that a
        patch of this size fits inside the image (default 32).

    Returns
    -------
    list of int
        ``count`` values, each in ``[0, limit - patch]`` and outside
        ``[low, high]``.

    Raises
    ------
    ValueError
        If the axis is shorter than ``patch`` or if every possible start lies
        inside ``[low, high]``; rejection sampling would otherwise never end.
    """
    max_start = limit - patch
    if max_start < 0:
        raise ValueError('axis of size %d cannot hold a patch of size %d' % (limit, patch))
    if low <= 0 and high >= max_start:
        raise ValueError('range [%d, %d] covers every start in [0, %d]' % (low, high, max_start))

    numbers = []
    for _ in range(count):
        # Rejection sampling: redraw until the value is outside [low, high].
        i = random.randint(0,max_start)
        while low <= i <= high:
            i = random.randint(0,max_start)
        numbers.append(i)
    return numbers

In [15]:
# Reset the detection loader's shared state so that re-running this cell on
# its own does not append a second copy of every sample or resume from a
# stale annotation row.
del images[:]
del sign_labels[:]
row = 0

bounding_box_df = get_df_from_csv(detect=True)
print("Got CSV files")
d,labels = load_detect_images_from_folder('datasets/FullIJCNN2013',bounding_box_df)
data = numpy.array(d)
# Samples and labels are paired by position; refuse to save misaligned data.
assert len(data) == len(labels), \
    'extracted %d samples but %d labels' % (len(data), len(labels))
classes = numpy.array(bounding_box_df['ClassId'])

print("Finished loading detection data.")

numpy.save('storage/detect_data_file.npy',data)
numpy.save('storage/detect_labels_file.npy',labels)
numpy.save('storage/detect_classes_file.npy',classes)


print("Saved Data Successfully")

Got CSV files
Finished loading detection data.
Saved Data Successfully
