<a href="https://colab.research.google.com/github/tortoisehare/TSR-GAN/blob/master/TSRGAN_dataprep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Author: Stephanie Tietz

Collection of code used to import German TSR dataset (42 classes) as well as a CIFAR10 (10 classes) batch, combine data using a label of '43' for all CIFAR10 images

Output 1: larger dataset with all classes

Output 2: smaller dataset with only 3 classes

Purpose: Preliminary classification will be done on a smaller number of classes (30km/hr and 80km/hr signs) and both sets will include a class of "not a traffic sign" images, which is made up of random CIFAR10 images

In [1]:
from google.colab import files

uploaded = files.upload()


Saving test.p to test.p
Saving train.p to train.p
Saving data_batch_1.p to data_batch_1.p


In [0]:
import pandas as pd
import numpy as np
#import torch
#import torch.nn as nn

In [0]:
# Load pickled traffic sign data
import pickle


training_file = 'train.p'
testing_file = 'test.p'

with open(training_file, mode='rb') as f:
    tstrain = pickle.load(f, encoding='bytes')
with open(testing_file, mode='rb') as f:
    tstest = pickle.load(f, encoding='bytes')


In [6]:
#convert to consolidated numpy array for later
#NOTE: pickled version of images have shape (n, 32, 32, 3)
#images denoted by dictionary name 'features'

traindata = np.array(tstrain['features'])
print(traindata.shape)
traindata = traindata.T.reshape(traindata.shape[0],-1)
print(traindata.shape)

testdata = np.array(tstest['features'])
print(testdata.shape)
testdata = testdata.T.reshape(testdata.shape[0],-1)
print(testdata.shape)

#pickled version of labels have shape (n,), dict name 'labels'
trainlabels = np.array(tstrain['labels'])
trainlabels = trainlabels.reshape(traindata.shape[0],1)
#print(trainlabels.shape)
testlabels = np.array(tstest['labels'])
testlabels = testlabels.reshape(testdata.shape[0],1)
print(testlabels.shape)
print(testlabels)

#add labels as column to end of data, check shape
tsrtrain = np.concatenate((traindata,trainlabels),axis=1) #39209x3073
tsrtest = np.concatenate((testdata,testlabels),axis=1) #12630x3073
print(tsrtest.shape)

(39209, 32, 32, 3)
(39209, 3072)
(12630, 32, 32, 3)
(12630, 3072)
(12630, 1)
[[16]
 [ 1]
 [38]
 ...
 [ 6]
 [ 7]
 [10]]
(12630, 3073)


In [0]:
#load sign names csv

#url = 'https://raw.githubusercontent.com/tortoisehare/TSR-GAN/master/signnames.csv'
#dfs = pd.read_csv(url)

import csv

sign_names = []
with open('signnames.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        sign_names.append(row[1])
    sign_names.reverse()
    sign_names.pop()
    sign_names.reverse()

In [7]:
#load CIFAR10 dataset (one of the test minibatches, 10,000 random images)
cifarfile = 'data_batch_1.p'

with open(cifarfile, mode='rb') as fo:
  cifar = pickle.load(fo,encoding='bytes')

#data will be 10000x3072 numpy array, each row storing 32x32 colour image
#labels will be a list of 10,000 numbers in range 0-9

#make all labels 43
labels = np.full((10000,1), 43)
#print(labels)


#pull out numpy array of images
images = np.array(cifar[b'data']) #dict name has b due to bytes-encoding
#print(images)

#add labels as last column to dataset
cifartot = np.concatenate((images,labels),axis=1)
#print(cifartot)

#split into train set (1900 images) and test set (700 images)
#dataset randomly ordered so will get different classes
#(though maybe not all ten)
cifartrain = cifartot[:1900]    #1900x3073
cifartest = cifartot[1901:2601] #700x3073
print(cifartest.shape)


(700, 3073)


In [8]:
#merge CIFAR10 images with TSR images

traindata = np.concatenate((cifartrain,tsrtrain)) #41109x3073
#print(traindata)
testdata = np.concatenate((cifartest,tsrtest)) #13330x3073
#print(testdata)

#convert numpy arrays to dataframe, add column name to labels
dftrain = pd.DataFrame(traindata,index=None,columns=None)
dftrain.rename(index=str, columns = {3072:"Labels"}, inplace=True)
print(dftrain.shape)
#dftrain.head()

dftest = pd.DataFrame(testdata,index=None,columns=None)
dftest.rename(index=str, columns = {3072:"Labels"}, inplace=True)
print(dftest.shape)
dftest.head()


(41109, 3073)
(13330, 3073)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3063,3064,3065,3066,3067,3068,3069,3070,3071,Labels
0,214,217,218,205,194,187,185,182,179,177,...,137,136,137,138,137,135,133,134,136,43
1,123,123,126,132,133,133,128,120,118,122,...,55,42,52,60,41,37,42,51,47,43
2,98,98,93,85,74,63,58,59,66,74,...,108,101,96,91,90,95,102,99,94,43
3,89,69,92,116,129,117,109,125,136,140,...,31,32,34,31,26,27,34,33,31,43
4,105,119,133,138,133,119,87,86,97,111,...,89,80,86,88,90,77,92,101,97,43


In [9]:
#How many items in each class?

print("Training set is: ")
print(dftrain.shape)
print(dftrain['Labels'].value_counts())

print("Test set is: ")
print(dftest.shape)
print(dftest['Labels'].value_counts())

Training set is: 
(41109, 3073)
2     2250
1     2220
13    2160
12    2100
38    2070
10    2010
4     1980
43    1900
5     1860
25    1500
9     1470
7     1440
8     1410
3     1410
11    1320
18    1200
35    1200
17    1110
31     780
14     780
33     689
15     630
26     600
28     540
23     510
30     450
34     420
6      420
16     420
22     390
36     390
40     360
20     360
21     330
39     300
24     270
29     270
32     240
42     240
41     240
27     240
37     210
19     210
0      210
Name: Labels, dtype: int64
Test set is: 
(13330, 3073)
2     750
1     720
13    720
43    700
12    690
38    690
10    660
4     660
5     630
9     480
25    480
8     450
3     450
7     450
11    420
35    390
18    390
17    360
14    270
31    270
33    210
15    210
26    180
6     150
16    150
30    150
23    150
28    150
36    120
34    120
22    120
24     90
39     90
42     90
20     90
21     90
29     90
40     90
32     60
27     60
41     60
19     60
37     60

In [10]:
#create larger dataset, store for later
alltrain = dftrain.to_numpy()
alltest = dftest.to_numpy()
print(alltrain.shape) #expect (41109,3073)
print(alltest.shape) #expect (13330,3073)

np.save('alltrain',alltrain)
np.save('alltest',alltest)

(41109, 3073)
(13330, 3073)


In [11]:
#only want class 1, 25, and 43 for the first attempt
#These are 30 km/hr circle, "road work" triangle, and "not a sign"
smalltrain = dftrain.loc[(dftrain["Labels"]==1)|(dftrain["Labels"]==25)|(dftrain["Labels"]==43)]
smalltest = dftest.loc[(dftest["Labels"]==1)|(dftest["Labels"]==25)|(dftest["Labels"]==43)]
#print(smalltrain.iloc[790:810])

smalltrain = smalltrain.replace({'Labels': {1:0, 43:2}}) 
#can't do overlapping indices, need to make 25 -> 1 separate
#print(smalltrain.iloc[1890:1920])
smalltrain = smalltrain.replace({'Labels': 25},1)
print(smalltrain['Labels'].value_counts()) #expect 0 2220, 1 1500, 2 1900


smalltest = smalltest.replace({'Labels': {1:0, 43:2}})
#print(smalltest.iloc[1890:1920])
smalltest = smalltest.replace({'Labels': 25},1)
print(smalltest['Labels'].value_counts()) #expect 0 720, 1 480, 2 700

#convert to 2D numpy array for use in classifiers
smalltrain2 = smalltrain.to_numpy()
smalltest2 = smalltest.to_numpy()
print(smalltrain.shape)
print(smalltest.shape)


0    2220
2    1900
1    1500
Name: Labels, dtype: int64
0    720
2    700
1    480
Name: Labels, dtype: int64
(5620, 3073)
(1900, 3073)


In [0]:
#store files in numpy array .npy extension
np.save('smalltrain2',smalltrain2)
np.save('smalltest2',smalltest2)

In [0]:
#Reference cell
#example of how to download files to local directory

from google.colab import files
#np.savetxt("traindata.csv", traindata, delimiter=",")
#files.download('traindata.csv')

files.download('smalltrain.npy')
files.download('smalltest.npy')

CIFAR10: 
This tech report (Chapter 3) describes the dataset and the methodology followed when collecting it in much greater detail. Please cite it if you intend to use this dataset. 

Learning Multiple Layers of Features from Tiny Images, Alex Krizhevsky, 2009.