<a href="https://colab.research.google.com/github/tortoisehare/TSR-GAN/blob/master/TSRGAN_dataprep.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Author: Stephanie Tietz

Collection of code used to import German TSR dataset (42 classes) as well as a CIFAR10 (10 classes) batch, combine data using a label of '43' for all CIFAR10 images, and return smaller dataset with only 3 classes

Purpose: Preliminary classification on a smaller number of classes (30km/hr and 80km/hr signs) and including a class of "not a traffic sign" which is made up of random CIFAR10 images

In [7]:
from google.colab import files

uploaded = files.upload()


Saving test.p to test.p
Saving train.p to train.p
Saving data_batch_1.p to data_batch_1.p


In [0]:
import pandas as pd
import numpy as np
#import torch
#import torch.nn as nn

In [0]:
# Load pickled traffic sign data
import pickle


training_file = 'train.p'
testing_file = 'test.p'

with open(training_file, mode='rb') as f:
    tstrain = pickle.load(f, encoding='bytes')
with open(testing_file, mode='rb') as f:
    tstest = pickle.load(f, encoding='bytes')


In [10]:
#convert to consolidated numpy array for later
#NOTE: pickled version of images have shape (n, 32, 32, 3)
#images denoted by dictionary name 'features'

traindata = np.array(tstrain['features'])
#print(traindata.shape)
traindata = traindata.T.reshape(39209,32*32*3)
print(traindata.shape)

testdata = np.array(tstest['features'])
#print(testdata.shape)
testdata = testdata.T.reshape(12630,32*32*3)
#print(testdata.shape)

#pickled version of labels have shape (n,), dict name 'labels'
trainlabels = np.array(tstrain['labels'])
trainlabels = trainlabels.reshape(39209,1)
#print(trainlabels.shape)
testlabels = np.array(tstest['labels'])
testlabels = testlabels.reshape(12630,1)
print(testlabels.shape)
print(testlabels)

#add labels as column to end of data, check shape
tsrtrain = np.concatenate((traindata,trainlabels),axis=1) #39209x3073
tsrtest = np.concatenate((testdata,testlabels),axis=1) #12630x3073
print(tsrtest.shape)

(39209, 3072)
(12630, 1)
[[16]
 [ 1]
 [38]
 ...
 [ 6]
 [ 7]
 [10]]
(12630, 3073)


In [0]:
#load sign names csv

#url = 'https://raw.githubusercontent.com/tortoisehare/TSR-GAN/master/signnames.csv'
#dfs = pd.read_csv(url)

import csv

sign_names = []
with open('signnames.csv', 'r') as f:
    reader = csv.reader(f)
    for row in reader:
        sign_names.append(row[1])
    sign_names.reverse()
    sign_names.pop()
    sign_names.reverse()

In [14]:
#load CIFAR10 dataset (one of the test minibatches, 10,000 random images)
cifarfile = 'data_batch_1.p'

with open(cifarfile, mode='rb') as fo:
  cifar = pickle.load(fo,encoding='bytes')

#data will be 10000x3072 numpy array, each row storing 32x32 colour image
#labels will be a list of 10,000 numbers in range 0-9

#make all labels 43
labels = np.full((10000,1), 43)
#print(labels)


#pull out numpy array of images
images = np.array(cifar[b'data']) #dict name has b due to bytes-encoding
#print(images)

#add labels as last column to dataset
cifartot = np.concatenate((images,labels),axis=1)
#print(cifartot)

#split into train set (800 images) and test set (400 images)
#dataset randomly ordered so will get different classes
#(though maybe not all ten)
cifartrain = cifartot[:800]    #800x3073
cifartest = cifartot[801:1201] #400x3073
print(cifartest.shape)


(400, 3073)


In [15]:
#merge CIFAR10 images with TSR images

traindata = np.concatenate((cifartrain,tsrtrain)) #40009x3073
#print(traindata)
testdata = np.concatenate((cifartest,tsrtest)) #13030x3073
#print(testdata)

#convert numpy arrays to dataframe, add column name to labels
dftrain = pd.DataFrame(traindata,index=None,columns=None)
dftrain.rename(index=str, columns = {3072:"Labels"}, inplace=True)
print(dftrain.shape)
#dftrain.head()

dftest = pd.DataFrame(testdata,index=None,columns=None)
dftest.rename(index=str, columns = {3072:"Labels"}, inplace=True)
print(dftest.shape)
dftest.head()


(40009, 3073)
(13030, 3073)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3063,3064,3065,3066,3067,3068,3069,3070,3071,Labels
0,223,255,255,240,216,178,133,149,175,151,...,10,11,12,11,12,13,12,13,14,43
1,233,222,214,212,221,223,208,211,206,203,...,146,151,144,152,157,145,153,162,157,43
2,219,214,205,198,190,188,196,187,190,187,...,233,239,245,245,245,245,246,247,249,43
3,144,145,147,153,160,166,170,172,174,174,...,62,62,52,44,54,47,45,50,42,43
4,108,128,123,114,119,120,120,120,123,130,...,77,71,64,57,56,61,79,90,63,43


In [23]:
#only want class 1, 5, and 43 for the first attempt
smalltrain = dftrain.loc[(dftrain["Labels"]==1)|(dftrain["Labels"]==5)|(dftrain["Labels"]==43)]
smalltest = dftest.loc[(dftest["Labels"]==1)|(dftest["Labels"]==5)|(dftest["Labels"]==43)]
print(smalltrain.iloc[790:810])

#convert to 2D numpy array for use in classifiers
smalltrain = smalltrain.to_numpy()
smalltest = smalltest.to_numpy()
print(smalltrain.shape)
print(smalltest.shape)


        0    1    2    3    4    5    6    7    8    9  ...  3063  3064  3065  \
790   255  251  251  251  252  253  252  251  252  252  ...   253   253   253   
791   104   86  116  153  174  194  149  121  214  244  ...    91    94    95   
792    55   49   44   41   37   33   43   44   36   31  ...   109   155   155   
793   141  141  142  142  145  147  147  147  147  147  ...   151   150   148   
794   231  236  233  237  237  237  237  244  228  208  ...    74    73    79   
795   114  123  133  141  148  153  159  158  160  160  ...    74    72    68   
796    75   78   80   81   82   82   83   82   82   81  ...   236   237   237   
797   102  101  101  103  106  108  111  112  114  117  ...   211   105    29   
798    61   82  113  122  125  132  141  149  157  160  ...   136   138   135   
799    88   90   89   90   92   96  100   95   98  102  ...    46    47    51   
1010  255  255  255  235   94  106  102   87  127  123  ...    91    82   116   
1011   35   48  204  243   7

In [0]:
#store files in numpy array .npy extension
np.save('smalltrain',smalltrain)
np.save('smalltest',smalltest)

In [0]:
#Reference cell
#example of how to download files to local directory

from google.colab import files
#np.savetxt("traindata.csv", traindata, delimiter=",")
#files.download('traindata.csv')

files.download('smalltrain.npy')
files.download('smalltest.npy')

CIFAR10: 
This tech report (Chapter 3) describes the dataset and the methodology followed when collecting it in much greater detail. Please cite it if you intend to use this dataset. 

Learning Multiple Layers of Features from Tiny Images, Alex Krizhevsky, 2009.