In [0]:
import keras
import numpy as np
import pandas as pd
from keras.datasets import cifar10
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers.core import Dense, Flatten,Activation
from keras.layers.convolutional import Conv2D
from keras.optimizers import Adam
from keras.layers.pooling import MaxPooling2D
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from datetime import timedelta
import time
from keras.preprocessing.image import ImageDataGenerator
from keras.layers import Dropout, BatchNormalization
from keras import regularizers
import os
np.random.seed(666)

In [0]:
# Get n randomly selected indices from the data
def getTrainingDataIndicesOfSize(data, size):
    """Draw `size` distinct positions into `data` using the global NumPy RNG.

    Parameters
    ----------
    data : sized sequence
        Only ``len(data)`` is used; the contents are never inspected.
    size : int
        How many indices to draw (sampling is without replacement).

    Returns
    -------
    numpy.ndarray
        `size` unique integers taken from ``range(len(data))``.
    """
    population = np.arange(len(data))
    return np.random.choice(population, size, replace=False)

In [0]:
# Simple function to get a new sample from the given class
def getNewDataOfClass(original_data, class_id):
    """Return the index of one uniformly chosen sample labelled `class_id`.

    The label of sample ``i`` is taken from the first column of
    `original_data` (``original_data[i][0]``); a plain 1-D label array is
    accepted as well.

    Parameters
    ----------
    original_data : array-like
        Labels, one row per sample.
    class_id
        Label value to look for.

    Returns
    -------
    numpy integer
        Index into `original_data` of a sample whose label equals `class_id`.

    Raises
    ------
    ValueError
        If no sample carries `class_id`. Retrying random draws until a match
        turns up would never terminate in that situation, so fail loudly.
    """
    labels = np.asarray(original_data)
    if labels.ndim > 1:
        # Only the first column holds the class label.
        labels = labels[:, 0]

    # Select among the matching samples directly instead of drawing from the
    # whole data set and rejecting misses.
    candidates = np.flatnonzero(labels == class_id)
    if candidates.size == 0:
        raise ValueError("no sample of class %r in original_data" % (class_id,))
    return np.random.choice(candidates)

In [0]:
# Add new samples of each class until the sample-size-per-class target is reached
def fillLowSampleCountClassesTo(original_data, indices, target_count):
    """Top up every under-represented class to `target_count` selected samples.

    Parameters
    ----------
    original_data : array-like
        Labels, one row per sample; the class label is the first column
        (a 1-D label array is accepted as well).
    indices : array-like of int
        Indices into `original_data` that are already selected.
    target_count : int
        Minimum number of selected samples wanted for each class.

    Returns
    -------
    numpy.ndarray
        The incoming `indices` followed by the newly drawn ones. Newly drawn
        indices never repeat an index that was already selected.

    Raises
    ------
    ValueError
        If a class does not have enough unselected samples left to reach
        `target_count`.
    """
    labels = np.asarray(original_data)
    if labels.ndim > 1:
        # Only the first column holds the class label.
        labels = labels[:, 0]
    indices = np.asarray(indices, dtype=int)
    selected_labels = labels[indices]

    additions = []
    # Enumerate classes from the full label set, not from the current
    # selection: a class that is completely missing from `indices` still
    # has to be filled up.
    for class_id in np.unique(labels):
        samples_needed = target_count - np.count_nonzero(selected_labels == class_id)
        if samples_needed <= 0:
            continue
        # Candidates are the class members that are not selected yet.
        available = np.setdiff1d(np.flatnonzero(labels == class_id), indices)
        if available.size < samples_needed:
            raise ValueError(
                "class %r has only %d unselected samples, %d needed"
                % (class_id, available.size, samples_needed))
        additions.append(np.random.choice(available, samples_needed, replace=False))

    if additions:
        indices = np.concatenate([indices] + additions)
    return indices

In [0]:
# Remove samples of each class until the sample-size-per-class target is reached
def deleteSamplesOverPerClassLimit(original_data, indices, target_count):
    """Randomly drop selected samples so no class exceeds `target_count`.

    Parameters
    ----------
    original_data : array-like
        Labels, one row per sample; the class label is the first column
        (a 1-D label array is accepted as well).
    indices : array-like of int
        Indices into `original_data` that are currently selected.
    target_count : int
        Maximum number of selected samples allowed per class.

    Returns
    -------
    numpy.ndarray
        `indices` with the surplus entries removed; the relative order of the
        surviving entries is kept.
    """
    labels = np.asarray(original_data)
    if labels.ndim > 1:
        # Only the first column holds the class label.
        labels = labels[:, 0]
    indices = np.asarray(indices, dtype=int)
    selected_labels = labels[indices]

    surplus_positions = []
    for class_id in np.unique(selected_labels):
        # Positions inside `indices` (not inside the data set) of this class.
        positions = np.flatnonzero(selected_labels == class_id)
        excess = positions.size - target_count
        if excess > 0:
            # Choose all victims of the class in one draw instead of probing
            # random positions until enough of them happen to match.
            surplus_positions.append(np.random.choice(positions, excess, replace=False))

    if surplus_positions:
        indices = np.delete(indices, np.concatenate(surplus_positions))
    return indices

In [0]:
# Get exactly n samples of each class randomly
# 1. step: get n_class times sample_per_class indices (this gives approximately uniform distribution)
# 2-3. step: correct sample-per-class differences by removing excess samples and adding to underrepresented classes
def getRandomSamplesOfEachClassOfSize(data, n_classes, sample_per_class):
    """Build a random index selection with `sample_per_class` as per-class target.

    An initial draw of ``n_classes * sample_per_class`` indices is passed
    through fillLowSampleCountClassesTo and then through
    deleteSamplesOverPerClassLimit, both with `sample_per_class` as target.

    Parameters
    ----------
    data : labels of the data set, handed unchanged to the helper functions.
    n_classes : number of classes, used only to size the initial draw.
    sample_per_class : per-class target passed to both correction steps.

    Returns
    -------
    Whatever deleteSamplesOverPerClassLimit returns: the corrected indices.
    """
    initial_draw_size = n_classes * sample_per_class
    drawn = getTrainingDataIndicesOfSize(data, initial_draw_size)
    topped_up = fillLowSampleCountClassesTo(data, drawn, sample_per_class)
    return deleteSamplesOverPerClassLimit(data, topped_up, sample_per_class)


In [0]:
def classification_report_dataframe(report, index=0):
    """Flatten a textual classification report into a one-row DataFrame.

    Per-class rows of the report are recognised by their structure: exactly
    five whitespace-separated fields, ``label precision recall f1 support``,
    with numeric metrics. Header, blank and summary lines (``accuracy``,
    ``macro avg``, ``avg / total`` ...) have a different field count and are
    skipped, so the result does not depend on column widths or on how many
    footer lines the report happens to have.

    Parameters
    ----------
    report : str
        Report text, e.g. the string returned by
        ``sklearn.metrics.classification_report``.
    index : hashable, default 0
        Index label given to the single output row.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``<label>precision``, ``<label>recall`` and
        ``<label>f1_score`` for every class row, in report order. Integer
        labels are normalised through ``int`` (``"01"`` becomes ``"1"``);
        other labels are used verbatim.
    """
    row = {}

    for line in report.split('\n'):
        fields = line.split()
        if len(fields) != 5:
            continue
        label = fields[0]
        try:
            precision, recall, f1_score = (float(value) for value in fields[1:4])
        except ValueError:
            # Five fields but not numeric metrics: not a per-class row.
            continue
        try:
            class_idx = str(int(label))
        except ValueError:
            class_idx = label
        row[class_idx + 'precision'] = precision
        row[class_idx + 'recall'] = recall
        row[class_idx + 'f1_score'] = f1_score

    return pd.DataFrame(data=row, index=[index])

In [0]:
#An example for a regularization method built in
def getNewDropoutModel():
    """Build and compile a fresh dropout-regularised CNN.

    The network takes 32x32x3 inputs and consists of three stages of two
    3x3 'same'-padded convolutions (32, 64 and 128 filters) with ReLU
    activations, 2x2 max pooling and dropout, followed by a flatten and a
    10-unit softmax layer. It is compiled with categorical cross-entropy,
    Adam (lr=0.0004, decay=1e-4) and the accuracy metric.

    Returns
    -------
    The compiled Sequential model.
    """
    layer_stack = [
        # Stage 1: 32 filters, light dropout
        Conv2D(32, (3, 3), padding='same', input_shape=(32, 32, 3)),
        Activation('relu'),
        Dropout(0.1),
        Conv2D(32, (3, 3), padding='same'),
        Activation('relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.2),

        # Stage 2: 64 filters
        Conv2D(64, (3, 3), padding='same'),
        Activation('relu'),
        Dropout(0.25),
        Conv2D(64, (3, 3), padding='same'),
        Activation('relu'),
        Dropout(0.25),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.25),

        # Stage 3: 128 filters, heavy dropout
        Conv2D(128, (3, 3), padding='same'),
        Activation('relu'),
        Dropout(0.5),
        Conv2D(128, (3, 3), padding='same'),
        Activation('relu'),
        Dropout(0.5),
        MaxPooling2D(pool_size=(2, 2)),
        Dropout(0.5),

        # Classifier head
        Flatten(),
        Dense(10, activation='softmax'),
    ]

    network = Sequential(layer_stack)
    network.compile(loss='categorical_crossentropy',
                    optimizer=Adam(lr=0.0004, decay=1e-4),
                    metrics=['accuracy'])
    return network
   

In [0]:
#Run 10 iterations of the model containing dropout, save statistics
n_classes = 10
n_iters = 10

sample_counts = [50,100,200,400,700,1000,1500,2000,3000,4000,5000]
(X_train, Y_train), (X_test, Y_test) = cifar10.load_data()

# The test set is the same for every run, so scale and one-hot encode it once.
x_test_scaled = X_test / 255.0
y_test_onehot = to_categorical(Y_test, num_classes=n_classes)

for sample_size in sample_counts:
    print("-- Training models for sample size: ", sample_size)

    # Neither the checkpoint callback, the CSV logger nor to_csv is relied on
    # to create missing directories, so make sure they exist up front.
    model_dir = "models/" + str(sample_size) + "/"
    train_log_dir = "logs/" + "/training/" + str(sample_size) + "/"
    test_log_dir = "logs/" + "/test/" + str(sample_size) + "/"
    for output_dir in (model_dir, train_log_dir, test_log_dir):
        os.makedirs(output_dir, exist_ok=True)

    # One single-row frame per run; concatenated once after the inner loop.
    run_scores = []
    for iteration in range(n_iters):
        print("Iteration: ", iteration)
        model = getNewDropoutModel()
        indices = getRandomSamplesOfEachClassOfSize(Y_train, n_classes, sample_size)
        x_train = X_train[indices]
        y_train = Y_train[indices]
        x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, shuffle= True)

        modelpath = model_dir + str(iteration) + "-dropout_model2-{epoch:02d}-acc-{val_acc:.4f}-loss-{val_loss:.4f}.h5"
        checkpoint = ModelCheckpoint(modelpath, monitor='val_loss', verbose=0, save_best_only=True, save_weights_only=True, mode='auto', period=1)

        train_logpath = train_log_dir + str(iteration) + "-dropout_model.csv"
        train_logger = CSVLogger(train_logpath, append=False, separator=';')

        early_stopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0,restore_best_weights=True)

        # num_classes is passed explicitly: the split is not stratified, so a
        # small split may lack the highest class id, and to_categorical would
        # then infer too few columns for the 10-unit output layer.
        history = model.fit(x_train / 255.0, to_categorical(y_train, num_classes=n_classes),
                  batch_size=128,
                  shuffle=True,
                  epochs=200,
                  validation_data=(x_valid / 255.0, to_categorical(y_valid, num_classes=n_classes)),
                  callbacks=[keras.callbacks.History(), checkpoint, train_logger, early_stopping],
                           verbose = 0)

        score = model.evaluate(x_test_scaled, y_test_onehot, batch_size=128, verbose=0)
        score = dict(zip(model.metrics_names, score))
        score_df = pd.DataFrame(data=score, index=[0])
        # Predict on the scaled test images: the model was trained and
        # evaluated on inputs in [0, 1], so raw 0-255 pixels would make the
        # per-class report describe inputs the model never saw.
        y_pred = model.predict_classes(x_test_scaled)
        report = classification_report(Y_test, y_pred)
        report_df = classification_report_dataframe(report)
        run_scores.append(pd.concat([score_df, report_df], axis=1))

    test_scores = pd.concat(run_scores)
    test_logpath = test_log_dir + "dropout_model.csv"
    test_scores.to_csv(test_logpath)
    print("%.2f%% (+/- %.2f%%)" % (np.mean(test_scores['acc']), np.std(test_scores['acc'])))