<a href="https://colab.research.google.com/github/toraaglobal/fashion-mnist/blob/master/02_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### MODEL

In [0]:
## import packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import time


## model
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier  ##min_samples_split, max_leaf_nodes, max_depth and min_samples_leaf.
from sklearn.ensemble import GradientBoostingClassifier  ## The learning_rate is a hyper-parameter in the range (0.0, 1.0] 
                                                        ##that controls overfitting 

from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDClassifier  ## clf = SGDClassifier(loss="hinge", penalty="l2", max_iter=5)
'''
SGDClassifier(alpha=0.0001, average=False, class_weight=None,
           early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
           l1_ratio=0.15, learning_rate='optimal', loss='hinge', max_iter=5,
           n_iter_no_change=5, n_jobs=None, penalty='l2', power_t=0.5,
           random_state=None, shuffle=True, tol=0.001,
           validation_fraction=0.1, verbose=0, warm_start=False)
'''



#import tensorflow as tf


from mlp import NeuralNetMLP
from mlp import MLPGradientCheck
import mlp

from sklearn.pipeline import Pipeline
import os
import struct

In [0]:
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.utils import np_utils

from kkeras import baseline_model

Using TensorFlow backend.


In [0]:
def load_mnist(path, kind='train'):
    """Load MNIST data from `path`"""
    labels_path = os.path.join(path, '%s-labels-idx1-ubyte' % kind)
    images_path = os.path.join(path, '%s-images-idx3-ubyte' % kind)
        
    with open(labels_path, 'rb') as lbpath:
        magic, n = struct.unpack('>II', lbpath.read(8))
        labels = np.fromfile(lbpath, dtype=np.uint8)

    with open(images_path, 'rb') as imgpath:
        magic, num, rows, cols = struct.unpack(">IIII", imgpath.read(16))
        images = np.fromfile(imgpath, dtype=np.uint8).reshape(len(labels), 784)
 
    return images, labels

## load traning set
X_train, y_train = load_mnist('./data/', kind='train')
print('Rows: %d, columns: %d' % (X_train.shape[0], X_train.shape[1]))

## load test set
X_test, y_test = load_mnist('./data/', kind='t10k')
print('Rows: %d, columns: %d' % (X_test.shape[0], X_test.shape[1]))

Rows: 60000, columns: 784
Rows: 10000, columns: 784


In [0]:
## create a model container and emty list to store training and prediction accuracy
model = {}
model_type = []
training_score = []
test_score = []
training_time = []
prediction_time = []

In [0]:

# SCRUB
# FLATTEN 28 x 28 IMAGE TO 784 VECTOR
num_pixels = X_train.shape[1]
#X_train = X_train.reshape(X_train.shape[0], num_pixels).astype('float32')
#X_test = X_test.reshape(X_test.shape[0], num_pixels).astype('float32')



# SCRUB
# NORMALIZE INPUTS FROM RGB COLOR TO 0-1
X_train = X_train / 255
X_test = X_test / 255


# SCRUB
# THE OLD ONE HOT ENCODE - CONVERT "CATEGORICAL" CLASSIFICATION TO ENCODE
# A "BINARIZATION" OF THE CATEGORIES
y_train_k = np_utils.to_categorical(y_train)
y_test_k = np_utils.to_categorical(y_test)
num_classes = 10

In [0]:

## initialize model and add model to container
model['GaussianNB'] = GaussianNB()
model['RandomForest'] = RandomForestClassifier()
model['GradientBoostingClassifier'] = GradientBoostingClassifier()
model['nnMLP'] = NeuralNetMLP(n_output=10, 
                  n_features=X_train.shape[1], 
                  n_hidden=50, 
                  l2=0.1, 
                  l1=0.0, 
                  epochs=200, 
                  eta=0.001,
                  alpha=0.001,
                  decrease_const=0.00001,
                  minibatches=50, 
                  shuffle=True,
                  random_state=1)


#model['MLPGradientCheck'] = MLPGradientCheck(n_output=10, 
#                            n_features=X_train.shape[1], 
#                            n_hidden=10, 
#                            l2=0.0, 
#                            l1=0.0, 
#                            epochs=100, 
#                            eta=0.001,
#                            alpha=0.0,
#                            decrease_const=0.0,
#                            minibatches=1, 
#                            shuffle=False,
#                            random_state=1)

model['keras_adam'] = baseline_model(num_pixels,num_classes, optimizer='adam',metrics=['accuracy'])

model['keras_sgd'] = baseline_model(num_pixels,num_classes, optimizer='sgd',metrics=['accuracy'])

model['keras_Adagrad'] = baseline_model(num_pixels,num_classes, optimizer='Adagrad',metrics=['accuracy'])

model['keras_Adadelta'] =baseline_model(num_pixels,num_classes, optimizer='Adadelta',metrics=['accuracy'])




Instructions for updating:
Colocations handled automatically by placer.


In [0]:
from sklearn.metrics import accuracy_score
process_start = time.time()
for mod in model:
    print("="* 50)
    print(mod)
    print("="* 50)

    ## start the model
    
    ## check non keras model
    if mod not in ('keras_adam','keras_sgd','keras_Adagrad','keras_Adadelta'):
        train_start = time.time()
        model[mod].fit(X_train,y_train)
        train_end = time.time()
    else:
        train_start = time.time()
        model[mod].fit(X_train,y_train_k, epochs=200)
        train_end = time.time()
        
    
    print("{} training time {}".format(mod,train_end-train_start))
    
    ## train score
    if mod not in ('keras_adam','keras_sgd','keras_Adagrad','keras_Adadelta'):
        train_prediction = model[mod].predict(X_train)
        train_score =  accuracy_score(train_prediction, y_train)
    else:
        train_prediction = model[mod].predict(X_train)
        train_score =  accuracy_score(train_prediction, y_train_k)
        
    print("{} training accuracy {}".format(mod, train_score))
    
    ## test prediction
    if mod not in ('keras_adam','keras_sgd','keras_Adagrad','keras_Adadelta'):
        predict_start = time.time()
        prediction = model[mod].predict(X_test)
        predict_end = time.time()
        score = accuracy_score(prediction, y_test)
    else:
        predict_start = time.time()
        prediction = model[mod].predict(X_test)
        predict_end = time.time()
        score = accuracy_score(prediction, y_test_k)
     
    ## score test
    print("{} time for testing {}".format(mod,predict_end-predict_start))
    
    print("{} test accuracy {}".format(mod, score))
    
    ## append all results
    model_type.append(mod)
    training_score.append(train_score)
    test_score.append(score)
    training_time.append(train_end - train_start)
    prediction_time.append(predict_end -  predict_start)
    print("Done with {}".format(mod))
    

process_end = time.time()      
## create a dataframe
result = {'Model': model_type, 'Training Accuracy': training_score, 'Training Time': training_time, "Test Accuracy": test_score,
          "Prediction Time": prediction_time}
df = pd.DataFrame(result)

print("Total Process Time: {}".format(process_end -  process_start))
df


GaussianNB
GaussianNB training time 1.705737590789795
GaussianNB training accuracy 0.5877833333333333
GaussianNB time for testing 1.774897813796997
GaussianNB test accuracy 0.5856
Done with GaussianNB
RandomForest




RandomForest training time 12.764865636825562
RandomForest training accuracy 0.99515
RandomForest time for testing 0.0937337875366211
RandomForest test accuracy 0.8546
Done with RandomForest
GradientBoostingClassifier
GradientBoostingClassifier training time 4212.935617446899
GradientBoostingClassifier training accuracy 0.90365
GradientBoostingClassifier time for testing 0.6565408706665039
GradientBoostingClassifier test accuracy 0.8681
Done with GradientBoostingClassifier
nnMLP
nnMLP training time 282.4245636463165
nnMLP training accuracy 0.90255
nnMLP time for testing 0.10955286026000977
nnMLP test accuracy 0.8688
Done with nnMLP
keras_adam
Instructions for updating:
Use tf.cast instead.
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 30/200
Epoch 36/200
Epoch 39/200
Epoch 42/200
Epoch 50/200
Epoch 53/200
Epoch 54/200
Epoch 62/200
Epoch 65/20

In [0]:
result = {'Model': model_type, 'Training Accuracy': training_score, 'Training Time': training_time, "Test Accuracy": test_score,
          "Prediction Time": prediction_time}
df = pd.DataFrame(result)

#print("Total Process Time: {}".format(process_end -  process_start))
df


In [0]:
result = {'Model': model_type, 'Training Accuracy': training_score, 'Training Time': training_time, "Test Accuracy": test_score,
          "Prediction Time": prediction_time}
df = pd.DataFrame(result)

#print("Total Process Time: {}".format(process_end -  process_start))
df


Unnamed: 0,Model,Prediction Time,Test Accuracy,Training Accuracy,Training Time
0,GaussianNB,1.774898,0.5856,0.587783,1.705738
1,RandomForest,0.093734,0.8546,0.99515,12.764866
2,GradientBoostingClassifier,0.656541,0.8681,0.90365,4212.935617
3,nnMLP,0.109553,0.8688,0.90255,282.424564
