In [1]:
# Name: Zhihao Zhang
# NetID: zz2432

%matplotlib inline
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras

## Extension 3: Implement softmax as outer layer activation in Neural Network

### Build one-hidden layer neural network similar to what we implemented in the homework

* Input layer: 64
* Hidden layer: 30
* Output layer: 10
* Activation function: relu
* adam optimizer: a stochastic gradient descent method that is based on adaptive estimation of first-order and second-order moments.
* loss function: use CategoricalCrossentropy for one-hot encoding labels

For model_2, we simply change the outer layer activation function to softmax for implementing the extension


#### Parameters for fitting the model:
* validation_split=0.1   split input training set such that 90% for training, 10% for validation
* batch_size=10          number of samples to work through before updating the internal model parameters 
* epochs=20              number of iterations
* shuffle=True           whether to shuffle the training data before each epoch
* verbose=0              Verbosity mode. 0 = silent, 1 = progress bar, 2 = one line per epoch

### 1. Dataset used in class: load_digits

In [2]:
from sklearn.datasets import load_digits # The MNIST data set is in scikit learn data set
digits=load_digits()
X_scale = preprocessing.StandardScaler()  # It is important in neural networks to scale the data
X = X_scale.fit_transform(digits.data)
y = digits.target
#Split the data into training and test set.  70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# one-hot encoding y_train
n_labels = np.unique(y_train).size
y_train = tf.one_hot(y_train, depth=n_labels)

In [3]:
print(f'Input size: {X_train.shape[1]}\nOutput size: {n_labels}')

Input size: 64
Output size: 10


#### NN model similiar to our hw: All relu activation functions

In [4]:
# build NN model
model_1 = keras.Sequential([
    keras.layers.Dense(units=64, input_shape=(64,),activation='relu'),
    keras.layers.Dense(units=30, activation='relu'),
    keras.layers.Dense(units=10, activation='relu')
])
# compile our model with adam optimizer
model_1.compile(optimizer='adam', 
              loss=tf.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [5]:
history = model_1.fit(X_train, y_train, validation_split=0.1,batch_size=10,epochs=20,shuffle=True,verbose=0)

In [6]:
predictions = model_1.predict(X_test, batch_size=10,verbose=0)
print('Peak at first example prediction:')
predictions[0].tolist()

Peak at first example prediction:


[0.055045414716005325,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 11.059178352355957,
 0.0,
 2.7189788818359375,
 0.0]

In [7]:
# if your model does binary classification(e.g. if it uses a `sigmoid` last-layer activation)
# (model.predict(x) > 0.5).astype("int32")

preds = np.argmax(predictions, axis=-1)
res = np.where(preds==y_test)[0].size/y_test.size
print(f'Simple NN model Accuracy: {res}')

Simple NN model Accuracy: 0.8722222222222222


#### NN model with extension: use softmax activation at the output layer

In [8]:
# build NN model
model_2 = keras.Sequential([
    keras.layers.Dense(units=64, input_shape=(64,),activation='relu'),
    keras.layers.Dense(units=30, activation='relu'),
    keras.layers.Dense(units=10, activation='softmax')
])
# compile our model with adam optimizer
model_2.compile(optimizer='adam', 
              loss=tf.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [9]:
history = model_2.fit(X_train, y_train ,validation_split=0.1,batch_size=10,epochs=20, shuffle=True,verbose=0)

In [10]:
# return the probability for each of 10 labels, the sum of all is 1
predictions = model_2.predict(X_test, batch_size=10,verbose=0)
print('Peak at first example prediction:')
predictions[0].tolist()

Peak at first example prediction:


[3.463188249952509e-06,
 2.7399980240261357e-07,
 2.340833304970147e-08,
 1.625615908551481e-08,
 4.0496192923455965e-06,
 1.0581742770909841e-07,
 0.9999899864196777,
 2.0254979915534932e-08,
 2.19072694562783e-06,
 6.45689612888134e-10]

In [11]:
# if your model does multi-class classification (e.g. if it uses a `softmax` last-layer activation)
# np.argmax(model.predict(x), axis=-1)
preds = np.argmax(predictions, axis=-1)
res = np.where(preds==y_test)[0].size/y_test.size
print(f'Simple NN model with softmax as extension Accuracy: {res}')

Simple NN model with softmax as extension Accuracy: 0.975925925925926


### 2. Dataset outside class:  fetch_covtype

In [12]:
from sklearn.datasets import fetch_covtype
cover_type = fetch_covtype()
X_scale = preprocessing.StandardScaler()  # It is important in neural networks to scale the data
X = X_scale.fit_transform(cover_type.data)
y = cover_type.target
#Split the data into training and test set.  70% training and 30% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# since our label is from 1 to 7: for convenience, subtract all label by 1 to match the indices of one-hot encoding
y_train = y_train - 1
# one-hot encoding y_train
n_labels = np.unique(y_train).size
y_train = tf.one_hot(y_train, depth=n_labels)

In [13]:
print(f'Number of Sample to Train: {X_train.shape[0]}\nInput size: {X_train.shape[1]}\nOutput size: {n_labels}')

Number of Sample to Train: 406708
Input size: 54
Output size: 7


### Now build two NN models (same configurations as above) to test for our extension. 

#### NN model 

In [14]:
# build NN model layers: [54, 30, 7]. our imput size is 54, our output label size is 7
model_3 = keras.Sequential([
    keras.layers.Dense(units=54, input_shape=(54,),activation='relu'),
    keras.layers.Dense(units=30, activation='relu'),
    keras.layers.Dense(units=7, activation='relu')
])
# compile our model with adam optimizer
model_3.compile(optimizer='adam', 
              loss=tf.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [15]:
%%time
history = model_3.fit(X_train, y_train, validation_split=0.1,batch_size=128,epochs=10,shuffle=True,verbose=0)

CPU times: user 30.8 s, sys: 13.1 s, total: 43.9 s
Wall time: 15.2 s


In [16]:
predictions = model_3.predict(X_test, batch_size=1000,verbose=0)

preds = np.argmax(predictions, axis=-1)
# convert prediction back to original labeling
preds = preds + 1
res = np.where(preds==y_test)[0].size/y_test.size
print(f'Simple NN model Accuracy: {res}')

Simple NN model Accuracy: 0.8259248210023866


#### NN model with softmax extension

In [17]:
# build NN model layers: [54, 30, 7]. our imput size is 54, our output label size is 7
model_4 = keras.Sequential([
    keras.layers.Dense(units=54, input_shape=(54,),activation='relu'),
    keras.layers.Dense(units=30, activation='relu'),
    keras.layers.Dense(units=7, activation='softmax')
])
# compile our model with adam optimizer
model_4.compile(optimizer='adam', 
              loss=tf.losses.CategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])

In [18]:
%%time
history = model_4.fit(X_train, y_train, validation_split=0.1,batch_size=128,epochs=10,shuffle=True,verbose=0)

CPU times: user 31.4 s, sys: 13.1 s, total: 44.6 s
Wall time: 15.8 s


In [19]:
predictions = model_4.predict(X_test, batch_size=1000,verbose=0)

preds = np.argmax(predictions, axis=-1)
# convert prediction back to original labeling
preds = preds + 1
res = np.where(preds==y_test)[0].size/y_test.size
print(f'Simple NN model with softmax as extension Accuracy: {res}')

Simple NN model with softmax as extension Accuracy: 0.7711641729392326
