# Neural Net Analysis Notebook
## W207 Final Project
### T. P. Goter
### July 6, 2019

This workbook is used to assess various models created as part of the Facial Keypoint Detection project for W207.

In [1]:
# Import the packages we need
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import numpy as np
import ipywidgets as widgets
from ipywidgets import interact, interact_manual, fixed
import os

In [2]:
# Read the pickled per-epoch training history of the baseline single-layer
# network and preview ten randomly chosen rows
bl_sl_df = pd.read_pickle("OutputData/single_layer_df.pkl")
bl_sl_df.sample(n=10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate
236,2525.411497,2525.411865,2520.734895,2520.734863,236,50.253476,50.20692,0.990022,150,sigmoid,nadam,0.002
320,2525.411764,2525.411865,2520.735151,2520.735107,320,50.253476,50.206923,0.760812,200,tanh,adagrad,0.01
181,2525.411546,2525.411377,2520.734875,2520.734619,181,50.253471,50.206918,0.341651,100,sigmoid,adam,0.001
346,67.675726,67.675735,99.739942,99.739944,346,8.226526,9.986989,0.488164,200,relu,adam,0.001
292,2525.411545,2525.411133,2520.73486,2520.734619,292,50.253469,50.206918,0.431173,200,tanh,sgd,0.01
221,246.198381,246.19841,263.961196,263.961182,221,15.690711,16.246882,0.33751,50,relu,adagrad,0.01
246,2619.589251,2619.588867,2614.870264,2614.870117,246,51.181919,51.135801,0.292686,100,relu,sgd,0.01
347,2525.411532,2525.411377,2520.734826,2520.735107,347,50.253471,50.206923,0.800628,100,tanh,nadam,0.002
25,2525.427266,2525.427246,2520.750329,2520.750488,25,50.253629,50.207076,0.613186,150,sigmoid,adagrad,0.01
271,2525.411532,2525.410889,2520.734777,2520.734863,271,50.253466,50.20692,0.439407,50,sigmoid,nadam,0.002


In [3]:
# Plotting callback handed to the ipywidgets interact functions
def plot_validation_loss(df=bl_sl_df, optimizer = bl_sl_df.optimizer.unique(),
                    activation = bl_sl_df.activation.unique()):
    """Plot validation RMSE and per-epoch fit time for the selected runs.

    The rows of ``df`` are filtered to the requested optimizer(s) and
    activation(s), then grouped by ``hunits``; one validation-RMSE line and
    one fit-time scatter series is drawn per group.

    Parameters
    ----------
    df : pandas.DataFrame
        Training history with at least the columns ``optimizer``,
        ``activation``, ``hunits``, ``epoch``, ``val_RMSE`` and ``times``.
    optimizer : str or array-like of str
        Optimizer name(s) to keep. The widget dropdown passes a single string;
        a collection (such as the default) selects every listed optimizer.
    activation : str or array-like of str
        Activation name(s) to keep, handled like ``optimizer``.

    Returns
    -------
    None
        The figure is created as a side effect and rendered by the notebook.
    """
    # Normalise a single name or a collection of names to a 1-D array
    optimizers = np.atleast_1d(optimizer)
    activations = np.atleast_1d(activation)

    # Exact-name filter: a regex prefix match would let e.g. 'adam' also
    # select 'adam_005'
    selected = df.optimizer.isin(optimizers) & df.activation.isin(activations)
    sub_df = df[selected]

    fig, axes = plt.subplots(1, 2, figsize=(15, 10))
    loss_ax, time_ax = axes.flatten()

    # One series per hidden-unit count. Grouping by the bare column name keeps
    # the group key a scalar, so the legend reads '150 ...' rather than '(150,) ...'
    for hunits, group in sub_df.groupby('hunits'):
        loss_ax.plot(group.epoch, group.val_RMSE, label=str(hunits)+' Validation Loss')
        # times is scaled by 1000 for the millisecond axis
        # (assumes times is recorded in seconds - TODO confirm)
        time_ax.scatter(group.epoch, group.times*1000, label=str(hunits)+' Fit Time')

    # Axis decoration does not depend on the group, so it is applied once
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Root Mean Square Error')
    time_ax.set_xlabel('Epoch')
    time_ax.set_ylabel('Fit Time (milliseconds)')
    time_ax.set_ylim([0,1000])
    loss_ax.set_title("{} Optimizer and {} Activation".format(
        ", ".join(map(str, optimizers)), ", ".join(map(str, activations))))

    # An empty selection has no finite maximum and no legend entries
    if not sub_df.empty:
        loss_ax.set_ylim([0,sub_df.val_RMSE.max()])
        loss_ax.legend()
        time_ax.legend()

    # Adjust the spacing of the subplots
    fig.subplots_adjust(left=0.03, right=0.97, hspace=0.1, wspace=0.15)

    # Add an overarching title for these plots
    fig.suptitle("Performance Comparison for Single Layer, Fully Connected Neural Nets",
                 fontsize=18, y=0.93)



In [14]:
# Interactive view of the baseline runs: choose an optimizer and an activation
# from the dropdowns. `fixed` passes the dataframe through without creating a
# widget for it.
interact_manual(
    plot_validation_loss,
    df=fixed(bl_sl_df),
    optimizer=bl_sl_df.optimizer.unique(),
    activation=bl_sl_df.activation.unique(),
);  # trailing semicolon hides the repr of the returned function

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd', 'nadam', 'adagrad'), value='ad…




### Assessment of Baseline Results
1. Adam and Adagrad Optimizers are working well. 
2. Adam is faster and works well with 200 hidden units
3. Adagrad is slower but works best with 100 hidden units.

In the evaluation above, both the hidden layer and the output layer used the activation function specified by the user. For the study below, the activation function of the output layer was set to softmax, which is a multinomial classifier version of the sigmoid function. The plots below help to assess whether the choice of activation function for the output layer significantly alters our perception of which activation functions and optimizers work well for our neural network.

In [16]:
# Read the pickled per-epoch training history of the single-layer runs that
# used a softmax output layer and preview ten randomly chosen rows
sm_sl_df = pd.read_pickle("OutputData/single_layer_softmax_df.pkl")
sm_sl_df.sample(n=10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate
224,2614.113149,2614.113037,2609.399151,2609.39917,224,51.128398,51.082278,0.375341,150,tanh,adam,0.001
193,2614.113185,2614.113281,2609.39915,2609.398926,193,51.1284,51.082276,0.196371,50,tanh,sgd,0.01
253,2614.113111,2614.113281,2609.399093,2609.398926,253,51.1284,51.082276,0.997336,150,sigmoid,nadam,0.002
284,2614.113245,2614.113281,2609.399322,2609.399414,284,51.1284,51.082281,0.21908,50,tanh,adagrad,0.01
30,2614.559659,2614.559326,2609.867712,2609.867676,30,51.132762,51.086864,0.50369,100,sigmoid,adagrad,0.01
370,2616.416937,2616.416748,2611.699415,2611.699463,370,51.150921,51.104789,0.388754,50,relu,nadam,0.002
132,2614.544216,2614.544189,2609.843707,2609.84375,132,51.132614,51.08663,0.391998,200,relu,sgd,0.01
336,2614.113138,2614.113281,2609.39908,2609.398926,336,51.1284,51.082276,0.22316,50,tanh,adam,0.001
371,2614.315085,2614.314941,2609.594211,2609.594238,371,51.130372,51.084188,0.203373,50,sigmoid,sgd,0.01
187,2614.113122,2614.113281,2609.399158,2609.39917,187,51.1284,51.082278,0.636158,150,relu,adagrad,0.01


In [17]:
# Plot the softmax data
interact_manual(plot_validation_loss, df=fixed(sm_sl_df), 
                optimizer = sm_sl_df.optimizer.unique(), 
                    activation = sm_sl_df.activation.unique())
print()

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd', 'nadam', 'adagrad'), value='ad…




In [18]:
# Read the pickled per-epoch training history of the single-layer runs that
# used a ReLU output layer and preview ten randomly chosen rows
relu_sl_df = pd.read_pickle("OutputData/single_layer_relu_df.pkl")
relu_sl_df.sample(n=10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate
141,57.55265,57.552654,56.939717,56.93972,141,7.586347,7.545841,0.356258,150,tanh,sgd,0.01
333,1127.115141,1127.115234,1125.635432,1125.635376,333,33.572537,33.55049,0.26773,50,tanh,adagrad,0.01
289,1155.966775,1155.967041,1150.548828,1150.548828,289,33.999515,33.919741,0.210961,50,sigmoid,adam,0.001
212,693.239986,693.240112,691.441554,691.441528,212,26.329453,26.295276,0.331476,150,sigmoid,sgd,0.01
325,454.136303,454.136292,453.087433,453.087433,325,21.310474,21.285851,0.479637,200,sigmoid,adam,0.001
338,1620.196116,1620.196045,1615.747777,1615.747803,338,40.251659,40.196366,0.416274,200,sigmoid,sgd,0.01
117,879.267893,879.267822,876.853924,876.853943,117,29.652451,29.61172,0.334154,100,sigmoid,sgd,0.01
399,285.918818,285.918884,286.812901,286.812927,399,16.909136,16.935552,0.390663,150,sigmoid,adam,0.001
75,759.176771,759.176758,755.215111,755.215088,75,27.553162,27.481177,0.599911,150,sigmoid,adagrad,0.01
273,502.154205,502.154205,500.816491,500.816437,273,22.408797,22.378928,0.604426,150,tanh,adagrad,0.01


In [20]:
# Interactive view of the ReLU-output runs: choose an optimizer and an
# activation from the dropdowns. `fixed` passes the dataframe through without
# creating a widget for it.
interact_manual(
    plot_validation_loss,
    df=fixed(relu_sl_df),
    optimizer=relu_sl_df.optimizer.unique(),
    activation=relu_sl_df.activation.unique(),
);  # trailing semicolon hides the repr of the returned function

interactive(children=(Dropdown(description='optimizer', options=('adam', 'sgd', 'nadam', 'adagrad'), value='ad…




In [4]:
# Lowest validation RMSE reached by each optimizer in the ReLU-output runs
# (requires the cell that loads relu_sl_df to have been run first)
relu_sl_df.groupby('optimizer')['val_RMSE'].min()

NameError: name 'relu_sl_df' is not defined

### Single Layer Assessment
Based on the three different experiments run, we will use a RELU final layer activation function. We will continue to assess adagrad and adam optimizers. We likely do not need to train for more than 200 epochs or so to get reasonably converged nets.

In [5]:
# Read the pickled per-epoch training history of the ReLU-output learning-rate
# study and preview ten randomly chosen rows
relu_sl_lr_df = pd.read_pickle("OutputData/single_layer_relu_lr_df.pkl")
relu_sl_lr_df.sample(n=10)

Unnamed: 0,loss,mean_squared_error,val_loss,val_mean_squared_error,epoch,RMSE,val_RMSE,times,hunits,activation,optimizer,lrate
166,585.102651,585.1026,602.217882,602.217896,166,24.188894,24.540128,0.777923,200,relu,adagrad_02,0.02
197,101.179164,101.179169,121.990278,121.990288,197,10.058786,11.044921,0.195769,50,relu,adam_005,0.005
190,1382.132048,1382.13208,1389.458887,1389.458862,190,37.177037,37.275446,0.561554,150,relu,adagrad_005,0.05
184,84.662256,84.662247,116.007755,116.007736,184,9.201209,10.770689,0.490967,200,relu,adam_0005,0.0005
154,154.702597,154.702591,174.748838,174.748825,154,12.43795,13.21926,0.468345,200,relu,adam_005,0.005
121,293.815909,293.815887,292.408084,292.408112,121,17.141059,17.099945,0.259666,50,tanh,adagrad_02,0.02
62,360.313932,360.313965,370.613009,370.613007,62,18.981938,19.251312,0.271495,50,relu,adagrad_005,0.05
60,244.381297,244.381256,244.194078,244.194077,60,15.632698,15.62671,0.247615,50,tanh,adagrad_005,0.05
115,87.490673,87.490669,87.13756,87.137558,115,9.353645,9.33475,0.756261,200,tanh,adagrad_005,0.05
60,1393.413407,1393.413452,1396.197367,1396.197388,60,37.328454,37.365725,0.543673,150,relu,adagrad_005,0.05


In [6]:
# Plot the relu learning rate data
interact_manual(plot_validation_loss, df=fixed(relu_sl_lr_df), 
                optimizer = relu_sl_lr_df.optimizer.unique(), 
                    activation = relu_sl_lr_df.activation.unique())
print()

interactive(children=(Dropdown(description='optimizer', options=('adam_005', 'adam_0005', 'adagrad_02', 'adagr…




## Combine the dataframes to get a comprehensive look at the data


In [26]:
# Empty placeholder frame; presumably the per-experiment frames are meant to be
# combined into it later - TODO confirm (nothing in the visible cells fills it)
comb_bl_sl_df = pd.DataFrame()

In [None]:
# Load the pickled dataframe for the two-layer ReLU runs (per the file name)
# and preview ten randomly chosen rows of the frame that was just loaded
tl_df = pd.read_pickle("OutputData/two_layer_relu_df.pkl")
tl_df.sample(10)