<h1>Introduction to Deep Learning & Neural Networks with Keras</h1>

<h2>Peer-graded Assignment: Build a Regression Model in Keras</h2>

<h3>Submission by Xander Mol</h3>

<b>Imports</b>

In [1]:
#Import statements
import pandas as pd
import numpy as np
import keras
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.


<b>Load, examine and prepare dataset</b>

In [2]:
#Load the concrete dataset from the course-provided URL into a Pandas dataframe
#(this cell needs network access: the CSV is read straight from the URL)
concrete_data = pd.read_csv('https://cocl.us/concrete_data')

#Show the first rows to check that the file was parsed as expected
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
#Show the dataframe dimensions as (number of rows, number of columns)
concrete_data.shape

(1030, 9)

In [4]:
#Show summary statistics (count, mean, std, min, quartiles, max) for every column
concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [5]:
#Count the missing (null) values in each column; all zeros means no imputation is needed
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [6]:
#Separate the input features (predictors) from the value to be predicted (target)
concrete_data_columns = concrete_data.columns

# Boolean mask over the column labels: keep every column that is not 'Strength'
predictors = concrete_data.loc[:, concrete_data_columns != 'Strength']
# The 'Strength' column on its own is the regression target
target = concrete_data['Strength']

predictors.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [7]:
#Show the first values of the target (Strength) series
target.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

In [9]:
# Number of predictor columns; used as the input width of the network's first layer
n_cols = len(predictors.columns)

<h3>Part A: Build a baseline model</h3>

<b>Building and testing the model</b>

In [None]:
#Regression model factory for the baseline requirements:
#- a single hidden layer of 10 nodes with a ReLU activation function
#- the adam optimizer with mean squared error as the loss function

def regression_model():
    """Build and compile a fresh, untrained Keras regression network.

    The input width is read from the notebook-level ``n_cols``. The network
    consists of one 10-node ReLU hidden layer followed by a single output
    node for which no activation is specified.

    Returns:
        A compiled ``Sequential`` model (adam optimizer, mean squared error loss).
    """
    # Layers are passed as a list instead of being added one at a time
    network = Sequential([
        Dense(10, activation='relu', input_shape=(n_cols,)),
        Dense(1),
    ])

    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

#Train and evaluate the baseline model 50 times, each time on a different random train/test split

error = []

for x in range(50):

    #Create a 70/30 train/test split using the SciKitLearn function.
    #The run index is used as the seed: a constant seed would hand every run the
    #exact same split, whereas seeding with the run index gives 50 different
    #splits while keeping the sequence of splits reproducible.
    X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size=0.3, random_state=x)

    #Build a fresh, untrained model for this run
    model = regression_model()

    #Fit the model on the training portion only
    model.fit(X_train, y_train, epochs=50, verbose=0)

    #Evaluate the model on the held-out test portion
    y_pred = model.predict(X_test)
    error.append(mean_squared_error(y_test, y_pred))
    print('The Mean Squared Error on run {} is {}.'.format(x, error[-1]))

The Mean Squared Error on run 0 is 225.5356953961527.
The Mean Squared Error on run 1 is 1204.9560395849971.
The Mean Squared Error on run 2 is 108.68482459646715.
The Mean Squared Error on run 3 is 136.1898811660813.
The Mean Squared Error on run 4 is 107.90880011091596.
The Mean Squared Error on run 5 is 109.12526976427216.
The Mean Squared Error on run 6 is 435.79826563596396.
The Mean Squared Error on run 7 is 2370.1762507068943.
The Mean Squared Error on run 8 is 112.83877814068245.
The Mean Squared Error on run 9 is 315.0753115279621.
The Mean Squared Error on run 10 is 121.51233638920523.
The Mean Squared Error on run 11 is 126.5800951768932.


<b>Calculate mean and standard deviations of errors</b>

In [None]:
#Summarize the per-run test errors collected in the `error` list.
#np.std is called with its default ddof=0, i.e. it reports the population standard deviation.
print('The mean of the calculated Mean Squared Errors is {}'.format(np.mean(error)))
print('The standard deviation of the calculated Mean Squared Errors is {}'.format(np.std(error)))

<h3>Part B: Normalize the data</h3>

<b>Normalize the data</b>

In [None]:
#Normalize each predictor column by subtracting its mean and dividing by its standard deviation.
#NOTE(review): the mean and standard deviation are computed over the full dataset, before any
#train/test split, so test rows contribute to the scaling statistics - confirm this is acceptable.
predictors_norm = (predictors - predictors.mean()) / predictors.std()
predictors_norm.head()

<b>Repeat steps of part A, now with the normalized data.</b>

In [None]:
#Regression model factory, same architecture requirements as the baseline:
#- a single hidden layer of 10 nodes with a ReLU activation function
#- the adam optimizer with mean squared error as the loss function

def regression_model():
    """Build and compile a fresh, untrained Keras regression network.

    The input width is read from the notebook-level ``n_cols``. The network
    consists of one 10-node ReLU hidden layer followed by a single output
    node for which no activation is specified.

    Returns:
        A compiled ``Sequential`` model (adam optimizer, mean squared error loss).
    """
    hidden_layer = Dense(10, activation='relu', input_shape=(n_cols,))
    output_layer = Dense(1)

    # Layers are passed as a list instead of being added one at a time
    network = Sequential([hidden_layer, output_layer])
    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

#Train and evaluate the model on the normalized data 50 times, each time on a different random train/test split

errorB = []

for x in range(50):

    #Create a 70/30 train/test split of the normalized predictors using the SciKitLearn function.
    #The run index is used as the seed: a constant seed would hand every run the
    #exact same split, whereas seeding with the run index gives 50 different
    #splits while keeping the sequence of splits reproducible.
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3, random_state=x)

    #Build a fresh, untrained model for this run
    model = regression_model()

    #Fit the model on the training portion only
    model.fit(X_train, y_train, epochs=50, verbose=0)

    #Evaluate the model on the held-out test portion
    y_pred = model.predict(X_test)
    errorB.append(mean_squared_error(y_test, y_pred))
    print('The Mean Squared Error on run {} is {}.'.format(x, errorB[-1]))

#Summarize the 50 test errors (np.std uses its default ddof=0, the population standard deviation)
print('The mean of the calculated Mean Squared Errors is {}'.format(np.mean(errorB)))
print('The standard deviation of the calculated Mean Squared Errors is {}'.format(np.std(errorB)))

<h3>Part C: Increase the number of epochs</h3>

<b>Repeat steps of part B, now with 100 epochs instead of 50.</b>

In [None]:
#Regression model factory, same architecture requirements as the baseline:
#- a single hidden layer of 10 nodes with a ReLU activation function
#- the adam optimizer with mean squared error as the loss function

def regression_model():
    """Build and compile a fresh, untrained Keras regression network.

    The input width is read from the notebook-level ``n_cols``. The network
    consists of one 10-node ReLU hidden layer followed by a single output
    node for which no activation is specified.

    Returns:
        A compiled ``Sequential`` model (adam optimizer, mean squared error loss).
    """
    # Layers in order: hidden layer first, output node last
    layer_stack = [
        Dense(10, activation='relu', input_shape=(n_cols,)),
        Dense(1),
    ]
    network = Sequential(layer_stack)

    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

#Train and evaluate the model on the normalized data 50 times with 100 epochs per run,
#each time on a different random train/test split

errorC = []

for x in range(50):

    #Create a 70/30 train/test split of the normalized predictors using the SciKitLearn function.
    #The run index is used as the seed: a constant seed would hand every run the
    #exact same split, whereas seeding with the run index gives 50 different
    #splits while keeping the sequence of splits reproducible.
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3, random_state=x)

    #Build a fresh, untrained model for this run
    model = regression_model()

    #Fit the model on the training portion only, for 100 epochs instead of 50
    model.fit(X_train, y_train, epochs=100, verbose=0)

    #Evaluate the model on the held-out test portion
    y_pred = model.predict(X_test)
    errorC.append(mean_squared_error(y_test, y_pred))
    print('The Mean Squared Error on run {} is {}.'.format(x, errorC[-1]))

#Summarize the 50 test errors (np.std uses its default ddof=0, the population standard deviation)
print('The mean of the calculated Mean Squared Errors is {}'.format(np.mean(errorC)))
print('The standard deviation of the calculated Mean Squared Errors is {}'.format(np.std(errorC)))

<h3>Part D: Increase the number of hidden layers</h3>

<b>Repeat steps of part B, now with three hidden layers, each of 10 nodes and ReLU activation function</b>

In [None]:
#Regression model factory for the deeper network requirements:
#- three hidden layers, each of 10 nodes with a ReLU activation function
#- the adam optimizer with mean squared error as the loss function

def regression_model():
    """Build and compile a fresh, untrained three-hidden-layer Keras regression network.

    The input width is read from the notebook-level ``n_cols``. The network
    stacks three 10-node ReLU hidden layers followed by a single output node
    for which no activation is specified.

    Returns:
        A compiled ``Sequential`` model (adam optimizer, mean squared error loss).
    """
    # Layers are passed as a list instead of being added one at a time;
    # only the first layer declares the input shape
    network = Sequential([
        Dense(10, activation='relu', input_shape=(n_cols,)),
        Dense(10, activation='relu'),
        Dense(10, activation='relu'),
        Dense(1),
    ])

    network.compile(loss='mean_squared_error', optimizer='adam')
    return network

#Train and evaluate the three-hidden-layer model on the normalized data 50 times,
#each time on a different random train/test split

errorD = []

for x in range(50):

    #Create a 70/30 train/test split of the normalized predictors using the SciKitLearn function.
    #The run index is used as the seed: a constant seed would hand every run the
    #exact same split, whereas seeding with the run index gives 50 different
    #splits while keeping the sequence of splits reproducible.
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3, random_state=x)

    #Build a fresh, untrained model for this run
    model = regression_model()

    #Fit the model on the training portion only
    model.fit(X_train, y_train, epochs=50, verbose=0)

    #Evaluate the model on the held-out test portion
    y_pred = model.predict(X_test)
    errorD.append(mean_squared_error(y_test, y_pred))
    print('The Mean Squared Error on run {} is {}.'.format(x, errorD[-1]))

#Summarize the 50 test errors (np.std uses its default ddof=0, the population standard deviation)
print('The mean of the calculated Mean Squared Errors is {}'.format(np.mean(errorD)))
print('The standard deviation of the calculated Mean Squared Errors is {}'.format(np.std(errorD)))