# Peer-graded Assignment: Build a Regression Model in Keras

# Download and Clean Dataset

Let's start by importing the pandas and the Numpy libraries.

In [1]:
import pandas as pd
import numpy as np

We will be using the dataset provided in the assignment

The dataset is about the compressive strength of different samples of concrete based on the volumes of the different ingredients that were used to make them. Ingredients include:

1. Cement

2. Blast Furnace Slag

3. Fly Ash

4. Water

5. Superplasticizer

6. Coarse Aggregate

7. Fine Aggregate

Let's read the dataset into a pandas dataframe.

In [2]:
concrete_data = pd.read_csv('https://cocl.us/concrete_data')
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
concrete_data.shape

(1030, 9)

In [4]:
concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [5]:
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

Split data into predictors and target

In [6]:
concrete_data_columns = concrete_data.columns
predictors = concrete_data[concrete_data_columns[concrete_data_columns != "Strength"]] # all columns except Strength
target = concrete_data['Strength']

In [7]:
predictors.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


In [8]:
target.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

normalize the data by subtracting the mean and dividing by the standard deviation. 

In [9]:
predictors_norm = (predictors - predictors.mean() / predictors.std())
predictors_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,537.309562,-0.856472,-0.846733,153.497358,1.461362,1027.48721,666.351468,27.277154
1,537.309562,-0.856472,-0.846733,153.497358,1.461362,1042.48721,666.351468,27.277154
2,329.809562,141.643528,-0.846733,219.497358,-1.038638,919.48721,584.351468,269.277154
3,329.809562,141.643528,-0.846733,219.497358,-1.038638,919.48721,584.351468,364.277154
4,195.909562,131.543528,-0.846733,183.497358,-1.038638,965.88721,815.851468,359.277154


In [10]:
n_cols = predictors_norm.shape[1] # number of predictors
n_cols

8

# Import Keras

In [11]:
import keras

import packages

In [12]:
from keras.models import Sequential
from keras.layers import Dense

In [13]:
# define regression model
def regression_model():
#     create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    
#     compile model
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

The above function creates a model that has three hidden layers each with 10 neurons and a ReLU activation function. It uses the adam optimizer and the mean squared error as the loss function.

Split the data into a training and test sets by holding 30% of the data for testing

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3, random_state=42)

# Train and Test the Network

In [16]:
# build the model 
model = regression_model()

train the model for 50 epochs

In [17]:
# fit the model
epochs = 50
model.fit(X_train, y_train, epochs=epochs, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x22b9e088a00>

evaluate the model on the test data

In [18]:
loss_val = model.evaluate(X_test, y_test)
y_pred = model.predict(X_test)
loss_val



98.25908660888672

compute the mean squared error between the predicted concrete strength and the actual concrete strength.

import the mean_squared_error function from Scikit-learn.

In [19]:
from sklearn.metrics import mean_squared_error

In [20]:
mean_square_error = mean_squared_error(y_test, y_pred)
mean = np.mean(mean_square_error)
standard_deviation = np.std(mean_square_error)
print(mean, standard_deviation)

98.2590917932672 0.0


create a list of 50 mean squared errors and report mean and the standard deviation of the mean squared errors.

In [21]:
total_mean_squared_errors = 50
epochs = 50
mean_squared_errors = []
for i in range(0, total_mean_squared_errors):
    X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size=0.3, random_state=i)
    model.fit(X_train, y_train, epochs=epochs, verbose=0)
    MSE = model.evaluate(X_test, y_test, verbose=0)
    print("MSE " + str(i+1) + ": " + str(MSE))
    y_pred = model.predict(X_test)
    mean_square_error = mean_squared_error(y_test, y_pred)
    mean_squared_errors.append(mean_square_error)
    
mean_squared_errors = np.array(mean_squared_errors)
mean = np.mean(mean_squared_errors)
standard_deviation = np.std(mean_squared_errors)

print('\n')
print("Below is the mean and standard deviation of " +str(total_mean_squared_errors) + " mean squared errors with normalized data. Total number of epochs for each training is: " +str(epochs) + "\n")
print("Mean: "+str(mean))
print("Standard Deviation: "+str(standard_deviation))
    

MSE 1: 66.28936004638672
MSE 2: 57.791622161865234
MSE 3: 46.493804931640625
MSE 4: 52.914886474609375
MSE 5: 47.30628967285156
MSE 6: 49.03553009033203
MSE 7: 60.71516036987305
MSE 8: 47.66618347167969
MSE 9: 43.844146728515625
MSE 10: 45.93512725830078
MSE 11: 39.63486862182617
MSE 12: 39.81464385986328
MSE 13: 47.558895111083984
MSE 14: 60.66805648803711
MSE 15: 44.92026138305664
MSE 16: 43.12477111816406
MSE 17: 40.7577018737793
MSE 18: 41.535213470458984
MSE 19: 40.873756408691406
MSE 20: 41.1195068359375
MSE 21: 38.20732498168945
MSE 22: 39.16508102416992
MSE 23: 33.70100402832031
MSE 24: 41.31883239746094
MSE 25: 40.661094665527344
MSE 26: 41.92958068847656
MSE 27: 39.298606872558594
MSE 28: 45.551998138427734
MSE 29: 41.86195755004883
MSE 30: 37.17485046386719
MSE 31: 41.800933837890625
MSE 32: 32.65897750854492
MSE 33: 38.50413131713867
MSE 34: 36.78211212158203
MSE 35: 36.893882751464844
MSE 36: 41.13296890258789
MSE 37: 41.84742736816406
MSE 38: 40.785072326660156
MSE 39: 34