# build a regression model using the deep learning Keras library, and predict the strength of different samples of concrete based on the volumes of the different ingredients that were used to make them.


## Import and Clean the dataset

### Import the pandas and numpy libraries

In [1]:
import pandas as pd
import numpy as np

### download the data and read it into a pandas dataframe

In [2]:
df = pd.read_csv('https://cocl.us/concrete_data')
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


### check the shape of the dataset

In [3]:
df.shape

(1030, 9)

### check the dataset for any missing values

In [4]:
df.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [5]:
df.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

based on above obervations, there is no missing value, and the data is pretty clearn

## Split data into predictors and target

### the column "Strength" is the targets, and other columns are the predictors

In [6]:
df_columns = df.columns # get the names of all columns

predictors = df[df_columns[df_columns != "Strength"]] # all columns except "Strength" column
target = df['Strength']  # Strength column

### Quick check the  predcitors dataframe

In [7]:
predictors.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


### Quick check the target dataframe

In [8]:
target.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

### save the number of predcitors to n_cols since we will need this number when we build the network

In [9]:
n_cols = predictors.shape[1]  # number of predictors

# Import Keras

In [10]:
import keras

Using TensorFlow backend.


### import the packages from Keras library

In [11]:
from keras.models import Sequential
from keras.layers import Dense

# Part A, build out baseline model

## Build a Neural Network

let's define a function that defines our regression model

In [12]:
# define the regression model

def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation = 'relu', input_shape=(n_cols, )))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer = 'adam', loss = 'mean_squared_error')
    
    return model

## Train and Test the Network

Let's call the function now to create out model

In [13]:
# build the model
model = regression_model()

Instructions for updating:
Colocations handled automatically by placer.


### randomly split 30% of data for testing by using train_test_split from sklearn library

In [14]:
from sklearn.model_selection import train_test_split   # import from sklearn 
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size = 0.3, random_state = 42)

we will train and test the model at the same time using the fit method. we will leave 30% data 

for validation, and we will train the model for 50 epochs. Also we will run 30 times.

In [15]:
mean_squared_errors = []    # create a empty list 
n_time = 30                # number time to run the evaluation

for n in range(n_time):
    
    # train and test the model
    model.fit(X_train, y_train, validation_split=0.2, epochs = 50, verbose=2)
    
    # elaluate the model
    mse = model.evaluate(X_test, y_test, verbose=0)
    
    # append the mse to the mean_squared_error list
    mean_squared_errors.append(mse)

Instructions for updating:
Use tf.cast instead.
Train on 576 samples, validate on 145 samples
Epoch 1/50
 - 1s - loss: 2017.1483 - val_loss: 1148.0260
Epoch 2/50
 - 0s - loss: 906.9671 - val_loss: 607.1387
Epoch 3/50
 - 0s - loss: 597.3432 - val_loss: 554.1545
Epoch 4/50
 - 0s - loss: 527.9401 - val_loss: 512.8420
Epoch 5/50
 - 0s - loss: 486.0849 - val_loss: 480.4436
Epoch 6/50
 - 0s - loss: 453.2085 - val_loss: 449.2875
Epoch 7/50
 - 0s - loss: 427.9030 - val_loss: 421.4023
Epoch 8/50
 - 0s - loss: 407.1924 - val_loss: 401.2809
Epoch 9/50
 - 0s - loss: 386.7248 - val_loss: 383.3064
Epoch 10/50
 - 0s - loss: 370.5302 - val_loss: 369.3667
Epoch 11/50
 - 0s - loss: 356.7383 - val_loss: 356.6440
Epoch 12/50
 - 0s - loss: 345.1075 - val_loss: 345.3810
Epoch 13/50
 - 0s - loss: 334.8991 - val_loss: 333.7514
Epoch 14/50
 - 0s - loss: 325.8487 - val_loss: 325.1787
Epoch 15/50
 - 0s - loss: 318.5118 - val_loss: 317.8797
Epoch 16/50
 - 0s - loss: 311.6726 - val_loss: 311.1845
Epoch 17/50
 - 0s

## Report the mean and the standard deviation of the mean squared errors.

In [16]:
mean_squared_errors = pd.DataFrame(mean_squared_errors)   # covert list to dataframe
part_A_mean =mean_squared_errors.mean()[0]                # calculate mean of mean_squared_errors
part_A_std = mean_squared_errors.std()[0]                 # calculate standard deviation of mean_squared_errors
print("Mean of mean_squared_errors: " + str(part_A_mean))
print("Standard deviation of mean_squared_errors: " + str(part_A_std))

Mean of mean_squared_errors: 62.18566724906847
Standard deviation of mean_squared_errors: 32.94790840235276


# Part B: Repeat Part A but use a normalized version of the data. Recall that one way to normalize the data is by subtracting the mean from the individual predictors and dividing by the standard deviation.

### normalize the predictors dataset by substracting the mean and dividing by the standard deviation

In [17]:
predictors_norm = (predictors - predictors.mean()) /  predictors.std()

predictors_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


## train & test the network

call the regression_model function to create a model

In [18]:
# build the model
model_norm = regression_model()

### randomly split 30% of data for testing by using train_test_split from sklearn library

In [19]:
X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size = 0.3, random_state = 42)

### we use the fit method to train the model on the normalized data

In [20]:
mean_squared_errors_norm = []            # create a empty list 
n_time = 30                             # number time to run the evaluation

for n in range(n_time):
    
    # train and test the model
    model_norm.fit(X_train, y_train, validation_split=0.2, epochs = 50, verbose=2)
    
    # elaluate the model
    mse_norm = model_norm.evaluate(X_test, y_test, verbose=0)
    
    # append the mse to the mean_squared_error list
    mean_squared_errors_norm.append(mse_norm)

Train on 576 samples, validate on 145 samples
Epoch 1/50
 - 1s - loss: 1553.2969 - val_loss: 1580.9172
Epoch 2/50
 - 0s - loss: 1539.4560 - val_loss: 1567.3129
Epoch 3/50
 - 0s - loss: 1525.1529 - val_loss: 1553.5918
Epoch 4/50
 - 0s - loss: 1510.5815 - val_loss: 1539.3812
Epoch 5/50
 - 0s - loss: 1494.9634 - val_loss: 1525.2685
Epoch 6/50
 - 0s - loss: 1479.4253 - val_loss: 1510.2859
Epoch 7/50
 - 0s - loss: 1463.0318 - val_loss: 1494.5177
Epoch 8/50
 - 0s - loss: 1445.6224 - val_loss: 1478.3500
Epoch 9/50
 - 0s - loss: 1427.8446 - val_loss: 1461.5162
Epoch 10/50
 - 0s - loss: 1409.5137 - val_loss: 1443.8004
Epoch 11/50
 - 0s - loss: 1389.9930 - val_loss: 1425.6910
Epoch 12/50
 - 0s - loss: 1369.9697 - val_loss: 1406.7040
Epoch 13/50
 - 0s - loss: 1349.4163 - val_loss: 1386.8272
Epoch 14/50
 - 0s - loss: 1327.8828 - val_loss: 1366.3149
Epoch 15/50
 - 0s - loss: 1305.8014 - val_loss: 1345.3186
Epoch 16/50
 - 0s - loss: 1283.0410 - val_loss: 1323.8830
Epoch 17/50
 - 0s - loss: 1259.6882

## Report the mean and the standard deviation of the mean squared errors norm

In [21]:
mean_squared_errors_norm = pd.DataFrame(mean_squared_errors_norm)   # covert list to dataframe
part_B_mean = mean_squared_errors_norm.mean()[0]    # calculate mean of mean_squared_errors_norm
part_B_std = mean_squared_errors_norm.std()[0]      # calculate standard deviation of mean_squared_errors_norm
print("Mean of mean_squared_errors_norm: " + str(part_B_mean))
print("Standard deviation of mean_squared_errors_norm: " + str(part_B_std))

Mean of mean_squared_errors_norm: 73.24886618041168
Standard deviation of mean_squared_errors_norm: 84.02612068212252


## compare the Mean & Standard deviation from Part A

In [22]:
compared_A_B= pd.DataFrame([[part_A_mean, part_B_mean],
                              [part_A_std, part_B_std]],
                             columns = ['MSE', 'MSE_NORM'], index = ['MEAN', 'STD']
                            )
compared_A_B

Unnamed: 0,MSE,MSE_NORM
MEAN,62.185667,73.248866
STD,32.947908,84.026121


# Part C: Increate the number of epochs 

Repeat Part B but use 100 epochs this time for training.



In [23]:
# build the model
model_norm = regression_model()

# randomly split 30% of data for testing by using train_test_split from sklearn library
X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size = 0.3, random_state = 42)

mean_squared_errors_norm = []            # create a empty list 
n_time = 30                             # number time to run the evaluation

for n in range(n_time):
    
    # train and test the model
    model_norm.fit(X_train, y_train, validation_split=0.2, epochs = 100, verbose=2)
    
    # elaluate the model
    mse_norm = model_norm.evaluate(X_test, y_test, verbose=0)
    
    # append the mse to the mean_squared_error list
    mean_squared_errors_norm.append(mse_norm)


Train on 576 samples, validate on 145 samples
Epoch 1/100
 - 1s - loss: 1575.5328 - val_loss: 1595.9863
Epoch 2/100
 - 0s - loss: 1561.6621 - val_loss: 1582.3781
Epoch 3/100
 - 0s - loss: 1547.5663 - val_loss: 1568.3386
Epoch 4/100
 - 0s - loss: 1533.1584 - val_loss: 1553.9790
Epoch 5/100
 - 0s - loss: 1517.9498 - val_loss: 1539.3783
Epoch 6/100
 - 0s - loss: 1502.4442 - val_loss: 1524.2157
Epoch 7/100
 - 0s - loss: 1486.4355 - val_loss: 1508.3540
Epoch 8/100
 - 0s - loss: 1469.8012 - val_loss: 1491.8174
Epoch 9/100
 - 0s - loss: 1452.1642 - val_loss: 1475.0856
Epoch 10/100
 - 0s - loss: 1434.4823 - val_loss: 1457.2050
Epoch 11/100
 - 0s - loss: 1415.6144 - val_loss: 1438.8406
Epoch 12/100
 - 0s - loss: 1396.3271 - val_loss: 1419.6045
Epoch 13/100
 - 0s - loss: 1375.7986 - val_loss: 1400.2392
Epoch 14/100
 - 0s - loss: 1354.9526 - val_loss: 1379.9137
Epoch 15/100
 - 0s - loss: 1333.6854 - val_loss: 1358.5068
Epoch 16/100
 - 0s - loss: 1311.2344 - val_loss: 1336.8396
Epoch 17/100
 - 0s 

In [24]:
mean_squared_errors_norm = pd.DataFrame(mean_squared_errors_norm)   # covert list to dataframe
part_C_mean = mean_squared_errors_norm.mean()[0]    # calculate mean of mean_squared_errors_norm
part_C_std = mean_squared_errors_norm.std()[0]      # calculate standard deviation of mean_squared_errors_norm
print("Mean of mean_squared_errors_norm: " + str(part_C_mean))
print("Standard deviation of mean_squared_errors_norm: " + str(part_C_std))

Mean of mean_squared_errors_norm: 52.8914804884531
Standard deviation of mean_squared_errors_norm: 27.17430807753245


## compare the Mean & Standard deviation from Part B

In [25]:
compared_B_C= pd.DataFrame([[part_B_mean, part_C_mean],
                              [part_B_std, part_C_std]],
                             columns = ['Part B MSE_NORM', 'Part C MSE_NORM'], index = ['MEAN', 'STD']
                            )
compared_B_C

Unnamed: 0,Part B MSE_NORM,Part C MSE_NORM
MEAN,73.248866,52.89148
STD,84.026121,27.174308


# Part D: Increase the number of hidden layers

Repeat part B but use a neural network with the following instead:

- Three hidden layers, each of 10 nodes and ReLU activation function.



In [26]:
# define the regression model with 3 hidden layers

def regression_3layers_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation = 'relu', input_shape=(n_cols, )))
    model.add(Dense(10, activation = 'relu'))
    model.add(Dense(10, activation = 'relu'))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer = 'adam', loss = 'mean_squared_error')
    
    return model

In [27]:
# build the model
model_norm = regression_3layers_model()

# randomly split 30% of data for testing by using train_test_split from sklearn library
X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size = 0.3, random_state = 42)

mean_squared_errors_norm = []            # create a empty list 
n_time = 30                             # number time to run the evaluation

for n in range(n_time):
    
    # train and test the model
    model_norm.fit(X_train, y_train, validation_split=0.2, epochs = 50, verbose=2)
    
    # elaluate the model
    mse_norm = model_norm.evaluate(X_test, y_test, verbose=0)
    
    # append the mse to the mean_squared_error list
    mean_squared_errors_norm.append(mse_norm)
    
mean_squared_errors_norm = pd.DataFrame(mean_squared_errors_norm)   # covert list to dataframe
print("Mean of mean_squared_errors_norm: " + str(mean_squared_errors_norm.mean()[0]))
print("Standard deviation of mean_squared_errors_norm: " + str(mean_squared_errors_norm.std()[0]))# build the model


Train on 576 samples, validate on 145 samples
Epoch 1/50
 - 2s - loss: 1567.1068 - val_loss: 1584.0033
Epoch 2/50
 - 0s - loss: 1548.3079 - val_loss: 1562.8282
Epoch 3/50
 - 0s - loss: 1523.0022 - val_loss: 1532.6327
Epoch 4/50
 - 0s - loss: 1486.1611 - val_loss: 1488.8529
Epoch 5/50
 - 0s - loss: 1433.4165 - val_loss: 1425.4965
Epoch 6/50
 - 0s - loss: 1356.4152 - val_loss: 1334.0369
Epoch 7/50
 - 0s - loss: 1248.5854 - val_loss: 1205.0700
Epoch 8/50
 - 0s - loss: 1101.8613 - val_loss: 1036.4062
Epoch 9/50
 - 0s - loss: 915.5225 - val_loss: 836.2845
Epoch 10/50
 - 0s - loss: 710.1957 - val_loss: 619.1564
Epoch 11/50
 - 0s - loss: 506.7810 - val_loss: 426.9769
Epoch 12/50
 - 0s - loss: 351.3698 - val_loss: 290.5609
Epoch 13/50
 - 0s - loss: 266.0435 - val_loss: 219.2358
Epoch 14/50
 - 0s - loss: 235.1081 - val_loss: 192.1606
Epoch 15/50
 - 0s - loss: 222.7264 - val_loss: 183.4938
Epoch 16/50
 - 0s - loss: 214.0839 - val_loss: 178.6879
Epoch 17/50
 - 0s - loss: 207.5873 - val_loss: 177.

In [28]:
mean_squared_errors_norm = pd.DataFrame(mean_squared_errors_norm)   # covert list to dataframe
part_D_mean = mean_squared_errors_norm.mean()[0]    # calculate mean of mean_squared_errors_norm
part_D_std = mean_squared_errors_norm.std()[0]      # calculate standard deviation of mean_squared_errors_norm
print("Mean of mean_squared_errors_norm: " + str(part_D_mean))
print("Standard deviation of mean_squared_errors_norm: " + str(part_D_std))

Mean of mean_squared_errors_norm: 56.26549291703307
Standard deviation of mean_squared_errors_norm: 24.1725100049137


## compare the Mean & Standard deviation from Part C

In [30]:
compared_B_D= pd.DataFrame([[part_B_mean, part_D_mean],
                              [part_B_std, part_D_std]],
                             columns = ['Part C MSE_NORM', 'Part D MSE_NORM'], index = ['MEAN', 'STD']
                            )
compared_B_D

Unnamed: 0,Part C MSE_NORM,Part D MSE_NORM
MEAN,73.248866,56.265493
STD,84.026121,24.17251
