# build a regression model using the deep learning Keras library, and predict the strength of different samples of concrete based on the volumes of the different ingredients that were used to make them.


## Import and Clean the dataset

### Import the pandas and numpy libraries

In [1]:
import pandas as pd
import numpy as np

### download the data and read it into a pandas dataframe

In [2]:
df = pd.read_csv('https://cocl.us/concrete_data')
df.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


### check the shape of the dataset

In [3]:
df.shape

(1030, 9)

### check the dataset for any missing values

In [4]:
df.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [5]:
df.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

based on above obervations, there is no missing value, and the data is pretty clearn

## Split data into predictors and target

### the column "Strength" is the targets, and other columns are the predictors

In [6]:
df_columns = df.columns # get the names of all columns

predictors = df[df_columns[df_columns != "Strength"]] # all columns except "Strength" column
target = df['Strength']  # Strength column

### Quick check the  predcitors dataframe

In [7]:
predictors.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360


### Quick check the target dataframe

In [8]:
target.head()

0    79.99
1    61.89
2    40.27
3    41.05
4    44.30
Name: Strength, dtype: float64

### save the number of predcitors to n_cols since we will need this number when we build the network

In [9]:
n_cols = predictors.shape[1]  # number of predictors

# Import Keras

In [10]:
import keras

Using TensorFlow backend.


### import the packages from Keras library

In [11]:
from keras.models import Sequential
from keras.layers import Dense

# Part A, build out baseline model

## Build a Neural Network

let's define a function that defines our regression model

In [12]:
# define the regression model

def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation = 'relu', input_shape=(n_cols, )))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer = 'adam', loss = 'mean_squared_error')
    
    return model

## Train and Test the Network

Let's call the function now to create out model

In [13]:
# build the model
model = regression_model()

Instructions for updating:
Colocations handled automatically by placer.


### randomly split 30% of data for testing by using train_test_split from sklearn library

In [14]:
from sklearn.model_selection import train_test_split   # import from sklearn 
X_train, X_test, y_train, y_test = train_test_split(predictors, target, test_size = 0.3, random_state = 42)

we will train and test the model at the same time using the fit method. we will leave 30% data 

for validation, and we will train the model for 50 epochs. Also we will run 30 times.

In [15]:
mean_squared_errors = []    # create a empty list 
n_time = 50                # number time to run the evaluation

for n in range(n_time):
    
    # train and test the model
    model.fit(X_train, y_train, validation_split=0.2, epochs = 50, verbose=2)
    
    # elaluate the model
    mse = model.evaluate(X_test, y_test, verbose=0)
    
    # append the mse to the mean_squared_error list
    mean_squared_errors.append(mse)

Instructions for updating:
Use tf.cast instead.
Train on 576 samples, validate on 145 samples
Epoch 1/50
 - 1s - loss: 100008.7856 - val_loss: 62138.4655
Epoch 2/50
 - 0s - loss: 35933.9257 - val_loss: 17296.6871
Epoch 3/50
 - 0s - loss: 8094.9854 - val_loss: 3685.0540
Epoch 4/50
 - 0s - loss: 2608.3276 - val_loss: 2400.7782
Epoch 5/50
 - 0s - loss: 2452.0204 - val_loss: 2259.2300
Epoch 6/50
 - 0s - loss: 2241.7357 - val_loss: 2136.5815
Epoch 7/50
 - 0s - loss: 2081.7587 - val_loss: 2009.7363
Epoch 8/50
 - 0s - loss: 1951.4785 - val_loss: 1873.7692
Epoch 9/50
 - 0s - loss: 1821.2129 - val_loss: 1724.5556
Epoch 10/50
 - 0s - loss: 1687.2051 - val_loss: 1595.6192
Epoch 11/50
 - 0s - loss: 1564.7221 - val_loss: 1479.1014
Epoch 12/50
 - 0s - loss: 1448.8877 - val_loss: 1373.8450
Epoch 13/50
 - 0s - loss: 1337.9296 - val_loss: 1248.3933
Epoch 14/50
 - 0s - loss: 1232.7540 - val_loss: 1160.3092
Epoch 15/50
 - 0s - loss: 1135.7892 - val_loss: 1069.8984
Epoch 16/50
 - 0s - loss: 1050.0586 - va

## Report the mean and the standard deviation of the mean squared errors.

In [16]:
mean_squared_errors = pd.DataFrame(mean_squared_errors)   # covert list to dataframe
part_A_mean =mean_squared_errors.mean()[0]                # calculate mean of mean_squared_errors
part_A_std = mean_squared_errors.std()[0]                 # calculate standard deviation of mean_squared_errors
print("Mean of mean_squared_errors: " + str(part_A_mean))
print("Standard deviation of mean_squared_errors: " + str(part_A_std))

Mean of mean_squared_errors: 60.66663203403015
Standard deviation of mean_squared_errors: 20.256626377183515


# Part B: Repeat Part A but use a normalized version of the data. Recall that one way to normalize the data is by subtracting the mean from the individual predictors and dividing by the standard deviation.

### normalize the predictors dataset by substracting the mean and dividing by the standard deviation

In [17]:
predictors_norm = (predictors - predictors.mean()) /  predictors.std()

predictors_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


## train & test the network

call the regression_model function to create a model

In [18]:
# build the model
model_norm = regression_model()

### randomly split 30% of data for testing by using train_test_split from sklearn library

In [19]:
X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size = 0.3, random_state = 42)

### we use the fit method to train the model on the normalized data

In [20]:
mean_squared_errors_norm = []            # create a empty list 
n_time = 50                             # number time to run the evaluation

for n in range(n_time):
    
    # train and test the model
    model_norm.fit(X_train, y_train, validation_split=0.2, epochs = 50, verbose=2)
    
    # elaluate the model
    mse_norm = model_norm.evaluate(X_test, y_test, verbose=0)
    
    # append the mse to the mean_squared_error list
    mean_squared_errors_norm.append(mse_norm)

Train on 576 samples, validate on 145 samples
Epoch 1/50
 - 1s - loss: 1558.8647 - val_loss: 1583.1554
Epoch 2/50
 - 0s - loss: 1547.0040 - val_loss: 1572.1394
Epoch 3/50
 - 0s - loss: 1534.9413 - val_loss: 1560.5756
Epoch 4/50
 - 0s - loss: 1522.3753 - val_loss: 1548.6276
Epoch 5/50
 - 0s - loss: 1508.9771 - val_loss: 1536.3195
Epoch 6/50
 - 0s - loss: 1495.3528 - val_loss: 1523.0235
Epoch 7/50
 - 0s - loss: 1480.6313 - val_loss: 1509.0590
Epoch 8/50
 - 0s - loss: 1465.1858 - val_loss: 1494.3418
Epoch 9/50
 - 0s - loss: 1448.8861 - val_loss: 1478.5672
Epoch 10/50
 - 0s - loss: 1431.3689 - val_loss: 1461.9640
Epoch 11/50
 - 0s - loss: 1413.1152 - val_loss: 1444.0393
Epoch 12/50
 - 0s - loss: 1393.7718 - val_loss: 1424.8575
Epoch 13/50
 - 0s - loss: 1372.9389 - val_loss: 1405.1686
Epoch 14/50
 - 0s - loss: 1351.4914 - val_loss: 1384.0534
Epoch 15/50
 - 0s - loss: 1329.2010 - val_loss: 1361.5650
Epoch 16/50
 - 0s - loss: 1304.8615 - val_loss: 1339.3158
Epoch 17/50
 - 0s - loss: 1280.9163

## Report the mean and the standard deviation of the mean squared errors norm

In [21]:
mean_squared_errors_norm = pd.DataFrame(mean_squared_errors_norm)   # covert list to dataframe
part_B_mean = mean_squared_errors_norm.mean()[0]    # calculate mean of mean_squared_errors_norm
part_B_std = mean_squared_errors_norm.std()[0]      # calculate standard deviation of mean_squared_errors_norm
print("Mean of mean_squared_errors_norm: " + str(part_B_mean))
print("Standard deviation of mean_squared_errors_norm: " + str(part_B_std))

Mean of mean_squared_errors_norm: 60.99763819463046
Standard deviation of mean_squared_errors_norm: 58.874688030759046


## compare the Mean & Standard deviation from Part A

In [22]:
compared_A_B= pd.DataFrame([[part_A_mean, part_B_mean],
                              [part_A_std, part_B_std]],
                             columns = ['MSE', 'MSE_NORM'], index = ['MEAN', 'STD']
                            )
compared_A_B

Unnamed: 0,MSE,MSE_NORM
MEAN,60.666632,60.997638
STD,20.256626,58.874688


# Part C: Increate the number of epochs 

Repeat Part B but use 100 epochs this time for training.



In [23]:
# build the model
model_norm = regression_model()

# randomly split 30% of data for testing by using train_test_split from sklearn library
X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size = 0.3, random_state = 42)

mean_squared_errors_norm = []            # create a empty list 
n_time = 50                             # number time to run the evaluation

for n in range(n_time):
    
    # train and test the model
    model_norm.fit(X_train, y_train, validation_split=0.2, epochs = 100, verbose=2)
    
    # elaluate the model
    mse_norm = model_norm.evaluate(X_test, y_test, verbose=0)
    
    # append the mse to the mean_squared_error list
    mean_squared_errors_norm.append(mse_norm)


Train on 576 samples, validate on 145 samples
Epoch 1/100
 - 1s - loss: 1556.1067 - val_loss: 1578.0889
Epoch 2/100
 - 0s - loss: 1542.9645 - val_loss: 1565.7085
Epoch 3/100
 - 0s - loss: 1529.1806 - val_loss: 1553.2822
Epoch 4/100
 - 0s - loss: 1515.4261 - val_loss: 1540.1859
Epoch 5/100
 - 0s - loss: 1500.7193 - val_loss: 1526.6395
Epoch 6/100
 - 0s - loss: 1485.6747 - val_loss: 1512.2440
Epoch 7/100
 - 0s - loss: 1469.7343 - val_loss: 1497.1703
Epoch 8/100
 - 0s - loss: 1453.2475 - val_loss: 1481.2349
Epoch 9/100
 - 0s - loss: 1435.5844 - val_loss: 1464.8998
Epoch 10/100
 - 0s - loss: 1417.7356 - val_loss: 1447.3815
Epoch 11/100
 - 0s - loss: 1398.9024 - val_loss: 1429.1732
Epoch 12/100
 - 0s - loss: 1379.0929 - val_loss: 1410.3506
Epoch 13/100
 - 0s - loss: 1358.7081 - val_loss: 1390.7805
Epoch 14/100
 - 0s - loss: 1337.7630 - val_loss: 1370.3844
Epoch 15/100
 - 0s - loss: 1315.8793 - val_loss: 1349.2613
Epoch 16/100
 - 0s - loss: 1293.2316 - val_loss: 1327.7349
Epoch 17/100
 - 0s 

In [24]:
mean_squared_errors_norm = pd.DataFrame(mean_squared_errors_norm)   # covert list to dataframe
part_C_mean = mean_squared_errors_norm.mean()[0]    # calculate mean of mean_squared_errors_norm
part_C_std = mean_squared_errors_norm.std()[0]      # calculate standard deviation of mean_squared_errors_norm
print("Mean of mean_squared_errors_norm: " + str(part_C_mean))
print("Standard deviation of mean_squared_errors_norm: " + str(part_C_std))

Mean of mean_squared_errors_norm: 76.80322246316953
Standard deviation of mean_squared_errors_norm: 15.302778934473867


## compare the Mean & Standard deviation from Part B

In [25]:
compared_B_C= pd.DataFrame([[part_B_mean, part_C_mean],
                              [part_B_std, part_C_std]],
                             columns = ['Part B MSE_NORM', 'Part C MSE_NORM'], index = ['MEAN', 'STD']
                            )
compared_B_C

Unnamed: 0,Part B MSE_NORM,Part C MSE_NORM
MEAN,60.997638,76.803222
STD,58.874688,15.302779


# Part D: Increase the number of hidden layers

Repeat part B but use a neural network with the following instead:

- Three hidden layers, each of 10 nodes and ReLU activation function.



In [26]:
# define the regression model with 3 hidden layers

def regression_3layers_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation = 'relu', input_shape=(n_cols, )))
    model.add(Dense(10, activation = 'relu'))
    model.add(Dense(10, activation = 'relu'))
    model.add(Dense(1))
    
    # compile model
    model.compile(optimizer = 'adam', loss = 'mean_squared_error')
    
    return model

In [27]:
# build the model
model_norm = regression_3layers_model()

# randomly split 30% of data for testing by using train_test_split from sklearn library
X_train, X_test, y_train, y_test = train_test_split(predictors_norm, target, test_size = 0.3, random_state = 42)

mean_squared_errors_norm = []            # create a empty list 
n_time = 50                             # number time to run the evaluation

for n in range(n_time):
    
    # train and test the model
    model_norm.fit(X_train, y_train, validation_split=0.2, epochs = 50, verbose=2)
    
    # elaluate the model
    mse_norm = model_norm.evaluate(X_test, y_test, verbose=0)
    
    # append the mse to the mean_squared_error list
    mean_squared_errors_norm.append(mse_norm)
    

Train on 576 samples, validate on 145 samples
Epoch 1/50
 - 2s - loss: 1575.2955 - val_loss: 1592.6651
Epoch 2/50
 - 0s - loss: 1557.1868 - val_loss: 1573.9132
Epoch 3/50
 - 0s - loss: 1533.3042 - val_loss: 1547.4208
Epoch 4/50
 - 0s - loss: 1499.9604 - val_loss: 1508.4719
Epoch 5/50
 - 0s - loss: 1452.0379 - val_loss: 1454.5430
Epoch 6/50
 - 0s - loss: 1385.4274 - val_loss: 1383.4680
Epoch 7/50
 - 0s - loss: 1300.0391 - val_loss: 1291.2487
Epoch 8/50
 - 0s - loss: 1189.9909 - val_loss: 1176.9211
Epoch 9/50
 - 0s - loss: 1054.5581 - val_loss: 1035.2877
Epoch 10/50
 - 0s - loss: 897.0149 - val_loss: 867.3553
Epoch 11/50
 - 0s - loss: 719.2979 - val_loss: 698.7594
Epoch 12/50
 - 0s - loss: 562.2221 - val_loss: 535.4438
Epoch 13/50
 - 0s - loss: 437.8321 - val_loss: 410.0129
Epoch 14/50
 - 0s - loss: 358.7159 - val_loss: 329.8179
Epoch 15/50
 - 0s - loss: 310.7192 - val_loss: 284.9649
Epoch 16/50
 - 0s - loss: 281.7707 - val_loss: 254.8622
Epoch 17/50
 - 0s - loss: 261.3715 - val_loss: 23

In [28]:
mean_squared_errors_norm = pd.DataFrame(mean_squared_errors_norm)   # covert list to dataframe
part_D_mean = mean_squared_errors_norm.mean()[0]    # calculate mean of mean_squared_errors_norm
part_D_std = mean_squared_errors_norm.std()[0]      # calculate standard deviation of mean_squared_errors_norm
print("Mean of mean_squared_errors_norm: " + str(part_D_mean))
print("Standard deviation of mean_squared_errors_norm: " + str(part_D_std))

Mean of mean_squared_errors_norm: 39.68893144021142
Standard deviation of mean_squared_errors_norm: 14.49566641134437


## compare the Mean & Standard deviation from Part C

In [31]:
compared_B_D= pd.DataFrame([[part_B_mean, part_D_mean],
                              [part_B_std, part_D_std]],
                             columns = ['Part B MSE_NORM', 'Part D MSE_NORM'], index = ['MEAN', 'STD']
                            )
compared_B_D

Unnamed: 0,Part B MSE_NORM,Part D MSE_NORM
MEAN,60.997638,39.688931
STD,58.874688,14.495666
