### Regression Model with Keras

### Download and Clean Dataset

In [1]:
#Download and clean dataset
import pandas as pd
import numpy as np
concrete_data = pd.read_csv('https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv')
concrete_data.head()

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20191225133339-0000
KERNEL_ID = 7aee22d4-4927-47a5-9d7f-dcec5221d9a9


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
concrete_data.shape

(1030, 9)

In [4]:
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [13]:
#Split data into predicators X and target y

concrete_data_columns = concrete_data.columns

X = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
y = concrete_data['Strength'] # Strength column

In [15]:
#The last step is to normalize the data by substracting the mean and dividing by the standard deviation.
X_norm = (X - X.mean()) / X.std()
X_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


In [38]:

#1 Randomly split the data into a training and test sets by holding 30% of the data for testing. You can use the train_test_split helper function from Scikit-learn.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_norm, y, train_size=0.70,test_size=0.30)
print ("X_train: ", X_train)
print ("y_train: ", y_train)

X_train:          Cement  Blast Furnace Slag   Fly Ash     Water  Superplasticizer  \
351  -0.647500           -0.856472  1.875270 -1.262855          0.919901   
804   1.070099           -0.856472 -0.846733  0.488555         -1.038638   
146   1.376300            0.375573 -0.846733 -1.314367          1.723404   
649  -1.094363            2.043411 -0.846733  1.027091         -1.038638   
471   1.577245           -0.578306  0.387700 -0.916319          0.903161   
574  -1.391952            1.502146 -0.846733  0.193532         -1.038638   
363  -0.602527           -0.223644  1.087733 -1.909097          0.953380   
792   0.649072           -0.856472 -0.846733  0.488555         -1.038638   
813   0.275889           -0.856472 -0.846733  0.488555         -1.038638   
487   1.012686           -0.624667  0.622086 -1.150465          0.903161   
613  -0.039881           -0.856472 -0.846733  0.441726         -1.038638   
672  -1.035036            0.487998 -0.846733  0.488555         -1.038638   
10

### Build Neural Network

In [46]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_squared_error

#three  hidden layers, each of 10 nodes, and a ReLU activation function
#Use the adam optimizer and the mean squared error as the loss function.
n_cols = X_train.shape[1] # number of predictors
# define regression model
def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    
    # compile model
    model.compile(loss= "mean_squared_error" , optimizer="adam", metrics=["mean_squared_error"])
    return model

In [47]:
#2 Train the model on the training data using 50 epochs.
# build the model
model = regression_model()
# fit the model
model.fit(X_train, y_train, epochs=50, verbose=2)

Epoch 1/50
 - 1s - loss: 1600.5983 - mean_squared_error: 1600.5983
Epoch 2/50
 - 0s - loss: 1582.2750 - mean_squared_error: 1582.2750
Epoch 3/50
 - 0s - loss: 1561.0390 - mean_squared_error: 1561.0390
Epoch 4/50
 - 0s - loss: 1531.4615 - mean_squared_error: 1531.4615
Epoch 5/50
 - 0s - loss: 1489.8436 - mean_squared_error: 1489.8436
Epoch 6/50
 - 0s - loss: 1430.8264 - mean_squared_error: 1430.8264
Epoch 7/50
 - 0s - loss: 1346.9038 - mean_squared_error: 1346.9038
Epoch 8/50
 - 0s - loss: 1229.3031 - mean_squared_error: 1229.3031
Epoch 9/50
 - 0s - loss: 1075.5778 - mean_squared_error: 1075.5778
Epoch 10/50
 - 0s - loss: 891.1986 - mean_squared_error: 891.1986
Epoch 11/50
 - 0s - loss: 694.4176 - mean_squared_error: 694.4176
Epoch 12/50
 - 0s - loss: 522.3003 - mean_squared_error: 522.3003
Epoch 13/50
 - 0s - loss: 400.8604 - mean_squared_error: 400.8604
Epoch 14/50
 - 0s - loss: 329.7928 - mean_squared_error: 329.7928
Epoch 15/50
 - 0s - loss: 291.9382 - mean_squared_error: 291.9382
E

<keras.callbacks.History at 0x7f74583b6ef0>

In [48]:
#3 Evaluate the model on the test data and compute the mean squared error between the predicted concrete strength and the actual concrete strength. 
#You can use the mean_squared_error function from Scikit-learn.

#Report the mean and the standard deviation of the mean squared errors.
pred_train= model.predict(X_train)
print(mean_squared_error(y_train,pred_train))

pred= model.predict(X_test)
print(mean_squared_error(y_test,pred)) 

133.2762379593066
135.90169366774606


In [49]:
#4. Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.
listmse=[]; 
for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(X_norm, y, train_size=0.70,test_size=0.30)
    model.fit(X_train, y_train, epochs=50, verbose=2)
    pred= model.predict(X_test)
    listmse.append(mean_squared_error(y_test,pred)) 
    


Epoch 1/50
 - 0s - loss: 136.0398 - mean_squared_error: 136.0398
Epoch 2/50
 - 0s - loss: 134.7874 - mean_squared_error: 134.7874
Epoch 3/50
 - 0s - loss: 134.0035 - mean_squared_error: 134.0035
Epoch 4/50
 - 0s - loss: 133.3673 - mean_squared_error: 133.3673
Epoch 5/50
 - 0s - loss: 132.7798 - mean_squared_error: 132.7798
Epoch 6/50
 - 0s - loss: 132.1391 - mean_squared_error: 132.1391
Epoch 7/50
 - 0s - loss: 131.5492 - mean_squared_error: 131.5492
Epoch 8/50
 - 0s - loss: 131.3403 - mean_squared_error: 131.3403
Epoch 9/50
 - 0s - loss: 130.5640 - mean_squared_error: 130.5640
Epoch 10/50
 - 0s - loss: 130.2486 - mean_squared_error: 130.2486
Epoch 11/50
 - 0s - loss: 129.7222 - mean_squared_error: 129.7222
Epoch 12/50
 - 0s - loss: 129.2509 - mean_squared_error: 129.2509
Epoch 13/50
 - 0s - loss: 128.9025 - mean_squared_error: 128.9025
Epoch 14/50
 - 0s - loss: 128.4463 - mean_squared_error: 128.4463
Epoch 15/50
 - 0s - loss: 128.0290 - mean_squared_error: 128.0290
Epoch 16/50
 - 0s -

In [50]:
#5. Report the mean and the standard deviation of the mean squared errors.
print("With Normalization, 3 hidden layers, 50 epochs the Mean of the mean_squared_error as:", np.mean(listmse)) 
print("With Normalization, 3 hidden layers 50 epochs the Std of the mean_squared_error as:", np.std(listmse)) 

With Normalization, 3 hidden layers, 50 epochs the Mean of the mean_squared_error as: 29.500678047103555
With Normalization, 3 hidden layers 50 epochs the Std of the mean_squared_error as: 16.308767253621145
