### Regression Model with Keras

### Download and Clean Dataset

In [1]:
#Download and clean dataset
import pandas as pd
import numpy as np
concrete_data = pd.read_csv('https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv')
concrete_data.head()

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20191225133339-0000
KERNEL_ID = 7aee22d4-4927-47a5-9d7f-dcec5221d9a9


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
concrete_data.shape

(1030, 9)

In [4]:
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [13]:
#Split data into predicators X and target y

concrete_data_columns = concrete_data.columns

X = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
y = concrete_data['Strength'] # Strength column

In [15]:
#The last step is to normalize the data by substracting the mean and dividing by the standard deviation.
X_norm = (X - X.mean()) / X.std()
X_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


In [38]:

#1 Randomly split the data into a training and test sets by holding 30% of the data for testing. You can use the train_test_split helper function from Scikit-learn.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_norm, y, train_size=0.70,test_size=0.30)
print ("X_train: ", X_train)
print ("y_train: ", y_train)

X_train:          Cement  Blast Furnace Slag   Fly Ash     Water  Superplasticizer  \
351  -0.647500           -0.856472  1.875270 -1.262855          0.919901   
804   1.070099           -0.856472 -0.846733  0.488555         -1.038638   
146   1.376300            0.375573 -0.846733 -1.314367          1.723404   
649  -1.094363            2.043411 -0.846733  1.027091         -1.038638   
471   1.577245           -0.578306  0.387700 -0.916319          0.903161   
574  -1.391952            1.502146 -0.846733  0.193532         -1.038638   
363  -0.602527           -0.223644  1.087733 -1.909097          0.953380   
792   0.649072           -0.856472 -0.846733  0.488555         -1.038638   
813   0.275889           -0.856472 -0.846733  0.488555         -1.038638   
487   1.012686           -0.624667  0.622086 -1.150465          0.903161   
613  -0.039881           -0.856472 -0.846733  0.441726         -1.038638   
672  -1.035036            0.487998 -0.846733  0.488555         -1.038638   
10

### Build Neural Network

In [40]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_squared_error

#One hidden layer of 10 nodes, and a ReLU activation function
#Use the adam optimizer and the mean squared error as the loss function.
n_cols = X_train.shape[1] # number of predictors
# define regression model
def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(1))
    
    # compile model
    model.compile(loss= "mean_squared_error" , optimizer="adam", metrics=["mean_squared_error"])
    return model

In [41]:
#2 Train the model on the training data using 100 epochs.
# build the model
model = regression_model()
# fit the model
model.fit(X_train, y_train, epochs=100, verbose=2)

Epoch 1/100
 - 0s - loss: 1523.5880 - mean_squared_error: 1523.5880
Epoch 2/100
 - 0s - loss: 1505.5317 - mean_squared_error: 1505.5317
Epoch 3/100
 - 0s - loss: 1486.9780 - mean_squared_error: 1486.9780
Epoch 4/100
 - 0s - loss: 1468.3091 - mean_squared_error: 1468.3091
Epoch 5/100
 - 0s - loss: 1448.9059 - mean_squared_error: 1448.9059
Epoch 6/100
 - 0s - loss: 1428.6293 - mean_squared_error: 1428.6293
Epoch 7/100
 - 0s - loss: 1408.1208 - mean_squared_error: 1408.1208
Epoch 8/100
 - 0s - loss: 1386.3207 - mean_squared_error: 1386.3207
Epoch 9/100
 - 0s - loss: 1363.8695 - mean_squared_error: 1363.8695
Epoch 10/100
 - 0s - loss: 1340.2761 - mean_squared_error: 1340.2761
Epoch 11/100
 - 0s - loss: 1315.5394 - mean_squared_error: 1315.5394
Epoch 12/100
 - 0s - loss: 1289.5880 - mean_squared_error: 1289.5880
Epoch 13/100
 - 0s - loss: 1262.0290 - mean_squared_error: 1262.0290
Epoch 14/100
 - 0s - loss: 1233.4994 - mean_squared_error: 1233.4994
Epoch 15/100
 - 0s - loss: 1203.1089 - mean

<keras.callbacks.History at 0x7f74a82f37f0>

In [42]:
#3 Evaluate the model on the test data and compute the mean squared error between the predicted concrete strength and the actual concrete strength. 
#You can use the mean_squared_error function from Scikit-learn.

#Report the mean and the standard deviation of the mean squared errors.
pred_train= model.predict(X_train)
print(mean_squared_error(y_train,pred_train))

pred= model.predict(X_test)
print(mean_squared_error(y_test,pred)) 

159.44357702307192
155.74792978837849


In [43]:
#4. Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.
listmse=[]; 
for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(X_norm, y, train_size=0.70,test_size=0.30)
    model.fit(X_train, y_train, epochs=100, verbose=2)
    pred= model.predict(X_test)
    listmse.append(mean_squared_error(y_test,pred)) 
    


Epoch 1/100
 - 0s - loss: 160.7253 - mean_squared_error: 160.7253
Epoch 2/100
 - 0s - loss: 159.7407 - mean_squared_error: 159.7407
Epoch 3/100
 - 0s - loss: 158.8684 - mean_squared_error: 158.8684
Epoch 4/100
 - 0s - loss: 158.1009 - mean_squared_error: 158.1009
Epoch 5/100
 - 0s - loss: 157.2479 - mean_squared_error: 157.2479
Epoch 6/100
 - 0s - loss: 156.4636 - mean_squared_error: 156.4636
Epoch 7/100
 - 0s - loss: 155.7652 - mean_squared_error: 155.7652
Epoch 8/100
 - 0s - loss: 155.0042 - mean_squared_error: 155.0042
Epoch 9/100
 - 0s - loss: 154.2645 - mean_squared_error: 154.2645
Epoch 10/100
 - 0s - loss: 153.5628 - mean_squared_error: 153.5628
Epoch 11/100
 - 0s - loss: 152.7453 - mean_squared_error: 152.7453
Epoch 12/100
 - 0s - loss: 152.0987 - mean_squared_error: 152.0987
Epoch 13/100
 - 0s - loss: 151.3235 - mean_squared_error: 151.3235
Epoch 14/100
 - 0s - loss: 150.6268 - mean_squared_error: 150.6268
Epoch 15/100
 - 0s - loss: 149.8609 - mean_squared_error: 149.8609
Epoc

In [44]:
#5. Report the mean and the standard deviation of the mean squared errors.
print("With Normalization with 100 epochs the Mean of the mean_squared_error as:", np.mean(listmse)) 
print("With Normalization with 100 epochs the Std of the mean_squared_error as:", np.std(listmse)) 

With Normalization with 100 epochs the Mean of the mean_squared_error as: 37.45378720672578
With Normalization with 100 epochs the Std of the mean_squared_error as: 8.242183264632228
