### Regression Model with Keras

### Download and Clean Dataset

In [1]:
#Download and clean dataset
import pandas as pd
import numpy as np
concrete_data = pd.read_csv('https://s3-api.us-geo.objectstorage.softlayer.net/cf-courses-data/CognitiveClass/DL0101EN/labs/data/concrete_data.csv')
concrete_data.head()

Waiting for a Spark session to start...
Spark Initialization Done! ApplicationId = app-20191225133339-0000
KERNEL_ID = 7aee22d4-4927-47a5-9d7f-dcec5221d9a9


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [3]:
concrete_data.shape

(1030, 9)

In [4]:
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [13]:
#Split data into predicators X and target y

concrete_data_columns = concrete_data.columns

X = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] # all columns except Strength
y = concrete_data['Strength'] # Strength column

In [15]:
#The last step is to normalize the data by substracting the mean and dividing by the standard deviation.
X_norm = (X - X.mean()) / X.std()
X_norm.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079,-0.279597
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079,-0.279597
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,3.55134
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829,5.055221
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569,4.976069


In [31]:

#1 Randomly split the data into a training and test sets by holding 30% of the data for testing. You can use the train_test_split helper function from Scikit-learn.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70,test_size=0.30)
print ("X_train: ", X_train)
print ("y_train: ", y_train)

X_train:        Cement  Blast Furnace Slag  Fly Ash  Water  Superplasticizer  \
562    382.5                 0.0      0.0  185.7               0.0   
727    331.0                 0.0      0.0  192.0               0.0   
360    218.2                54.6    123.8  140.8              11.9   
1029   260.9               100.5     78.3  200.6               8.6   
971    312.7               144.7      0.0  127.3               8.0   
304    277.1                 0.0     97.4  160.6              11.8   
608    236.0                 0.0      0.0  194.0               0.0   
391    397.0                17.2    158.0  167.0              20.8   
768    331.0                 0.0      0.0  192.0               0.0   
46     349.0                 0.0      0.0  192.0               0.0   
642    250.0                 0.0      0.0  182.0               0.0   
77     425.0               106.3      0.0  153.5              16.5   
702    200.0               133.0      0.0  192.0               0.0   
14     304

### Build Neural Network

In [33]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.metrics import mean_squared_error

#One hidden layer of 10 nodes, and a ReLU activation function
#Use the adam optimizer and the mean squared error as the loss function.
n_cols = X_train.shape[1] # number of predictors
# define regression model
def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
    model.add(Dense(1))
    
    # compile model
    model.compile(loss= "mean_squared_error" , optimizer="adam", metrics=["mean_squared_error"])
    return model

In [34]:
#2 Train the model on the training data using 50 epochs.
# build the model
model = regression_model()
# fit the model
model.fit(X_train, y_train, epochs=50, verbose=2)

Epoch 1/50
 - 0s - loss: 38313.2327 - mean_squared_error: 38313.2327
Epoch 2/50
 - 0s - loss: 7012.7297 - mean_squared_error: 7012.7297
Epoch 3/50
 - 0s - loss: 3199.6365 - mean_squared_error: 3199.6365
Epoch 4/50
 - 0s - loss: 2892.8436 - mean_squared_error: 2892.8436
Epoch 5/50
 - 0s - loss: 2612.7445 - mean_squared_error: 2612.7445
Epoch 6/50
 - 0s - loss: 2367.8781 - mean_squared_error: 2367.8781
Epoch 7/50
 - 0s - loss: 2152.6098 - mean_squared_error: 2152.6098
Epoch 8/50
 - 0s - loss: 1928.7960 - mean_squared_error: 1928.7960
Epoch 9/50
 - 0s - loss: 1735.7630 - mean_squared_error: 1735.7630
Epoch 10/50
 - 0s - loss: 1566.1677 - mean_squared_error: 1566.1677
Epoch 11/50
 - 0s - loss: 1420.4395 - mean_squared_error: 1420.4395
Epoch 12/50
 - 0s - loss: 1297.7123 - mean_squared_error: 1297.7123
Epoch 13/50
 - 0s - loss: 1192.4699 - mean_squared_error: 1192.4699
Epoch 14/50
 - 0s - loss: 1098.9918 - mean_squared_error: 1098.9918
Epoch 15/50
 - 0s - loss: 1016.6620 - mean_squared_erro

<keras.callbacks.History at 0x7f74b024b278>

In [35]:
#3 Evaluate the model on the test data and compute the mean squared error between the predicted concrete strength and the actual concrete strength. 
#You can use the mean_squared_error function from Scikit-learn.

#Report the mean and the standard deviation of the mean squared errors.
pred_train= model.predict(X_train)
print(mean_squared_error(y_train,pred_train))

pred= model.predict(X_test)
print(mean_squared_error(y_test,pred)) 

249.35565587244966
272.07566738543954


In [36]:
#4. Repeat steps 1 - 3, 50 times, i.e., create a list of 50 mean squared errors.
listmse=[]; 
for i in range(50):
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.70,test_size=0.30)
    model.fit(X_train, y_train, epochs=50, verbose=2)
    pred= model.predict(X_test)
    listmse.append(mean_squared_error(y_test,pred)) 
    


Epoch 1/50
 - 0s - loss: 260.6619 - mean_squared_error: 260.6619
Epoch 2/50
 - 0s - loss: 258.2684 - mean_squared_error: 258.2684
Epoch 3/50
 - 0s - loss: 248.9411 - mean_squared_error: 248.9411
Epoch 4/50
 - 0s - loss: 245.5798 - mean_squared_error: 245.5798
Epoch 5/50
 - 0s - loss: 239.8627 - mean_squared_error: 239.8627
Epoch 6/50
 - 0s - loss: 234.2816 - mean_squared_error: 234.2816
Epoch 7/50
 - 0s - loss: 229.2884 - mean_squared_error: 229.2884
Epoch 8/50
 - 0s - loss: 225.4288 - mean_squared_error: 225.4288
Epoch 9/50
 - 0s - loss: 220.7177 - mean_squared_error: 220.7177
Epoch 10/50
 - 0s - loss: 217.7282 - mean_squared_error: 217.7282
Epoch 11/50
 - 0s - loss: 215.0181 - mean_squared_error: 215.0181
Epoch 12/50
 - 0s - loss: 209.4711 - mean_squared_error: 209.4711
Epoch 13/50
 - 0s - loss: 206.3645 - mean_squared_error: 206.3645
Epoch 14/50
 - 0s - loss: 201.8934 - mean_squared_error: 201.8934
Epoch 15/50
 - 0s - loss: 199.2757 - mean_squared_error: 199.2757
Epoch 16/50
 - 0s -

In [37]:
#5. Report the mean and the standard deviation of the mean squared errors.
print("Without Normalization the Mean of the mean_squared_error as:", np.mean(listmse)) 
print("Without Normalization the Std of the mean_squared_error as:", np.std(listmse)) 

Without Normalization the Mean of the mean_squared_error as: 109.90043559623143
Without Normalization the Std of the mean_squared_error as: 20.24164115301615
