# Building a Regression Model in Keras

## A- Building a Baseline Model

Loading csv file and loading it into pandas DataFrame

In [1]:
import pandas as pd, numpy as np #importing pandas & numpy

concrete_data = pd.read_csv('https://cocl.us/concrete_data') #loading CSV file from web into pandas dataframe
concrete_data.head() # display the first 5 rows

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [2]:
print ("Data shape =", concrete_data.shape) # checking dataframe size (rows, columns)
concrete_data.describe() #columns statstics 

Data shape = (1030, 9)


Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [3]:
concrete_data.isnull().sum() #checking for missing values

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

Defining predictors and target:
* X: predictors are 7 columns as defined in instruction
* y: target is strength

In [4]:
y = concrete_data[['Strength']]
X = concrete_data.drop(columns =['Age','Strength']) #dropping age as it's not required as predictor and Strength as its target variable

In [5]:
X.head() #checking predictors dataframe

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5


In [6]:
y.head() #checking target column

Unnamed: 0,Strength
0,79.99
1,61.89
2,40.27
3,41.05
4,44.3


### A - 1: Splitting data into train and test sets

In [7]:
# splitting data into train and test sets (30% testing as per instructions)
from sklearn.model_selection import train_test_split #importing train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [8]:
print('X_train shape = {}, X_test shape = {}'.format(X_train.shape, X_test.shape))

X_train shape = (721, 7), X_test shape = (309, 7)


### A - 2: Model Training

In [9]:
# importing keras packages
import keras
from keras.models import Sequential
from keras.layers import Dense

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [10]:
# define regression model function with one hidden layer with 10 nodes, 7 input predictors, one output layer
def regression_model():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(7,)))
    model.add(Dense(1))
    
    # compile model with adam optimizer and MSE
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [11]:
# build the model
model = regression_model()

In [12]:
# fit the model with 50 epochs
model.fit(X_train, y_train, epochs=50, verbose = 2)

Epoch 1/50
 - 1s - loss: 26929.2705
Epoch 2/50
 - 0s - loss: 3505.7875
Epoch 3/50
 - 0s - loss: 2009.3052
Epoch 4/50
 - 0s - loss: 1701.8130
Epoch 5/50
 - 0s - loss: 1474.6250
Epoch 6/50
 - 0s - loss: 1285.3518
Epoch 7/50
 - 0s - loss: 1122.2309
Epoch 8/50
 - 0s - loss: 987.5764
Epoch 9/50
 - 0s - loss: 854.5217
Epoch 10/50
 - 0s - loss: 754.1663
Epoch 11/50
 - 0s - loss: 662.1493
Epoch 12/50
 - 0s - loss: 589.3714
Epoch 13/50
 - 0s - loss: 532.3595
Epoch 14/50
 - 0s - loss: 485.6549
Epoch 15/50
 - 0s - loss: 446.1509
Epoch 16/50
 - 0s - loss: 419.5221
Epoch 17/50
 - 0s - loss: 395.4603
Epoch 18/50
 - 0s - loss: 374.2475
Epoch 19/50
 - 0s - loss: 358.8069
Epoch 20/50
 - 0s - loss: 344.6829
Epoch 21/50
 - 0s - loss: 332.3613
Epoch 22/50
 - 0s - loss: 321.8178
Epoch 23/50
 - 0s - loss: 310.1268
Epoch 24/50
 - 0s - loss: 299.7942
Epoch 25/50
 - 0s - loss: 291.8848
Epoch 26/50
 - 0s - loss: 283.8694
Epoch 27/50
 - 0s - loss: 275.9855
Epoch 28/50
 - 0s - loss: 269.5969
Epoch 29/50
 - 0s - l

<keras.callbacks.History at 0x7f792efb5eb8>

In [13]:
# make prediction 
y_pred = model.predict(X_test)

### A - 3: Model Evaluation

In [14]:
# import mean squared error package from sklearn.metrics to cal MSE of y_pred
from sklearn.metrics import mean_squared_error
mean_squared_error (y_test, y_pred)

203.4455916156649

### A - 4: Repeating steps 1 - 3, 50 times


In [15]:
i = 1
MSE_list =[]
while i <= 50 :
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
    model = regression_model()
    model.fit(X_train, y_train, epochs=50, verbose = 0)
    y_pred = model.predict(X_test)
    MSE_list.append(mean_squared_error (y_test, y_pred))
    i+=1
    
MSE_list

[219.1861589772096,
 1002.8441271579193,
 310.42896668291695,
 172.97453398869422,
 148.81791396694936,
 1022.2528025667943,
 404.0626177699483,
 330.0848026626192,
 338.2454319865049,
 163.30798992681045,
 285.9958351859131,
 179.07911754842232,
 1409.872655574378,
 197.82463706044507,
 176.37046286327882,
 361.8119632236148,
 182.84566944207887,
 201.5371764772064,
 370.38476340340185,
 238.22804637453075,
 250.69218310365633,
 462.8910534780573,
 270.4477370816618,
 570.0486620337869,
 303.6035052732818,
 242.99929776959974,
 169.8237108608789,
 172.93524924328602,
 350.13010185190706,
 162.73571146143203,
 250.37340736789074,
 170.58077717588222,
 149.5072771994815,
 215.45968152546345,
 404.30863669307786,
 170.49945780019422,
 344.3044215735098,
 185.60365312310637,
 269.9248658348761,
 224.5368026428155,
 909.9269598063624,
 173.22337171513942,
 304.50987442558346,
 200.81476864866818,
 189.6651755897721,
 272.6502317422451,
 157.3413477867237,
 335.176789682742,
 359.1896688849

### A - 5: Mean & Standard deviation of 50 MSE values 

In [16]:
# importing statistics module to calculate mean and std for MSE_list
import statistics as st

MSE_mean = st.mean (MSE_list)
MSE_std = st.stdev(MSE_list)
print("MSE mean = {}, MSE std = {}".format(MSE_mean, MSE_std))

MSE mean = 323.2060109006396, MSE std = 250.684034463659


## B - Normalize the data

In [17]:
# Normalizing data by subtracting the mean and dividing by std
X_norm = (X - X.mean()) / X.std()
X_norm.head() #display first 5 rows

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate
0,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,0.862735,-1.217079
1,2.476712,-0.856472,-0.846733,-0.916319,-0.620147,1.055651,-1.217079
2,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829
3,0.491187,0.79514,-0.846733,2.174405,-1.038638,-0.526262,-2.239829
4,-0.790075,0.678079,-0.846733,0.488555,-1.038638,0.070492,0.647569


In [18]:
# Repeating part A on normalized data:
i = 1
MSE_list =[]
while i <= 50 :
    #using X_norm instead of X to split the data
    X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.30)
    model = regression_model()
    model.fit(X_train, y_train, epochs=50, verbose = 0)
    y_pred = model.predict(X_test)
    MSE_list.append(mean_squared_error (y_test, y_pred))
    i+=1
    
MSE_norm_mean = st.mean (MSE_list)
MSE_norm_std = st.stdev(MSE_list)

In [19]:
print("MSE norm mean = {}, MSE norm std = {}".format(MSE_norm_mean, MSE_norm_std))

MSE norm mean = 382.5396318882239, MSE norm std = 111.65800759510483


## C - Increase number of epochs to 100

In [20]:
# Repeating part A on normalized data - epochs 100:
i = 1
MSE_list =[]
while i <= 50 :
    #using X_norm instead of X to split the data
    X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.30)
    model = regression_model()
    # number of epochs changed to 100
    model.fit(X_train, y_train, epochs=50, verbose = 0)
    y_pred = model.predict(X_test)
    MSE_list.append(mean_squared_error (y_test, y_pred))
    i+=1
    
MSE_norm_100ep_mean = st.mean (MSE_list)
MSE_norm_100ep_std = st.stdev(MSE_list)
print("MSE mean for 100 ep = {}, MSE std for 100 ep = {}".format(MSE_norm_100ep_mean, MSE_norm_100ep_std))

MSE mean for 100 ep = 395.68950995727806, MSE std for 100 ep = 104.15814094012195


## D - Increase Number of hidden layers

In [21]:
# new regression_model with 3 hidden layers, 10 nodes and Relu
def regression_model2():
    # create model
    model = Sequential()
    model.add(Dense(10, activation='relu', input_shape=(7,)))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(10, activation='relu'))
    model.add(Dense(1))
    
    # compile model with adam optimizer and MSE
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

In [22]:
# Repeating the exercise with normalized data and new settings

i = 1
MSE_list =[]
while i <= 50 :
    #using X_norm instead of X to split the data
    X_train, X_test, y_train, y_test = train_test_split(X_norm, y, test_size=0.30)
    # defining reg model no. 2 with new settings
    model = regression_model2()
    model.fit(X_train, y_train, epochs=50, verbose = 0)
    y_pred = model.predict(X_test)
    MSE_list.append(mean_squared_error (y_test, y_pred))
    i+=1
    
MSE_norm_new_mean = st.mean (MSE_list)
MSE_norm_new_std = st.stdev(MSE_list)
print("MSE new mean = {}, MSE new std = {}".format(MSE_norm_new_mean, MSE_norm_new_std))

MSE new mean = 172.90770938739558, MSE new std = 10.568057860161652
