In [64]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from keras.models import Sequential
from keras.layers import Dense
from statistics import mean

In [65]:
#load the dataset
concrete_data = pd.read_csv('concrete_data.csv')
concrete_data.head()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
0,540.0,0.0,0.0,162.0,2.5,1040.0,676.0,28,79.99
1,540.0,0.0,0.0,162.0,2.5,1055.0,676.0,28,61.89
2,332.5,142.5,0.0,228.0,0.0,932.0,594.0,270,40.27
3,332.5,142.5,0.0,228.0,0.0,932.0,594.0,365,41.05
4,198.6,132.4,0.0,192.0,0.0,978.4,825.5,360,44.3


In [66]:
#checking basic stats of each column
concrete_data.describe()

Unnamed: 0,Cement,Blast Furnace Slag,Fly Ash,Water,Superplasticizer,Coarse Aggregate,Fine Aggregate,Age,Strength
count,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0,1030.0
mean,281.167864,73.895825,54.18835,181.567282,6.20466,972.918932,773.580485,45.662136,35.817961
std,104.506364,86.279342,63.997004,21.354219,5.973841,77.753954,80.17598,63.169912,16.705742
min,102.0,0.0,0.0,121.8,0.0,801.0,594.0,1.0,2.33
25%,192.375,0.0,0.0,164.9,0.0,932.0,730.95,7.0,23.71
50%,272.9,22.0,0.0,185.0,6.4,968.0,779.5,28.0,34.445
75%,350.0,142.95,118.3,192.0,10.2,1029.4,824.0,56.0,46.135
max,540.0,359.4,200.1,247.0,32.2,1145.0,992.6,365.0,82.6


In [67]:
#checking for Null values across data
concrete_data.isnull().sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [68]:
#checking for negative values across data
concrete_data.lt(0).sum()

Cement                0
Blast Furnace Slag    0
Fly Ash               0
Water                 0
Superplasticizer      0
Coarse Aggregate      0
Fine Aggregate        0
Age                   0
Strength              0
dtype: int64

In [69]:
#In the instructions, only 7 predictors to be used, hence 'Age' to be dropped
concrete_data = concrete_data.drop(['Age'],axis=1)

# PART A

In [73]:
#lets initiate the list where all the mean_squared_errors will be appended to
mse_partA = []

#The loop code is per assignmed (steps 1-3), which is really computationally long due to splitting/training the data 50 times

for i in range (50):

    #Lets split data using train_test_split function from sklearn with 30% saved for testing
    concrete_data_columns = concrete_data.columns
    X = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] #predictors
    y = concrete_data['Strength'] #Target
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

    #number of inputs to be used for modelling
    n_cols = X.shape[1]

    #define regression model
    def regression_model():
        # create model
        model = Sequential()
        model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
        model.add(Dense(1))

        # compile model
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model

    model = regression_model()

    #fit the model on the train data
    model.fit(X_train, y_train, epochs=50, verbose=0)

    #find the predicted Strength using test data
    predicted_y = model.predict(X_test,verbose=0)

    #estimate MSE between test and predicted Strength values
    mse = mean_squared_error(y_test,predicted_y)
    mse_partA.append(mse)
    
    i+1

In [71]:
mean_partA = mean(mse_partA)
std_partA = np.std(mse_partA)
print ("Mean of the MSE list = ", mean_partA,".Standard deviation of the MSE list = ", std_partA)

Mean of the MSE list =  449.387111239407 .Standard deviation of the MSE list =  189.46958364863752


# PART B

In [75]:
#Similarly to Part A, but in this case let's normalize the input dataset

mse_partB = []

for i in range (50):

    #Lets split data using train_test_split function from sklearn with 30% saved for testing
    concrete_data_columns = concrete_data.columns
    X = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] #predictors
    X = (X - X.mean())/X.std()
    y = concrete_data['Strength'] #Target
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

    #number of inputs to be used for modelling
    n_cols = X.shape[1]

    #define regression model
    def regression_model():
        # create model
        model = Sequential()
        model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
        model.add(Dense(1))

        # compile model
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model

    model = regression_model()

    #fit the model on the train data
    model.fit(X_train, y_train, epochs=50, verbose=0)

    #find the predicted Strength using test data
    predicted_y = model.predict(X_test,verbose=0)

    #estimate MSE between test and predicted Strength values
    mse = mean_squared_error(y_test,predicted_y)
    mse_partB.append(mse)
    
    i+1

In [76]:
mean_partB = mean(mse_partB)
std_partB = np.std(mse_partB)
print ("Mean of the MSE list = ", mean_partB,".Standard deviation of the MSE list = ", std_partB)

Mean of the MSE list =  357.1215232464367 .Standard deviation of the MSE list =  83.0665694588685


In [86]:
PartB = pd.DataFrame({"PartA":[mean_partA, std_partA],
                 "PartB":[mean_partB,std_partB]})
PartB.index = ['Mean of errors',"Standard deviation"]
PartB.head()

Unnamed: 0,PartA,PartB
Mean of errors,449.387111,357.121523
Standard deviation,189.469584,83.066569


# PART C

In [77]:
#For Part C the epochs are increased

mse_partC = []

for i in range (50):

    #Lets split data using train_test_split function from sklearn with 30% saved for testing
    concrete_data_columns = concrete_data.columns
    X = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] #predictors
    X = (X - X.mean())/X.std()
    y = concrete_data['Strength'] #Target
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

    #number of inputs to be used for modelling
    n_cols = X.shape[1]

    #define regression model
    def regression_model():
        # create model
        model = Sequential()
        model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
        model.add(Dense(1))

        # compile model
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model

    model = regression_model()

    #fit the model on the train data
    model.fit(X_train, y_train, epochs=100, verbose=0)

    #find the predicted Strength using test data
    predicted_y = model.predict(X_test,verbose=0)

    #estimate MSE between test and predicted Strength values
    mse = mean_squared_error(y_test,predicted_y)
    mse_partC.append(mse)
    
    i+1

In [78]:
mean_partC = mean(mse_partC)
std_partC = np.std(mse_partC)
print ("Mean of the MSE list = ", mean_partC,".Standard deviation of the MSE list = ", std_partC)

Mean of the MSE list =  197.37828597991225 .Standard deviation of the MSE list =  23.148584925072356


In [87]:
PartC = pd.DataFrame({"PartC":[mean_partC, std_partC],
                 "PartB":[mean_partB,std_partB]})
PartC.index = ['Mean of errors',"Standard deviation"]
PartC.head()

Unnamed: 0,PartC,PartB
Mean of errors,197.378286,357.121523
Standard deviation,23.148585,83.066569


# PART D

In [80]:
#Increasing number of hidden layers with ReLu function and 50epochs
mse_partD = []

for i in range (50):

    #Lets split data using train_test_split function from sklearn with 30% saved for testing
    concrete_data_columns = concrete_data.columns
    X = concrete_data[concrete_data_columns[concrete_data_columns != 'Strength']] #predictors
    X = (X - X.mean())/X.std()
    y = concrete_data['Strength'] #Target
    X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3)

    #number of inputs to be used for modelling
    n_cols = X.shape[1]

    #define regression model
    def regression_model():
        # create model
        model = Sequential()
        model.add(Dense(10, activation='relu', input_shape=(n_cols,)))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(10, activation='relu'))
        model.add(Dense(1))

        # compile model
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model

    model = regression_model()

    #fit the model on the train data
    model.fit(X_train, y_train, epochs=50, verbose=0)

    #find the predicted Strength using test data
    predicted_y = model.predict(X_test,verbose=0)

    #estimate MSE between test and predicted Strength values
    mse = mean_squared_error(y_test,predicted_y)
    mse_partD.append(mse)
    
    i+1

In [81]:
mean_partD = mean(mse_partD)
std_partD = np.std(mse_partD)
print ("Mean of the MSE list = ", mean_partD,".Standard deviation of the MSE list = ", std_partD)

Mean of the MSE list =  170.62194667318835 .Standard deviation of the MSE list =  9.673544935995686


In [88]:
PartD = pd.DataFrame({"PartC":[mean_partD, std_partD],
                 "PartB":[mean_partB,std_partB]})
PartD.index = ['Mean of errors',"Standard deviation"]
PartD.head()

Unnamed: 0,PartC,PartB
Mean of errors,170.621947,357.121523
Standard deviation,9.673545,83.066569


In [None]:
#The neural network with the 3 hidden layers and 50 epochs (methodC) provided the best predictions of the cement Strength.