In [262]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score,mean_absolute_error
from sklearn.preprocessing import StandardScaler
import cpi
import seaborn as sb
cpi.update()

In [196]:
# load data
data = pd.read_excel('./final_cleaned_dataset.xlsx')
data = pd.get_dummies(data, drop_first=True)

In [197]:
# Use CPI index to adjust cost of engineering services for inflation
data['Year of the project'] = pd.to_datetime(data['Year of the project'], format='%Y') 
data["Cost of engineering services"] = data.apply(
    lambda x: cpi.inflate(x["Cost of engineering services"], x["Year of the project"]), axis=1
)
data['Year of the project'] = data['Year of the project'].dt.year.astype('int32')
data = data.drop(['Year of the project'], axis=1)

In [198]:
# split data in test and train
X = data.drop('Cost of engineering services', axis=1) # Features
y = data['Cost of engineering services'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
sc = StandardScaler()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Apply tranformations
X_train = sc.fit_transform(X_train)
y_train = sc.transform(y_train.to_frame())
X_test = sc.transform(X_test)
y_test = sc.transform(y_test.to_frame())

In [199]:
# function to create model
def make_model():
    
    model = Sequential()
    model.add(Dense(units=4, input_dim=45, kernel_initializer='normal', activation='relu'))
#     model.add(Dense(units=5, activation='relu'))
#     model.add(Dense(units=5, activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_absolute_error', optimizer="adam")
    return model

In [200]:
# specify hyperparameters to test
params={
    'batch_size':[10, 20, 30, 40 , 100, 200, 300, 400],
    'epochs':[10,20, 30 ,40,100],
}

# specify the model and start grid search
reg_model = KerasRegressor(make_model, verbose=0)
gs = GridSearchCV(reg_model, params, refit=False, cv=4, scoring='r2')

gs.fit(X_train, y_train)
print(gs.best_score_, gs.best_params_)

0.44440252087287624 {'batch_size': 10, 'epochs': 100}


In [259]:
# evaluation function 
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    test_labels = sc.inverse_transform(test_labels.reshape(-1, 1))
    predictions = sc.inverse_transform(predictions.reshape(-1, 1))

    r2 = r2_score(test_labels, predictions)
    errors = abs(predictions - test_labels)
    print('Model Performance')
    print(f'RMSE: {mean_squared_error(test_labels, predictions, squared = False)}.')
    print(f'MAPE: {mean_absolute_percentage_error(test_labels, predictions)}%.')
    print(f'MAE: {mean_absolute_error(test_labels, predictions)}.')
    print(f'R2 = {r2}. \n')




In [320]:
# craete model using best parameters
model2 = Sequential()
model2.add(Dense(units=4, input_dim=45, kernel_initializer='normal', activation='relu'))
# model2.add(Dense(units=4, activation='relu'))
# model2.add(Dense(units=2, activation='relu'))
model2.add(Dense(1, kernel_initializer='normal'))
model2.compile(loss='mean_squared_error', optimizer="adam")
model2.fit(X_train, y_train ,batch_size =  gs.best_params_["batch_size"], epochs = gs.best_params_["epochs"], verbose=0)



<keras.callbacks.History at 0x23282fda8e0>

In [321]:
evaluate(model2,X_test, y_test)

Model Performance
RMSE: 47264.76963455298.
MAPE: 2.025539825840192%.
MAE: 42908.75289391101.
R2 = 0.6359666904824611. 

