In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from math import sqrt

In [2]:
pathToFile = r"./dataset/Training/"

training_files = ['Features_Variant_1.csv', 'Features_Variant_2.csv', 'Features_Variant_3.csv', 
                 'Features_Variant_4.csv', 'Features_Variant_5.csv']

In [3]:
for f in training_files:
    train = pd.concat([pd.read_csv(pathToFile + f)])
X_train_all = train.iloc[:,:-1]
y_train_all = train.iloc[:,-1:]

In [12]:
X_train_list = []
y_train_list = []

for i in range(5):
    X_train_list.append(pd.read_csv(pathToFile + training_files[i]).iloc[:,:-1])
    y_train_list.append(pd.read_csv(pathToFile + training_files[i]).iloc[:,-1:])

X_train_list.append(X_train_all)
y_train_list.append(y_train_all)

In [5]:
pathToTestFile = r"./dataset/Testing/TestSet/"

test_files = ['Test_Case_1.csv', 'Test_Case_2.csv', 'Test_Case_3.csv',
              'Test_Case_4.csv', 'Test_Case_5.csv', 'Test_Case_5.csv', 
              'Test_Case_6.csv', 'Test_Case_7.csv', 'Test_Case_8.csv', 
              'Test_Case_9.csv', 'Test_Case_10.csv']
for f in test_files:
    test = pd.concat([pd.read_csv(pathToTestFile + f)])
X_test = test.iloc[:,:-1]
y_test = test.iloc[:,-1:]

linear regression

In [15]:
from sklearn.linear_model import LinearRegression

lin_reg = LinearRegression()
for i in range(6):
    lin_reg.fit(X_train_list[i], y_train_list[i])
    
    rms = sqrt(mean_squared_error(y_test, lin_reg.predict(X_test)))
    if i == 5:
        print("Variant overall: ", rms)
    else:
        print("Variant {0}: ".format(i + 1), rms)

Variant 1:  76.03258920010072
Variant 2:  69.47677611571014
Variant 3:  65.81787923340246
Variant 4:  74.23472530706195
Variant 5:  88.85631226300191
Variant overall:  88.85631226300191


polynomial regression

In [22]:
from sklearn.preprocessing import PolynomialFeatures

poly_features = PolynomialFeatures(degree=2, include_bias=False)
lin_reg = LinearRegression()
for i in range(6):
    X_poly = poly_features.fit_transform(X_train_list[i])
    lin_reg.fit(X_poly, y_train_list[i])
    
    X_test_poly = poly_features.fit_transform(X_test)
    y_predicted = lin_reg.predict(X_test_poly)
    rms = sqrt(mean_squared_error(y_test, y_predicted))

    if i == 5:
        print("Variant overall: ", rms, lin_reg.intercept_, lin_reg.coef_)
    else:
        print("Variant {0}: ".format(i + 1), rms, lin_reg.intercept_, lin_reg.coef_)

Variant 1:  1019.9999792170482 [2.51932097] [[ 0.00554747 -0.00042355  0.00011052 ... -0.01033716  0.
  -0.00064123]]
Variant 2:  3126.8217006054656 [5.93792753] [[ 1.16814297e-03 -1.35146254e-04 -1.05747739e-05 ... -4.35661656e-03
   0.00000000e+00 -2.48107828e-03]]
Variant 3:  877.4251190142712 [5.38943998] [[ 0.01731977  0.00093048  0.00013384 ... -0.00655857  0.
   0.001872  ]]
Variant 4:  39273.488678663976 [2.16230066] [[-8.87853403e-03  6.40481775e-05  5.63909046e-05 ... -3.20918574e-03
   0.00000000e+00  2.53108536e-04]]
Variant 5:  7944.253730932758 [2.5572448] [[ 1.89499120e-03 -9.95960310e-06  1.54776992e-05 ... -4.06137204e-03
   0.00000000e+00 -9.39496261e-04]]
Variant overall:  7944.253730932758 [2.5572448] [[ 1.89499120e-03 -9.95960310e-06  1.54776992e-05 ... -4.06137204e-03
   0.00000000e+00 -9.39496261e-04]]


decision tree

In [33]:
from sklearn.tree import DecisionTreeRegressor

tree_reg = DecisionTreeRegressor(max_depth=10, random_state=42)

for i in range(6):
    tree_reg.fit(X_train_list[i], y_train_list[i])
    y_predicted = tree_reg.predict(X_test)
    rms = sqrt(mean_squared_error(y_test, y_predicted))

    if i == 5:
        print("Variant overall: ", rms, tree_reg.feature_importances_)
    else:
        print("Variant {0}: ".format(i + 1), rms, tree_reg.feature_importances_)

Variant 1:  65.34108120408506 [6.47378817e-03 1.09477460e-02 9.76065209e-03 1.31146117e-02
 1.73179874e-04 7.44311136e-03 1.33578148e-02 2.29561690e-03
 7.25733531e-03 0.00000000e+00 2.49863604e-03 3.83347089e-03
 1.28079071e-02 1.66719528e-02 0.00000000e+00 1.06102487e-02
 6.68750427e-03 6.77534683e-03 1.05976975e-03 2.45893315e-05
 2.94823622e-03 1.48797045e-03 7.14180424e-04 3.43301438e-02
 7.87364000e-03 1.13015295e-02 1.18648677e-03 1.79379321e-03
 7.96837185e-04 2.39257620e-02 2.91800463e-01 9.23203684e-04
 5.11230559e-03 4.76466327e-02 2.42337306e-01 1.48811080e-02
 1.42951544e-01 0.00000000e+00 0.00000000e+00 2.24979318e-04
 7.84774717e-04 0.00000000e+00 1.02906405e-02 7.49089532e-03
 1.23549358e-03 1.22236948e-03 2.71088861e-03 2.92736644e-05
 5.92990029e-03 1.25144021e-03 4.90428645e-03 4.96921286e-05
 7.09415097e-05]
Variant 2:  113.95279504785593 [2.57185723e-03 7.65331501e-03 5.65938911e-02 6.82071784e-03
 2.51202476e-03 1.22843320e-03 7.89736986e-02 2.15536885e-03
 4.6388

random forest

In [24]:
from sklearn.ensemble import RandomForestRegressor

rnd_clf = RandomForestRegressor(max_depth=8, random_state=0,
                                  n_estimators=100)

for i in range(6):
    rnd_clf.fit(X_train_list[i], y_train_list[i])
    y_pred_rf = rnd_clf.predict(X_test)
    rms = sqrt(mean_squared_error(y_test, y_pred_rf))
    
    if i == 5:
        print("Variant overall: ", rms, rnd_clf.feature_importances_)
    else:
        print("Variant {0}: ".format(i + 1), rms, rnd_clf.feature_importances_)
    

  import sys


Variant 1:  63.35019782228598 [1.04120609e-02 8.88918776e-03 8.75750945e-03 1.03695757e-02
 1.94965915e-03 7.90631750e-03 4.45927551e-03 1.00763012e-02
 7.34656703e-03 5.75121888e-04 6.48337785e-03 1.09773492e-02
 1.32092242e-02 1.54969976e-02 4.31420290e-07 1.26533657e-02
 9.27700542e-03 8.22446180e-03 8.14333170e-03 1.75500565e-03
 4.47292742e-03 6.28970886e-03 7.41260794e-03 1.06846444e-02
 1.12420334e-02 5.76763571e-03 1.56459326e-02 6.65242039e-03
 4.35883968e-03 2.26810686e-02 2.50950148e-01 5.63337418e-04
 2.60663916e-02 6.84965116e-02 2.27642547e-01 1.76485229e-02
 1.28522853e-01 0.00000000e+00 3.00870368e-04 1.97517517e-03
 1.20997943e-03 1.68555788e-03 3.51505760e-03 1.51341246e-03
 2.50912596e-03 1.20775783e-03 1.72410852e-03 1.68306227e-03
 1.21418369e-03 5.03816607e-03 1.78176739e-03 1.30679755e-03
 1.27472000e-03]


  import sys


Variant 2:  64.77159501851575 [1.32767562e-02 7.31102465e-03 3.07401277e-02 8.16170720e-03
 5.81650179e-04 7.52721584e-03 1.97764246e-02 1.48015014e-02
 9.09419629e-03 3.38644802e-04 6.10817140e-03 1.35996969e-02
 1.53312774e-02 8.75658652e-03 1.04157473e-07 5.81446683e-03
 1.78038260e-02 1.20090193e-02 8.58422581e-03 3.10136873e-04
 3.42483515e-03 2.68760050e-02 1.73995555e-02 6.91811291e-03
 1.64484267e-02 7.38599362e-03 1.36151932e-02 1.31017431e-02
 6.79778547e-03 1.59195847e-02 2.39897921e-01 1.28692009e-03
 1.78737002e-02 4.93733608e-02 2.12896180e-01 2.11049917e-02
 1.01792734e-01 0.00000000e+00 4.40632671e-04 1.18775096e-03
 2.37223609e-03 2.99976220e-03 2.01576467e-03 2.65113746e-03
 1.09121312e-03 1.35559784e-03 1.19793591e-03 3.01576852e-03
 2.64877785e-03 2.28967530e-03 2.17006234e-03 1.05914003e-03
 1.46474224e-03]


  import sys


Variant 3:  56.450809932794854 [1.35924410e-02 7.76944867e-03 1.14981988e-02 7.07646982e-03
 4.48613513e-04 9.22506167e-03 2.15233192e-02 1.24457230e-02
 7.15628180e-03 6.94273723e-04 3.77258071e-03 1.34367346e-02
 9.96650046e-03 1.11537876e-02 1.94986847e-10 5.23983872e-03
 3.19145003e-02 9.32421466e-03 6.18947844e-03 5.88011682e-04
 5.52830718e-03 2.83755583e-02 1.37352758e-02 5.84676851e-03
 5.41339939e-03 4.22466445e-03 1.29394495e-02 8.01339243e-03
 1.10686704e-02 2.24898993e-02 2.72688256e-01 6.19701331e-04
 2.32924475e-02 4.50886117e-02 2.05033414e-01 2.17171016e-02
 9.83408005e-02 0.00000000e+00 3.59897977e-04 1.90439285e-03
 2.75631733e-03 2.66269143e-03 1.25990961e-03 1.55169067e-03
 3.37822810e-03 1.14756065e-03 1.67062339e-03 1.99840645e-03
 6.42239558e-03 1.49291761e-03 1.55363457e-03 3.22981510e-03
 1.18032199e-03]


  import sys


Variant 4:  75.37738951095992 [1.57246638e-02 7.58738236e-03 1.36607351e-02 9.90262163e-03
 8.61726020e-04 1.45401146e-02 6.77216936e-03 1.55468392e-02
 1.11748356e-02 8.16694091e-04 4.45127931e-03 1.49320663e-02
 1.07438858e-02 1.40908581e-02 4.91931514e-10 6.41978694e-03
 1.36632035e-02 8.46461530e-03 2.20569334e-02 7.22555570e-04
 6.65534636e-03 1.28481875e-02 1.29553652e-02 7.19052942e-03
 9.33672530e-03 4.97329899e-03 1.57551088e-02 9.14799305e-03
 1.11108567e-02 1.64549843e-02 2.84547876e-01 5.39082015e-04
 2.04740608e-02 2.29441560e-02 2.01279943e-01 3.02615171e-02
 1.11102453e-01 0.00000000e+00 4.29224013e-04 1.22313791e-03
 2.10666295e-03 3.67241392e-03 2.40301377e-03 1.67840989e-03
 2.00711979e-03 1.83566697e-03 1.45020276e-03 1.85391515e-03
 2.48898019e-03 3.80238911e-03 1.77350585e-03 1.74444053e-03
 1.82046701e-03]


  import sys


Variant 5:  63.82691182361886 [1.31549804e-02 9.39310153e-03 1.03746489e-02 7.74811097e-03
 3.01992697e-04 1.32340989e-02 2.28671868e-02 1.16326714e-02
 8.62394830e-03 5.05704054e-04 3.38396656e-03 2.65665126e-02
 1.20102296e-02 9.24428142e-03 1.98467211e-07 3.39653658e-03
 2.28248521e-02 7.56950839e-03 9.74325217e-03 3.35265341e-04
 5.25398958e-03 2.03977254e-02 1.03222648e-02 4.32081903e-03
 7.50873272e-03 4.31490083e-03 1.06413339e-02 5.70324227e-03
 8.79408103e-03 2.50004021e-02 2.86376738e-01 6.39404564e-04
 2.38951028e-02 3.78325068e-02 2.15406734e-01 2.45615152e-02
 8.46863422e-02 0.00000000e+00 3.44772531e-04 1.79361530e-03
 2.65188002e-03 3.58843322e-03 2.37576722e-03 1.28896950e-03
 1.50946491e-03 1.86978773e-03 1.90326323e-03 1.93482920e-03
 1.89754395e-03 3.58005154e-03 2.63179340e-03 1.72025890e-03
 2.34268671e-03]


  import sys


Variant overall:  63.82691182361886 [1.31549804e-02 9.39310153e-03 1.03746489e-02 7.74811097e-03
 3.01992697e-04 1.32340989e-02 2.28671868e-02 1.16326714e-02
 8.62394830e-03 5.05704054e-04 3.38396656e-03 2.65665126e-02
 1.20102296e-02 9.24428142e-03 1.98467211e-07 3.39653658e-03
 2.28248521e-02 7.56950839e-03 9.74325217e-03 3.35265341e-04
 5.25398958e-03 2.03977254e-02 1.03222648e-02 4.32081903e-03
 7.50873272e-03 4.31490083e-03 1.06413339e-02 5.70324227e-03
 8.79408103e-03 2.50004021e-02 2.86376738e-01 6.39404564e-04
 2.38951028e-02 3.78325068e-02 2.15406734e-01 2.45615152e-02
 8.46863422e-02 0.00000000e+00 3.44772531e-04 1.79361530e-03
 2.65188002e-03 3.58843322e-03 2.37576722e-03 1.28896950e-03
 1.50946491e-03 1.86978773e-03 1.90326323e-03 1.93482920e-03
 1.89754395e-03 3.58005154e-03 2.63179340e-03 1.72025890e-03
 2.34268671e-03]


gradient boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
gbrt = GradientBoostingRegressor(max_depth=6, n_estimators=3, learning_rate=1.0, random_state=42)

for i in range(6):
    gbrt.fit(X_train, y_train)
    y_predicted = gbrt.predict(X_test)
    rms = sqrt(mean_squared_error(y_test, y_predicted))
    
    if i == 5:
        print("Variant overall: ", rms)
    else:
        print("Variant {0}: ".format(i + 1), rms)

neural net

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler 

In [None]:
scaler = StandardScaler()
scaler.fit(X_train)
classifier = MLPRegressor(alpha = 1e-4, hidden_layer_sizes = (150,5,), 
random_state = 12, max_iter = 500, activation = 'relu',
verbose = True, early_stopping = True, learning_rate_init = 0.001)
mlp = classifier.fit(scaler.transform(X_train_all), y_train_all)

In [None]:
predict_labels = mlp.predict(scaler.transform(X_test))
rms = sqrt(mean_squared_error(y_test, predict_labels))
rms