In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import max_error
from sklearn.metrics import r2_score
import numpy as np
from timeit import default_timer as timer

In [2]:
df = pd.read_csv("data/bike.csv",sep=',')
df = df.sample(frac=1).reset_index(drop=True)
# Convert datetime to day
df["dteday"] = [ pd.to_datetime(x).day for x in df.dteday.values]
# Rename datetime column
newColumnNames = df.columns.values
newColumnNames[1] = "day"
df.columns = newColumnNames
# Extract ground truth
gt = df.cnt.values
# Remove target and useless columns
df = df.drop(['instant','casual','registered','cnt'],axis=1)
df

Unnamed: 0,day,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,7,4,1,10,21,0,0,0,1,0.38,0.3939,0.87,0.1045
1,10,4,1,12,18,0,1,1,2,0.46,0.4545,0.88,0.2836
2,13,2,0,6,23,0,1,1,1,0.62,0.6212,0.46,0.1940
3,19,4,0,10,21,0,3,1,2,0.58,0.5455,0.94,0.1343
4,20,2,0,5,20,0,5,1,1,0.58,0.5455,0.64,0.1045
5,10,3,1,9,2,0,1,1,1,0.54,0.5152,0.60,0.3582
6,23,3,1,8,12,0,4,1,1,0.78,0.6818,0.38,0.0000
7,5,3,0,7,15,0,2,1,1,0.82,0.7424,0.43,0.0000
8,21,2,0,5,8,0,6,0,1,0.56,0.5303,0.68,0.2836
9,17,3,1,8,3,0,5,1,1,0.64,0.2424,0.65,0.1045


In [4]:
train = []
trainGt = []
test = []
testGt = []
values = df.values
trainCount = len(df) * 0.85
for x in range(len(df)):
    if x < trainCount:
        train.append(values[x])
        trainGt.append(gt[x])
    else:
        test.append(values[x])
        testGt.append(gt[x])
assert len(train) == len(trainGt)
assert len(test) == len(testGt)
print(f"Train size: {len(train)} Test size: {len(test)}")

Train size: 14773 Test size: 2606


In [5]:
def test_regressor(name,regressor):
    start = timer()
    regressor.fit(train, trainGt)
    fitTime = timer() - start
    predictedValues = regressor.predict(test)
    r2Score = regressor.score(test, testGt)
    mse = mean_squared_error(testGt, predictedValues)
    maxE = max_error(testGt, predictedValues)
    return { "name": name, "r2": r2Score, "mse": mse, "max_error": maxE, "fit_time": fitTime }

In [6]:
results = []

In [7]:
# Ridge classifier
rg0 = Ridge(alpha=0.5, normalize=False)
results.append(test_regressor("Ridge_a05.0_nF",rg0))
rg1 = Ridge(alpha=1.0, normalize=False)
results.append(test_regressor("Ridge_a1.0_nF",rg1))
rg2 = Ridge(alpha=1.0, normalize=True)
results.append(test_regressor("Ridge_a1.0_nT",rg2))
results

[{'name': 'Ridge_a05.0_nF',
  'r2': 0.3717463494130223,
  'mse': 20357.735090005604,
  'max_error': 642.3800159556346,
  'fit_time': 0.06428259999999852},
 {'name': 'Ridge_a1.0_nF',
  'r2': 0.3717656730166874,
  'mse': 20357.1089339872,
  'max_error': 642.4301415621103,
  'fit_time': 0.013584600000001501},
 {'name': 'Ridge_a1.0_nT',
  'r2': 0.3222899417045074,
  'mse': 21960.305080156217,
  'max_error': 678.6306447403126,
  'fit_time': 0.013396200000002523}]

In [8]:
# DecisionTreeRegressor
dtr_mse = DecisionTreeRegressor(criterion="mse")
results.append(test_regressor("DTR_mse",dtr_mse))

dtr_fried = DecisionTreeRegressor(criterion="friedman_mse")
results.append(test_regressor("DTR_fried",dtr_fried))

dtr_mae = DecisionTreeRegressor(criterion="mae")
results.append(test_regressor("DTR_mae",dtr_mae))

In [9]:
# SGD regression
# https://scikit-learn.org/stable/modules/sgd.html#regression
sgdr_def = SGDRegressor(penalty='l2',max_iter=10000)
sgdr_elastic = SGDRegressor(penalty='elasticnet',max_iter=10000)
results.append(test_regressor("SGD_l2_10000it",sgdr_def))
results.append(test_regressor("SGD_elasticnet_10000it",sgdr_elastic))



In [10]:
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor
mlpResults = []
mlpr_tanh = MLPRegressor(hidden_layer_sizes=(5,4,3),activation='tanh')
mlpResults.append(test_regressor("mlpr_tanh_543", mlpr_tanh))
mlpr_relu = MLPRegressor(hidden_layer_sizes=(5,4,3),activation='relu')
mlpResults.append(test_regressor("mlpr_relu_543", mlpr_relu))
mlpr_relu2 = MLPRegressor(hidden_layer_sizes=(5,4,3,2),activation='relu')
mlpResults.append(test_regressor("mlpr_relu_5432", mlpr_relu2))
mlpr_relu3 = MLPRegressor(hidden_layer_sizes=(5,4,3,2),activation='relu',max_iter=500)
mlpResults.append(test_regressor("mlpr_relu_5432_500it", mlpr_relu3))

mlpDf=pd.DataFrame(mlpResults)
mlpDf



Unnamed: 0,fit_time,max_error,mse,name,r2
0,6.518127,916.153125,48811.877282,mlpr_tanh_543,-0.506368
1,5.646779,583.207163,14111.179256,mlpr_relu_543,0.564519
2,6.732425,666.637026,19716.83293,mlpr_relu_5432,0.391525
3,20.05524,480.720554,4784.287469,mlpr_relu_5432_500it,0.852354


In [12]:
results += mlpResults
results

[{'name': 'Ridge_a05.0_nF',
  'r2': 0.3717463494130223,
  'mse': 20357.735090005604,
  'max_error': 642.3800159556346,
  'fit_time': 0.06428259999999852},
 {'name': 'Ridge_a1.0_nF',
  'r2': 0.3717656730166874,
  'mse': 20357.1089339872,
  'max_error': 642.4301415621103,
  'fit_time': 0.013584600000001501},
 {'name': 'Ridge_a1.0_nT',
  'r2': 0.3222899417045074,
  'mse': 21960.305080156217,
  'max_error': 678.6306447403126,
  'fit_time': 0.013396200000002523},
 {'name': 'DTR_mse',
  'r2': 0.8843570078790328,
  'mse': 3747.2594013814273,
  'max_error': 635.0,
  'fit_time': 0.1044399000000027},
 {'name': 'DTR_fried',
  'r2': 0.881896786015572,
  'mse': 3826.9796623177285,
  'max_error': 635.0,
  'fit_time': 0.08392070000000729},
 {'name': 'DTR_mae',
  'r2': 0.8734016304828349,
  'mse': 4102.254029163469,
  'max_error': 650.0,
  'fit_time': 5.2503265},
 {'name': 'SGD_l2_10000it',
  'r2': -0.1623438550762335,
  'mse': 37664.22727990481,
  'max_error': 3028.690625360925,
  'fit_time': 2.43088

In [13]:
# https://scikit-learn.org/stable/modules/svm.html#regression
svmResults = []
svm0 = SVR(kernel='rbf', max_iter=10000,C=10000)
svmResults.append(test_regressor("SVM_rbf_10000it_10000C", svm0))

svm4 = SVR(kernel='rbf', max_iter=20000,C=1000)
svmResults.append(test_regressor("SVM_rbf_20000it_1000C", svm4))

svm1 = SVR(kernel='rbf', max_iter=20000,C=10000)
svmResults.append(test_regressor("SVM_rbf_20000it_10000C", svm1))

svm2 = SVR(kernel='rbf', max_iter=20000,C=100000)
svmResults.append(test_regressor("SVM_rbf_20000it_100000C", svm2))



In [14]:
results += svmResults

In [15]:
# results = results[:len(results)-4]
resultsDf = pd.DataFrame(results) # results
resultsDf = resultsDf.set_index('name')
resultsDf.to_csv("regression.csv")
resultsDf.sort_values(by=['r2'])
resultsDf

Unnamed: 0_level_0,fit_time,max_error,mse,r2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Ridge_a05.0_nF,0.064283,642.380016,20357.73509,0.371746
Ridge_a1.0_nF,0.013585,642.430142,20357.108934,0.371766
Ridge_a1.0_nT,0.013396,678.630645,21960.30508,0.32229
DTR_mse,0.10444,635.0,3747.259401,0.884357
DTR_fried,0.083921,635.0,3826.979662,0.881897
DTR_mae,5.250326,650.0,4102.254029,0.873402
SGD_l2_10000it,2.430882,3028.690625,37664.22728,-0.162344
SGD_elasticnet_10000it,21.828808,13496.345027,520216.65912,-15.054242
mlpr_tanh_543,6.518127,916.153125,48811.877282,-0.506368
mlpr_relu_543,5.646779,583.207163,14111.179256,0.564519


In [17]:
resultsDf = resultsDf.sort_values(by=['r2'],ascending=False)
resultsDf

Unnamed: 0_level_0,fit_time,max_error,mse,r2
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
SVM_rbf_20000it_1000C,12.899671,526.175279,3017.75024,0.90687
SVM_rbf_20000it_10000C,10.229437,416.435406,3468.033221,0.892974
DTR_mse,0.10444,635.0,3747.259401,0.884357
DTR_fried,0.083921,635.0,3826.979662,0.881897
SVM_rbf_10000it_10000C,5.060657,404.969501,4073.644573,0.874285
DTR_mae,5.250326,650.0,4102.254029,0.873402
mlpr_relu_5432_500it,20.05524,480.720554,4784.287469,0.852354
SVM_rbf_20000it_100000C,8.616555,498.534794,4887.91132,0.849156
mlpr_relu_543,5.646779,583.207163,14111.179256,0.564519
mlpr_relu_5432,6.732425,666.637026,19716.83293,0.391525


In [49]:
plotDf = pd.read_csv('regression.csv')
plotDf['Index'] = np.arange(len(plotDf))
plotDf['rn'] = np.arange(len(plotDf))
plotDf = plotDf.set_index('Index')
plotDf

Unnamed: 0_level_0,name,fit_time,max_error,mse,r2,rn
Index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Ridge_a05.0_nF,0.015589,606.381548,20075.877384,0.375098,0
1,Ridge_a1.0_nF,0.023004,606.510351,20074.728442,0.375134,1
2,Ridge_a1.0_nT,0.015561,626.558345,21590.93121,0.327939,2
3,DTR_mse,0.104971,584.0,3344.090177,0.895908,3
4,DTR_fried,0.087226,584.0,3340.648503,0.896016,4
5,DTR_mae,5.771212,584.0,3592.623177,0.888172,5
6,SGD_l2_10000it,2.861409,2633.617537,38910.1296,-0.211156,6
7,SGD_elasticnet_10000it,0.896644,2513.197457,38300.380278,-0.192176,7
8,mlpr_tanh_543,5.765518,892.500202,47974.946405,-0.493316,8
9,mlpr_relu_543,5.321379,487.075342,11809.788314,0.632397,9
