In [33]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import max_error
from sklearn.metrics import r2_score
import numpy as np
from timeit import default_timer as timer

In [3]:
df = pd.read_csv("data/bike.csv",sep=',')
df = df.sample(frac=1).reset_index(drop=True)
# Convert datetime to day
df["dteday"] = [ pd.to_datetime(x).day for x in df.dteday.values]
# Rename datetime column
newColumnNames = df.columns.values
newColumnNames[1] = "day"
df.columns = newColumnNames
# Extract ground truth
gt = df.cnt.values
# Remove target columns and useless
df = df.drop(['instant','casual','registered','cnt'],axis=1)
df

Unnamed: 0,day,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed
0,14,2,0,6,3,0,2,1,1,0.58,0.5455,0.53,0.3284
1,7,3,1,7,18,0,6,0,1,0.94,0.8333,0.29,0.0896
2,1,1,0,1,14,0,6,0,2,0.46,0.4545,0.72,0.2836
3,28,4,0,9,15,0,3,1,1,0.70,0.6515,0.70,0.1343
4,11,3,1,9,4,0,2,1,1,0.48,0.4697,0.72,0.1045
5,1,3,0,8,13,0,1,1,1,0.90,0.8030,0.31,0.2985
6,9,1,1,3,5,0,5,1,3,0.40,0.4091,0.66,0.2836
7,14,2,1,4,18,0,6,0,2,0.62,0.6212,0.35,0.2239
8,21,2,0,5,7,0,6,0,1,0.54,0.5152,0.73,0.1343
9,25,3,0,8,5,0,4,1,1,0.66,0.6061,0.83,0.4179


In [4]:
train = []
trainGt = []
test = []
testGt = []
values = df.values
trainCount = len(df) * 0.85
for x in range(len(df)):
    if x < trainCount:
        train.append(values[x])
        trainGt.append(gt[x])
    else:
        test.append(values[x])
        testGt.append(gt[x])
assert len(train) == len(trainGt)
assert len(test) == len(testGt)
print(f"Train size: {len(train)} Test size: {len(test)}")

Train size: 14773 Test size: 2606


In [12]:
def test_regressor(name,regressor):
    start = timer()
    regressor.fit(train, trainGt)
    fitTime = timer() - start
    predictedValues = regressor.predict(test)
    r2Score = regressor.score(test, testGt)
    mse = mean_squared_error(testGt, predictedValues)
    maxE = max_error(testGt, predictedValues)
    return { "name": name, "r2": r2Score, "mse": mse, "max_error": maxE, "fit_time": fitTime }

In [14]:
results = []

In [15]:
# Ridge classifier
rg0 = Ridge(alpha=0.5, normalize=False)
results.append(test_regressor("Ridge_a05.0_nF",rg0))
rg1 = Ridge(alpha=1.0, normalize=False)
results.append(test_regressor("Ridge_a1.0_nF",rg1))
rg2 = Ridge(alpha=1.0, normalize=True)
results.append(test_regressor("Ridge_a1.0_nT",rg2))
results

[{'name': 'Ridge_a05.0_nF',
  'r2': 0.37509803328112856,
  'mse': 20075.877384058058,
  'max_error': 606.3815475503839,
  'fit_time': 0.015588799999932235},
 {'name': 'Ridge_a1.0_nF',
  'r2': 0.37513379641589395,
  'mse': 20074.72844174925,
  'max_error': 606.5103505401369,
  'fit_time': 0.023003700000117533},
 {'name': 'Ridge_a1.0_nT',
  'r2': 0.32793894292882375,
  'mse': 21590.931209904807,
  'max_error': 626.5583446459151,
  'fit_time': 0.015561100000013539}]

In [16]:
# DecisionTreeRegressor
dtr_mse = DecisionTreeRegressor(criterion="mse")
results.append(test_regressor("DTR_mse",dtr_mse))

dtr_fried = DecisionTreeRegressor(criterion="friedman_mse")
results.append(test_regressor("DTR_fried",dtr_fried))

dtr_mae = DecisionTreeRegressor(criterion="mae")
results.append(test_regressor("DTR_mae",dtr_mae))

In [17]:
# SGD regression
# https://scikit-learn.org/stable/modules/sgd.html#regression
sgdr_def = SGDRegressor(penalty='l2',max_iter=10000)
sgdr_elastic = SGDRegressor(penalty='elasticnet',max_iter=10000)
results.append(test_regressor("SGD_l2_10000it",sgdr_def))
results.append(test_regressor("SGD_elasticnet_10000it",sgdr_elastic))

In [19]:
# https://scikit-learn.org/stable/modules/generated/sklearn.neural_network.MLPRegressor.html#sklearn.neural_network.MLPRegressor
mlpResults = []
mlpr_tanh = MLPRegressor(hidden_layer_sizes=(5,4,3),activation='tanh')
mlpResults.append(test_regressor("mlpr_tanh_543", mlpr_tanh))
mlpr_relu = MLPRegressor(hidden_layer_sizes=(5,4,3),activation='relu')
mlpResults.append(test_regressor("mlpr_relu_543", mlpr_relu))
mlpr_relu2 = MLPRegressor(hidden_layer_sizes=(5,4,3,2),activation='relu')
mlpResults.append(test_regressor("mlpr_relu_5432", mlpr_relu2))
mlpr_relu3 = MLPRegressor(hidden_layer_sizes=(5,4,3,2),activation='relu',max_iter=500)
mlpResults.append(test_regressor("mlpr_relu_5432_500it", mlpr_relu3))

mlpDf=pd.DataFrame(mlpResults)
mlpDf



Unnamed: 0,fit_time,max_error,mse,name,r2
0,5.765518,892.500202,47974.946405,mlpr_tanh_543,-0.493316
1,5.321379,487.075342,11809.788314,mlpr_relu_543,0.632397
2,8.591269,939.296157,61947.171786,mlpr_relu_5432,-0.92823
3,19.660778,471.694724,6659.950859,mlpr_relu_5432_500it,0.792696


In [21]:
# results += mlpResults
results

[{'name': 'Ridge_a05.0_nF',
  'r2': 0.37509803328112856,
  'mse': 20075.877384058058,
  'max_error': 606.3815475503839,
  'fit_time': 0.015588799999932235},
 {'name': 'Ridge_a1.0_nF',
  'r2': 0.37513379641589395,
  'mse': 20074.72844174925,
  'max_error': 606.5103505401369,
  'fit_time': 0.023003700000117533},
 {'name': 'Ridge_a1.0_nT',
  'r2': 0.32793894292882375,
  'mse': 21590.931209904807,
  'max_error': 626.5583446459151,
  'fit_time': 0.015561100000013539},
 {'name': 'DTR_mse',
  'r2': 0.8959084831904104,
  'mse': 3344.0901765157328,
  'max_error': 584.0,
  'fit_time': 0.10497069999996711},
 {'name': 'DTR_fried',
  'r2': 0.8960156121703402,
  'mse': 3340.6485034535685,
  'max_error': 584.0,
  'fit_time': 0.08722580000016933},
 {'name': 'DTR_mae',
  'r2': 0.8881723948490133,
  'mse': 3592.6231772831925,
  'max_error': 584.0,
  'fit_time': 5.771211900000026},
 {'name': 'SGD_l2_10000it',
  'r2': -0.21115585870078935,
  'mse': 38910.12959989505,
  'max_error': 2633.617536673071,
  'f

In [28]:
# https://scikit-learn.org/stable/modules/svm.html#regression
svmResults = []
svm0 = SVR(kernel='rbf', max_iter=10000,C=10000)
svmResults.append(test_regressor("SVM_rbf_10000it_10000C", svm0))

svm4 = SVR(kernel='rbf', max_iter=20000,C=1000)
svmResults.append(test_regressor("SVM_rbf_20000it_1000C", svm4))

svm1 = SVR(kernel='rbf', max_iter=20000,C=10000)
svmResults.append(test_regressor("SVM_rbf_20000it_10000C", svm1))

svm2 = SVR(kernel='rbf', max_iter=20000,C=100000)
svmResults.append(test_regressor("SVM_rbf_20000it_100000C", svm2))



In [30]:
results += svmResults

In [32]:
# results = results[:len(results)-4]
resultsDf = pd.DataFrame(results) # results
resultsDf = resultsDf.set_index('name')
resultsDf.to_csv("regression.csv")