## Decision Forest Regression

Based on the AzureML Experiment:  Decision Forest Regression


In [69]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, accuracy_score
from sklearn.linear_model import Ridge
from scipy.stats import spearmanr, pearsonr

You are provided hourly rental data spanning two years. For this competition, the training set is comprised of the first 19 days of each month, while the test set is the 20th to the end of the month. You must predict the total count of bikes rented during each hour covered by the test set, using only information available prior to the rental period.

Data Fields

datetime - hourly date + timestamp  

season -  1 = spring, 2 = summer, 3 = fall, 4 = winter 

holiday - whether the day is considered a holiday

workingday - whether the day is neither a weekend nor holiday

weather - 1: Clear, Few clouds, Partly cloudy, Partly cloudy 

2: Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist 

3: Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds 

4: Heavy Rain + Ice Pallets + Thunderstorm + Mist, Snow + Fog 

temp - temperature in Celsius

atemp - "feels like" temperature in Celsius

humidity - relative humidity

windspeed - wind speed

casual - number of non-registered user rentals initiated

registered - number of registered user rentals initiated

count - number of total rentals

[Link to DataSet](https://www.kaggle.com/c/bike-sharing-demand/data)

In [70]:
df = pd.read_csv('./data/azureml/Bike_Rental_UCI_dataset.csv')
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1


In [71]:
df.shape

(17379, 17)

In [72]:
def day_of_week():  
    ## First day in the dataset is Saturday
    days = pd.DataFrame([[0, 1, 2, 3, 4, 5, 6],
      ["Sun", "Mon", "Tue", "Wed", "Thr", "Fri", "Sat"]]).transpose()
    days.columns = ['weekday', 'dayOfWeek']                        
    return days  

In [73]:
days_df = day_of_week()
days_df.head()

Unnamed: 0,weekday,dayOfWeek
0,0,Sun
1,1,Mon
2,2,Tue
3,3,Wed
4,4,Thr


In [74]:
df = pd.merge(df, days_df, on='weekday', how='outer')
df.head()

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,dayOfWeek
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16,Sat
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40,Sat
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32,Sat
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0,3,10,13,Sat
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0,0,1,1,Sat


In [75]:
df.shape

(17379, 18)

In [76]:
# determine the number of integer days since the first record.
def set_days(df):  
    import pandas as pd
    df['days'] = pd.Series(range(df.shape[0]))/24
    df['days'] = df['days'].astype('int')
    return df  

In [77]:
set_days(df)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt,dayOfWeek,days
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0000,3,13,16,Sat,0
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.80,0.0000,8,32,40,Sat,0
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.80,0.0000,5,27,32,Sat,0
3,4,2011-01-01,1,0,1,3,0,6,0,1,0.24,0.2879,0.75,0.0000,3,10,13,Sat,0
4,5,2011-01-01,1,0,1,4,0,6,0,1,0.24,0.2879,0.75,0.0000,0,1,1,Sat,0
5,6,2011-01-01,1,0,1,5,0,6,0,2,0.24,0.2576,0.75,0.0896,0,1,1,Sat,0
6,7,2011-01-01,1,0,1,6,0,6,0,1,0.22,0.2727,0.80,0.0000,2,0,2,Sat,0
7,8,2011-01-01,1,0,1,7,0,6,0,1,0.20,0.2576,0.86,0.0000,1,2,3,Sat,0
8,9,2011-01-01,1,0,1,8,0,6,0,1,0.24,0.2879,0.75,0.0000,1,7,8,Sat,0
9,10,2011-01-01,1,0,1,9,0,6,0,1,0.32,0.3485,0.76,0.0000,8,6,14,Sat,0


In [78]:
# remove some of the columns that do not appear to be useful
df.drop(columns=['instant', 'dteday', 'atemp', 'casual', 'registered', 'weekday'], inplace=True)
df.head()

Unnamed: 0,season,yr,mnth,hr,holiday,workingday,weathersit,temp,hum,windspeed,cnt,dayOfWeek,days
0,1,0,1,0,0,0,1,0.24,0.81,0.0,16,Sat,0
1,1,0,1,1,0,0,1,0.22,0.8,0.0,40,Sat,0
2,1,0,1,2,0,0,1,0.22,0.8,0.0,32,Sat,0
3,1,0,1,3,0,0,1,0.24,0.75,0.0,13,Sat,0
4,1,0,1,4,0,0,1,0.24,0.75,0.0,1,Sat,0


In [79]:
df.describe()

Unnamed: 0,season,yr,mnth,hr,holiday,workingday,weathersit,temp,hum,windspeed,cnt,days
count,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0,17379.0
mean,2.50164,0.502561,6.537775,11.546752,0.02877,0.682721,1.425283,0.496987,0.627229,0.190098,189.463088,361.562576
std,1.106918,0.500008,3.438776,6.914405,0.167165,0.465431,0.639357,0.192556,0.19293,0.12234,181.387599,209.042828
min,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.02,0.0,0.0,1.0,0.0
25%,2.0,0.0,4.0,6.0,0.0,0.0,1.0,0.34,0.48,0.1045,40.0,181.0
50%,3.0,1.0,7.0,12.0,0.0,1.0,1.0,0.5,0.63,0.194,142.0,362.0
75%,3.0,1.0,10.0,18.0,0.0,1.0,2.0,0.66,0.78,0.2537,281.0,543.0
max,4.0,1.0,12.0,23.0,1.0,1.0,4.0,1.0,1.0,0.8507,977.0,724.0


In [80]:
# normalize 
scaler = StandardScaler()
df['temp_scaled'] = scaler.fit_transform(df['temp'].values.reshape(-1,1))
df['hum_scaled'] = scaler.fit_transform(df['hum'].values.reshape(-1,1))
df['windspeed_scaled'] = scaler.fit_transform(df['windspeed'].values.reshape(-1,1))

df.head()

Unnamed: 0,season,yr,mnth,hr,holiday,workingday,weathersit,temp,hum,windspeed,cnt,dayOfWeek,days,temp_scaled,hum_scaled,windspeed_scaled
0,1,0,1,0,0,0,1,0.24,0.81,0.0,16,Sat,0,-1.334648,0.947372,-1.553889
1,1,0,1,1,0,0,1,0.22,0.8,0.0,40,Sat,0,-1.438516,0.895539,-1.553889
2,1,0,1,2,0,0,1,0.22,0.8,0.0,32,Sat,0,-1.438516,0.895539,-1.553889
3,1,0,1,3,0,0,1,0.24,0.75,0.0,13,Sat,0,-1.334648,0.63637,-1.553889
4,1,0,1,4,0,0,1,0.24,0.75,0.0,1,Sat,0,-1.334648,0.63637,-1.553889


In [81]:
X = df.drop(columns=['cnt','dayOfWeek'])
y = df['cnt']
print(X.shape)
print(y.shape)

(17379, 14)
(17379,)


In [82]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, test_size=0.3, random_state=123)

# Random Forest Regressor

In [83]:
model = RandomForestRegressor(n_estimators=100, oob_score=True, random_state=0, max_depth=32, min_samples_leaf=1)

In [84]:
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=32,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
           oob_score=True, random_state=0, verbose=0, warm_start=False)

In [85]:

predicted_train = model.predict(X_train)
y_predicted_test = model.predict(X_test)

test_score = r2_score(y_test, y_predicted_test)
spearman = spearmanr(y_test, y_predicted_test)
pearson = pearsonr(y_test, y_predicted_test)

print(f'Out-of-bag R-2 score estimate: {model.oob_score_:>5.3}')
print(f'Test data R-2 score: {test_score:>5.3}')
print(f'Test data Spearman correlation: {spearman[0]:.3}')
print(f'Test data Pearson correlation: {pearson[0]:.3}')

Out-of-bag R-2 score estimate:  0.94
Test data R-2 score: 0.945
Test data Spearman correlation: 0.974
Test data Pearson correlation: 0.972


In [86]:
model.score(X_test, y_test)

0.9448512912389737

In [87]:
print(X.columns)
print(model.feature_importances_)

Index(['season', 'yr', 'mnth', 'hr', 'holiday', 'workingday', 'weathersit',
       'temp', 'hum', 'windspeed', 'days', 'temp_scaled', 'hum_scaled',
       'windspeed_scaled'],
      dtype='object')
[0.02114194 0.08041103 0.01420437 0.60580258 0.00182017 0.03644419
 0.01537652 0.07116447 0.01410447 0.00533042 0.0506552  0.06381953
 0.01437006 0.00535507]


# Ridge Linear Regression Model

Compare the Decision Forest Regressor with the Ridge Linear Regression Model

In [104]:
lr_model = Ridge(alpha=0.001, random_state=123)

In [105]:
lr_model.fit(X_train, y_train)

Ridge(alpha=0.001, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=123, solver='auto', tol=0.001)

In [106]:
lr_predicted_train = lr_model.predict(X_train)
lr_y_predicted_test = lr_model.predict(X_test)

lr_test_score = r2_score(y_test, lr_y_predicted_test)
lr_spearman = spearmanr(y_test, lr_y_predicted_test)
lr_pearson = pearsonr(y_test, lr_y_predicted_test)

print(f'Test data R-2 score: {lr_test_score:>5.3}')
print(f'Test data Spearman correlation: {lr_spearman[0]:.3}')
print(f'Test data Pearson correlation: {lr_pearson[0]:.3}')

Test data R-2 score: 0.388
Test data Spearman correlation: 0.685
Test data Pearson correlation: 0.624


In [107]:
lr_model.score(X_test, y_test)

0.38822576602206327