In [None]:
import pandas as pd
import numpy as np


import pydotplus
from sklearn import tree
from sklearn import metrics
from sklearn import preprocessing
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score


from IPython import display
from matplotlib import cm
from matplotlib import gridspec
%matplotlib inline
from matplotlib import pyplot as plt
import seaborn as sns

# Notebook-wide pandas display settings: print at most 10 rows of a frame
# and render floats with a single decimal place.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:.1f}'.format)

In [None]:
# Load the hourly records from hour.csv; the path is relative to the
# directory the notebook is run from.
df_hour = pd.read_csv("data/hour.csv")

In [None]:
# Swap the terse raw column names for descriptive ones. Columns that are not
# listed here (e.g. season, weekday, temp) keep their original names.
df_hour = df_hour.rename(columns={
    'instant': "record_id",
    'dteday': "datetime",
    'holiday': "is_holiday",
    'workingday': "is_workingday",
    'weathersit': "weather_condition",
    'hum': "humidity",
    'mnth': "month",
    'cnt': "total_count",
    'hr': "hour",
    'yr': "year",
})

In [None]:
# Parse the date strings into proper timestamps.
df_hour['datetime'] = pd.to_datetime(df_hour['datetime'])

# Store every discrete attribute as a pandas categorical.
for categorical_col in ['season', 'is_holiday', 'weekday', 'weather_condition',
                        'is_workingday', 'month', 'year', 'hour']:
    df_hour[categorical_col] = df_hour[categorical_col].astype('category')

In [None]:
# Prepare for training samples
# Encode Categoricals (One Hot Encoding)
def fit_transform_one_hot_encoding(df, col_name):
    '''
    Fit a label encoder and a one-hot encoder on a single column.

    The integer codes produced by the label encoder are also written back
    into ``df`` as a new ``<col_name>_label`` column.

    Args:
        df(pandas.DataFrame): frame holding the column; it gains the
            ``<col_name>_label`` column as a side effect
        col_name: name of the column to encode
    Returns:
        tuple: fitted label encoder, fitted one-hot encoder, and a
            pandas DataFrame with one ``<col_name>_<class>`` indicator
            column per class seen by the label encoder
    '''
    label_col = col_name + '_label'

    # Step 1: map the raw categories to integer codes.
    le = preprocessing.LabelEncoder()
    df[label_col] = le.fit_transform(df[col_name])

    # Step 2: expand the integer codes into dense indicator columns.
    ohe = preprocessing.OneHotEncoder()
    indicator_values = ohe.fit_transform(df[[label_col]]).toarray()

    indicator_names = [col_name + '_' + str(cls_label) for cls_label in le.classes_]
    return le, ohe, pd.DataFrame(indicator_values, columns = indicator_names)

In [None]:
# Prepare for the test samples
def transform_ohe(df,le,ohe,col_name):
    '''
    One-hot encode a column of unseen data with already-fitted encoders.

    The integer codes are also written back into ``df`` as a new
    ``<col_name>_label`` column, mirroring fit_transform_one_hot_encoding.

    Args:
        df(pandas.DataFrame): frame holding the column; it gains the
            ``<col_name>_label`` column as a side effect
        le: label encoder previously fitted on the training column
        ohe: one-hot encoder previously fitted on the training label codes;
            its ``transform`` result must expose ``toarray()``
        col_name: name of the column to encode
    Returns:
        pandas.DataFrame: one ``<col_name>_<class>`` indicator column per
            class known to ``le``
    '''
    label_col = col_name + '_label'

    # Use transform(), never fit_transform(): re-fitting here would overwrite
    # the category mapping learned on the training data and could yield a
    # different number/order of indicator columns for the test samples.
    df[label_col] = le.transform(df[col_name])
    feature_arr = ohe.transform(df[[label_col]]).toarray()

    feature_labels = [col_name + '_' + str(cls_label) for cls_label in le.classes_]
    features_df = pd.DataFrame(feature_arr, columns = feature_labels)

    return features_df

In [None]:
# Shuffle the rows so the head/tail split below is not ordered by record.
# A dedicated, seeded generator keeps the permutation (and therefore the
# train/validation split) identical on every fresh run of the notebook.
df_hour = df_hour.reindex(np.random.RandomState(42).permutation(df_hour.index))

In [None]:
# Display (rows, columns) of the shuffled frame.
df_hour.shape

In [None]:
# Scratch arithmetic for the split sizes: 66% of 17379 rows, then the
# remainder once 11470 rows are taken. Only the last expression is displayed.
17379*0.66
17379-11470

In [None]:
# Summary statistics of the frame.
df_hour.describe()

In [None]:
# Peek at the first rows after shuffling.
df_hour.head()

In [None]:
def preprocess_features(df_hour):
    """Return an independent copy of the input feature columns.

    Args:
        df_hour(pandas.DataFrame): frame that contains the renamed columns
    Returns:
        pandas.DataFrame: copy of the 14 feature columns, in the order
            listed below; the target column is not included
    """
    feature_columns = [
        'record_id', 'datetime',
        'season', 'year', 'month', 'hour',
        'is_holiday', 'weekday', 'is_workingday',
        'weather_condition',
        'temp', 'atemp', 'humidity', 'windspeed',
    ]
    # .copy() so later edits to the result never touch the caller's frame.
    return df_hour[feature_columns].copy()

def preprocess_targets(df_hour):
    """Return the regression target as a one-column frame.

    Args:
        df_hour(pandas.DataFrame): frame that contains ``total_count``
    Returns:
        pandas.DataFrame: single ``total_count`` column carrying the same
            index as the input
    """
    return pd.DataFrame({'total_count': df_hour['total_count']})



In [None]:
# Use the first 66% of the shuffled rows for training and every remaining row
# for validation. Deriving the cut point from the frame length (instead of
# hard-coding 11470 / 5909) guarantees the two sets never overlap or drop
# rows if the dataset size changes; for 17379 rows it gives 11470 / 5909.
n_training = int(len(df_hour) * 0.66)

training_rows = df_hour.iloc[:n_training]
validation_rows = df_hour.iloc[n_training:]

training_examples = preprocess_features(training_rows)
training_targets = preprocess_targets(training_rows)

validation_examples = preprocess_features(validation_rows)
validation_targets = preprocess_targets(validation_rows)

print("Training set::{}{}".format(training_examples.shape, training_targets.shape))
print("Validation set::{}".format(validation_examples.shape))

In [None]:
# Give every split a fresh 0..n-1 index so it lines up row-by-row with the
# one-hot frames (built with a default index) when they are concatenated
# later. The previous shuffled index is kept as an extra 'index' column.
training_examples = training_examples.reset_index()
training_targets = training_targets.reset_index()

validation_examples = validation_examples.reset_index()
validation_targets = validation_targets.reset_index()

print("Training set::{}{}".format(training_examples.shape, training_targets.shape))
print("Validation set::{}".format(validation_examples.shape))

In [None]:
# Columns passed to the model as-is (no one-hot expansion).
numeric_feature_cols = [
    'temp',
    'humidity',
    'windspeed',
    'hour',
    'weekday',
    'month',
    'year',
]

# Columns that are one-hot encoded before being passed to the model.
subset_cat_features = [
    'season',
    'is_holiday',
    'weather_condition',
    'is_workingday',
]

# Unused alternative that would also one-hot encode the time attributes:
# cat_attr_list = ['season', 'is_holiday',
#                  'weather_condition','is_workingday',
#                  'hour','weekday',
#                  'month','year']

In [None]:
def create_training_input_fn(df,col_name):
    """Fit a label encoder and a one-hot encoder on one training column.

    Unlike fit_transform_one_hot_encoding, this does not add any column
    to ``df``.

    Args:
        df(pandas.DataFrame): the data frame containing the column
        col_name: the column to be one hot encoded
    Returns:
        tuple: fitted label encoder, fitted one-hot encoder, and a pandas
            DataFrame with one ``<col_name>_<class>`` indicator column per
            class seen by the label encoder
    """
    # Categories -> integer codes, shaped as a single-column 2-D array
    # because that is what the one-hot encoder is fed.
    label_encoders = preprocessing.LabelEncoder()
    class_codes = label_encoders.fit_transform(df[col_name])
    class_codes = class_codes.reshape(-1, 1)

    # Integer codes -> dense indicator matrix.
    onehot_encoder = preprocessing.OneHotEncoder(sparse=False)
    indicator_values = onehot_encoder.fit_transform(class_codes)

    indicator_names = [col_name+'_'+str(cls_label) for cls_label in label_encoders.classes_]
    return label_encoders, onehot_encoder, pd.DataFrame(indicator_values, columns=indicator_names)

In [None]:
def create_predict_input_fn(df, col_name, label_encoders, onehot_encoder):
    """One-hot encode a column of unseen data with already-fitted encoders.

    Args:
        df(pandas.DataFrame): the data frame containing the column
        col_name: the column to be one hot encoded
        label_encoders: label encoder fitted on the training column
        onehot_encoder: one-hot encoder fitted on the training label codes
    Returns:
        pandas.DataFrame: one ``<col_name>_<class>`` indicator column per
            class known to ``label_encoders``
    """
    # find the encoded label, shaped as a single-column 2-D array
    col_labels = label_encoders.transform(df[col_name]).reshape(-1, 1)

    # Use transform(), never fit_transform(): re-fitting on prediction data
    # would overwrite the training-time encoding and can produce a different
    # number of columns when a category is missing from this frame.
    feature_arr = onehot_encoder.transform(col_labels)
    if hasattr(feature_arr, 'toarray'):
        # Encoder was configured for sparse output; densify for DataFrame.
        feature_arr = feature_arr.toarray()

    feature_labels = [col_name+'_'+str(cls_label) for cls_label in label_encoders.classes_]
    feature_df = pd.DataFrame(feature_arr, columns=feature_labels)

    return feature_df

In [None]:
# Fit one (label encoder, one-hot encoder) pair per categorical column and
# keep the fitted encoders so the validation data can reuse them later.
training_encoded_category_features = []

for col in subset_cat_features:
    fitted_label_enc, fitted_ohe_enc, encoded_df = create_training_input_fn(training_examples, col)
    training_encoded_category_features.append({
        'label_enc': fitted_label_enc,
        'ohe_enc': fitted_ohe_enc,
        'feature_df': encoded_df,
        'col_name': col,
    })

In [None]:
# Training design matrix: pass-through columns first, then the one-hot
# blocks in the order of subset_cat_features, joined side by side.
feature_df_list = (
    [training_examples[numeric_feature_cols]]
    + [enc['feature_df'] for enc in training_encoded_category_features]
)

training_input_fn = pd.concat(feature_df_list, axis=1)
# From here on training_targets is an (n, 1) NumPy array, not a DataFrame.
training_targets = training_targets['total_count'].values.reshape(-1, 1)

print("Shape::{}".format(training_input_fn.shape))
print("Shape::{}".format(training_targets.shape))

In [None]:
'''
Decision Tree based Regression
'''

In [None]:
# Baseline: a small, hand-configured tree fitted on the full training set.
# fit() returns the estimator itself, so dtr is the fitted model.
dtr = DecisionTreeRegressor(
    max_depth=4,
    min_samples_split=5,
    max_leaf_nodes=10,
).fit(training_input_fn, training_targets)

dtr

In [None]:
# Score of the baseline tree on the same data it was fitted on (in-sample,
# so an optimistic figure).
dtr.score(training_input_fn, training_targets)

In [None]:
# Render the baseline tree: export it as Graphviz DOT text (out_file=None
# returns the text instead of writing a file), parse it, and write a PDF
# into the working directory.
dot_data = tree.export_graphviz(dtr, out_file=None)
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_pdf("bikeshare.pdf")

In [None]:
## Fine-tuning the hyperparameters
# Grid search tries every combination of the candidate values listed below
# (random search would sample from them instead).
#
# NOTE(review): 'mse' / 'mae' are the legacy criterion names; newer
# scikit-learn releases call them 'squared_error' / 'absolute_error'.
# Confirm against the installed version.
param_grid = dict(
    criterion=['mse', 'mae'],
    min_samples_split=[10, 20, 40],
    max_depth=[2, 6, 8],
    min_samples_leaf=[20, 40, 100],
    max_leaf_nodes=[5, 20, 100, 500, 800],
)

# 5-fold cross-validation for every candidate; nothing is fitted yet.
grid_cv_dtr = GridSearchCV(estimator=dtr, param_grid=param_grid, cv=5)

In [None]:
# Very slow cell: the grid holds 2*3*3*3*5 = 270 parameter combinations and
# each one is cross-validated with 5 folds.
grid_cv_dtr.fit(training_input_fn, training_targets)

# Best mean cross-validated score and the parameters that achieved it.
print("R-Squared::{}".format(grid_cv_dtr.best_score_))
print("Best Hyperparameters::\n{}".format(grid_cv_dtr.best_params_))

In [None]:
# Visualization 

In [None]:
# Put the grid-search results into a frame for plotting. Note that `df` here
# holds search results, not the dataset.
df = pd.DataFrame(data=grid_cv_dtr.cv_results_)
df.head()

In [None]:
# Mean CV score against tree depth, one line per max_leaf_nodes setting.
fig, ax = plt.subplots()

score_columns = ['mean_test_score', 'param_max_leaf_nodes', 'param_max_depth']
sns.pointplot(
    x='param_max_depth',
    y='mean_test_score',
    hue='param_max_leaf_nodes',
    data=df[score_columns],
    ax=ax,
)

ax.set(title="Effect of Depth and Leaf Nodes on Model Performance")

In [None]:
''' Residual Plot '''

In [None]:
# In-sample predictions of the tuned tree and their errors
# (observed minus predicted); the targets are flattened from (n, 1) to (n,).
predicted = grid_cv_dtr.best_estimator_.predict(training_input_fn)
residuals = training_targets.ravel() - predicted

In [None]:
# Residuals against observed counts on the training set; the horizontal
# line at y=0 marks a perfect prediction.
fig, ax = plt.subplots()
ax.scatter(training_targets.flatten(), residuals)
ax.axhline(lw=2, color='black')
ax.set(xlabel='Observed', ylabel='Residual')
plt.show()

In [None]:
# 10-fold cross-validation of the tuned tree on the training data, once with
# the estimator's default score and once with (negated) mean squared error.
r2_scores = cross_val_score(grid_cv_dtr.best_estimator_,
                            training_input_fn,
                            training_targets,
                            cv=10)

mse_scores = cross_val_score(grid_cv_dtr.best_estimator_,
                             training_input_fn,
                             training_targets,
                             cv=10,
                             scoring='neg_mean_squared_error')

print("avg R-squared::{}".format(np.mean(r2_scores)))
# 'neg_mean_squared_error' yields negated errors (so that higher is better);
# flip the sign so the value printed as MSE is the actual, positive error.
print("MSE::{}".format(-np.mean(mse_scores)))

In [None]:
'''
Testing the model
'''

In [None]:
# Shorthand for the estimator refitted with the best grid-search parameters.
best_dtr_model = grid_cv_dtr.best_estimator_

In [None]:
#create_predict_input_fn(df, col_name, label_encoders, onehot_encoder):

In [None]:
# Encode each categorical column of the validation set with the encoder pair
# that was fitted on the matching training column.
validation_encoded_category_features = [
    {
        'feature_df': create_predict_input_fn(validation_examples,
                                              enc['col_name'],
                                              enc['label_enc'],
                                              enc['ohe_enc']),
        'col_name': enc['col_name'],
    }
    for enc in training_encoded_category_features
]
    

In [None]:
# Validation design matrix, assembled exactly like the training one:
# pass-through columns first, then the one-hot blocks, joined side by side.
# (A dangling `predict_validation_input_fn = ` with no right-hand side used
# to sit at the top of this cell and made it a SyntaxError.)
validation_feature_df_list = [validation_examples[numeric_feature_cols]]
validation_feature_df_list.extend([enc['feature_df'] for enc in validation_encoded_category_features])

predict_validation_input_fn = pd.concat(validation_feature_df_list, axis=1)
# From here on validation_targets is an (n, 1) NumPy array, not a DataFrame.
validation_targets = validation_targets.total_count.values.reshape(-1,1)

print("Shape::{}".format(predict_validation_input_fn.shape))


In [None]:
# Held-out predictions of the tuned tree and their errors
# (observed minus predicted); the targets are flattened from (n, 1) to (n,).
predictions = best_dtr_model.predict(predict_validation_input_fn)
residuals = validation_targets.ravel() - predictions

In [None]:
# Held-out performance of the tuned tree: the estimator's own score plus the
# mean squared error between observed and predicted counts.
r2_score = best_dtr_model.score(predict_validation_input_fn, validation_targets)
print("R-squared::{}".format(r2_score))
print("MSE: %.2f"
      % metrics.mean_squared_error(validation_targets, predictions))

In [None]:
# Residuals against observed counts on the validation set; the horizontal
# line at y=0 marks a perfect prediction.
fig, ax = plt.subplots()
ax.scatter(validation_targets.flatten(), residuals)
ax.axhline(lw=2, color='black')
ax.set(xlabel='Observed', ylabel='Residual')
plt.show()

# Recompute the validation score straight from the grid-search object.
r2_score = grid_cv_dtr.best_estimator_.score(predict_validation_input_fn, validation_targets)