# Final Project

In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from datetime import datetime


#SK-learn libraries for setup
from sklearn.model_selection import train_test_split

# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# SK-learn libraries for evaluation.
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.metrics import classification_report


In [2]:
#Read data
#NOTE: absolute path to a local checkout of the project; it has to be adjusted on any other machine
data_dir = "/Users/nwchen24/Desktop/UC_Berkeley/machine_learning/final_project_github_repo/w207_kaggleproject/data/"

#Make the data directory the working directory so the CSV can be opened by bare file name
os.chdir(data_dir)

#Load the labelled training set and preview its first rows
train_data_full = pd.read_csv('train.csv')
train_data_full.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count
0,2011-01-01 00:00:00,1,0,0,1,9.84,14.395,81,0.0,3,13,16
1,2011-01-01 01:00:00,1,0,0,1,9.02,13.635,80,0.0,8,32,40
2,2011-01-01 02:00:00,1,0,0,1,9.02,13.635,80,0.0,5,27,32
3,2011-01-01 03:00:00,1,0,0,1,9.84,14.395,75,0.0,3,10,13
4,2011-01-01 04:00:00,1,0,0,1,9.84,14.395,75,0.0,0,1,1


## Prep

In [3]:
#Create a program age variable (days elapsed since the program started)
#Do this before the train / dev split: the beginning of the program is a constant, so applying
#the same transformation to the entire dataset cannot 'contaminate' either split

#Format shared by the raw timestamp column and the program start string
DATETIME_FORMAT = '%Y-%m-%d %H:%M:%S'

#Convert the timestamp strings to datetime values.
#pd.to_datetime is vectorized and passes an already-converted column through unchanged,
#so this cell can safely be re-run (a row-wise strptime fails on values that are already datetimes)
train_data_full['datetime'] = pd.to_datetime(train_data_full['datetime'], format=DATETIME_FORMAT)

#Initialize program beginning (same timestamp as the first row shown in the preview of train.csv)
prog_beginning = '2011-01-01 00:00:00'

prog_beginning_conv = datetime.strptime(prog_beginning, DATETIME_FORMAT)

#Program age: time since the program beginning, converted from a timedelta to a float number of days
seconds_per_day = 24 * 60 * 60
train_data_full['prog_age'] = (train_data_full['datetime'] - prog_beginning_conv).dt.total_seconds() / seconds_per_day



In [4]:
#Split the labelled data into a training set and a development (hold-out) set.
#The proportions are train_test_split's defaults; the fixed random_state makes the split repeatable.
train_data, dev_data = train_test_split(
    train_data_full,
    random_state=0,
)

In [5]:
#Add a string descriptor for weather
#Lookup table: numeric weather code -> readable label
weather_values = [1, 2, 3, 4]
weather_descriptions = ['Nice Weather', 'OK Weather', 'Poor Weather', 'Bad Weather']
weather_dict_df = pd.DataFrame({
    'weather': weather_values,
    'weather_descriptions': weather_descriptions,
})

#Left join so every training row is kept; the join key is the column(s) the two frames share
train_data = pd.merge(left=train_data, right=weather_dict_df, how='left')

#Add a string descriptor for season
#Lookup table: numeric season code -> readable label
season_values = [1, 2, 3, 4]
season_str = ['Spring', 'Summer', 'Fall', 'Winter']
season_dict_df = pd.DataFrame({
    'season': season_values,
    'season_str': season_str,
})

train_data = pd.merge(left=train_data, right=season_dict_df, how='left')

#Preview the training data with both descriptor columns attached
train_data.head()

Unnamed: 0,datetime,season,holiday,workingday,weather,temp,atemp,humidity,windspeed,casual,registered,count,prog_age,weather_descriptions,season_str
0,2012-12-09 17:00:00,4,0,0,3,14.76,17.425,93,8.9981,20,209,229,708.708333,Poor Weather,Winter
1,2011-06-19 00:00:00,2,0,0,1,28.7,32.575,65,0.0,18,71,89,169.0,Nice Weather,Summer
2,2012-05-10 19:00:00,2,0,1,1,22.14,25.76,37,23.9994,84,469,553,495.791667,Nice Weather,Summer
3,2011-12-06 08:00:00,4,0,1,2,18.86,22.725,94,12.998,13,401,414,339.333333,OK Weather,Winter
4,2011-04-17 08:00:00,2,0,0,1,15.58,19.695,46,26.0027,7,36,43,106.333333,Nice Weather,Summer


In [6]:
#Helper function to calculate the root mean squared *logarithmic* error (RMSLE)
def get_RMSE(actual_values, predicted_values):
    """Return the root mean squared error between log(1 + predicted) and log(1 + actual).

    Despite the name, the error is measured on the log scale, i.e. this is the
    RMSLE: sqrt(mean((log(predicted + 1) - log(actual + 1)) ** 2)).

    Parameters
    ----------
    actual_values : array-like of numbers
        Observed values (list, numpy array or pandas Series).
    predicted_values : array-like of numbers
        Predicted values, in the same order and with the same shape as
        ``actual_values``.

    Returns
    -------
    numpy.float64
        The RMSLE. The result is NaN if any value is below -1, because the
        logarithm is undefined there.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Work on plain arrays so the values are paired by position. Two pandas
    # Series would otherwise be aligned on their index labels, which silently
    # pairs the wrong rows whenever the indexes differ. This also lets plain
    # lists be passed in.
    actual = np.asarray(actual_values, dtype=float)
    predicted = np.asarray(predicted_values, dtype=float)

    # Refuse mismatched inputs instead of letting numpy broadcast them
    if actual.shape != predicted.shape:
        raise ValueError(
            "actual_values and predicted_values must have the same shape, got %s and %s"
            % (actual.shape, predicted.shape)
        )

    n = len(actual)

    # log1p(x) == log(1 + x), but stays accurate for values close to zero
    log_diff = np.log1p(predicted) - np.log1p(actual)

    RMSE = np.sqrt(np.sum((log_diff ** 2) / n))
    return RMSE

    

## Feature Engineering

## Model  

Approach: fit separate models for the `casual` and `registered` rental counts, then add the two predictions together to estimate the total `count`.

In [7]:
#Feature columns passed to every model
#('atemp' and the raw 'datetime' column are not included; time enters through 'prog_age')
predictors = [
    'season',
    'holiday',
    'workingday',
    'weather',
    'temp',
    'humidity',
    'windspeed',
    'prog_age',
]

#Names of the two target columns, each modelled separately
casual = 'casual'
registered = 'registered'



### Decision Tree Regressor

In [8]:
#Fit the regressors: one shallow decision tree per rider type, both on the same predictors

#First, casual rides
#Instantiate a regressor with a capped depth and fit it to the casual rental counts
dt_casual = DecisionTreeRegressor(max_depth = 3).fit(train_data[predictors], train_data[casual])

#Then, registered rides
#Same configuration, fitted to the registered rental counts
dt_registered = DecisionTreeRegressor(max_depth = 3).fit(train_data[predictors], train_data[registered])

#Leave the fitted estimator as the last expression so the notebook displays its parameters
dt_registered



DecisionTreeRegressor(criterion='mse', max_depth=3, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [191]:
#Score the fitted trees on the training data (get_RMSE measures the error on the log scale)
#predict casual and registered rides
preds_casual = dt_casual.predict(train_data[predictors])
preds_registered = dt_registered.predict(train_data[predictors])

#combined predictions: total count = casual + registered
preds_count = preds_casual + preds_registered

actual = train_data['count']

RMSE = get_RMSE(actual_values = actual, predicted_values = preds_count)

#Parenthesized single-argument print works identically under Python 2 and Python 3
print("Root Mean Squared Error: " + str(RMSE))

Root Mean Squared Error: 1.38356940276


In [192]:
#Try on the dev data: score the same fitted trees on the held-out rows
dev_preds_casual = dt_casual.predict(dev_data[predictors])
dev_preds_registered = dt_registered.predict(dev_data[predictors])

#combined predictions: total count = casual + registered
dev_preds_count = dev_preds_casual + dev_preds_registered

actual = dev_data['count']

RMSE = get_RMSE(actual_values = actual, predicted_values = dev_preds_count)

#Parenthesized single-argument print works identically under Python 2 and Python 3
print("Root Mean Squared Error: " + str(RMSE))


Root Mean Squared Error: 1.38714682064


### Random Forest Regressor

In [None]:
rf