In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from datetime import datetime
from pprint import pprint
from time import time
import logging
from sklearn.model_selection import train_test_split
# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import preprocessing

#SK-learn libraries for transformation and pre-processing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Custom classes for this assignment
import feature_engineering as fe

In [2]:
# Read data/train.csv and data/test.csv (in that order) into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Define pipeline
# Column names routed to each branch of the FeatureUnion below.
categorical = ('season', 'holiday', 'workingday', )
# datetime isn't numerical, but needs to be in the numeric branch
numerical = ('datetime', 'weather', 'temp', 'atemp', 'humidity', 'windspeed',)
pipeline = Pipeline([
    # process cat & num separately, then join back together
    ('union', FeatureUnion([
        ('categorical', Pipeline([
            ('select_cat', fe.SelectCols(cols = categorical)),
            ('onehot', OneHotEncoder()),
        ])),
        ('numerical', Pipeline([
            ('select_num', fe.SelectCols(cols = numerical)),
            ('date', fe.DateFormatter()),
            # NOTE(review): ('datetime') is a plain string, not a 1-tuple
            # (there is no trailing comma), unlike the tuples passed to the
            # other SelectCols steps. Left as-is because SelectCols is not
            # visible here -- confirm it treats a bare string as one column.
            ('drop_datetime', fe.SelectCols(cols = ('datetime'), invert = True)),
            ('temp', fe.ProcessNumerical(cols_to_square = ('temp', 'atemp', 'humidity'))),
            # ('bad_weather', fe.BinarySplitter(col = 'weather', threshold = 2)),
            # ('filter', fe.PassFilter(col='atemp', lb = 15, replacement_style = 'mean'))
            ('scale', StandardScaler()),
        ])),
    ])),
    # oob_score is documented as a boolean. A string such as 'TRUE' is not a
    # documented value, and any non-empty string (even 'FALSE') is truthy,
    # so pass the real boolean.
    ('clf', RandomForestRegressor(n_estimators = 100,
                                  oob_score = True,
                                  n_jobs = -1,
                                  random_state = 50)),
    # Alternative regressor configuration, kept for reference:
#     ('clf', RandomForestRegressor(n_estimators=100,
#                                   oob_score=True,
#                                   n_jobs = -1,
#                                   random_state =50,
#                                   max_features = "auto",
#                                   min_samples_leaf = 50))
])

# Helper function to calculate root mean squared logarithmic error (RMSLE)
def get_RMSE(actual_values, predicted_values):
    """Return the root mean squared logarithmic error of the predictions.

    Despite the name, the error is measured on log(x + 1) of both inputs,
    i.e. sqrt(mean((log(predicted + 1) - log(actual + 1)) ** 2)).

    Parameters
    ----------
    actual_values : array-like of numbers (list, ndarray, pandas Series, ...)
        Observed values.
    predicted_values : array-like of numbers
        Predicted values, in the same order as ``actual_values``.

    Returns
    -------
    numpy float
        The error; 0.0 when the inputs are identical.
    """
    # Coerce to plain arrays so that lists are accepted and so that two
    # pandas Series are compared position by position instead of being
    # aligned on their index labels (which would introduce NaNs that the
    # sum silently skips).
    actual = np.asarray(actual_values, dtype=float)
    predicted = np.asarray(predicted_values, dtype=float)
    n = len(actual_values)
    # log1p(x) == log(x + 1), but stays accurate for x close to zero.
    log_diff = np.log1p(predicted) - np.log1p(actual)
    RMSE = np.sqrt(np.sum((log_diff ** 2) / n))
    return RMSE

def train_dev_model_search(registered_or_casual,parameters):
    """Grid-search the module-level ``pipeline`` and report the dev-set error.

    Fits a GridSearchCV over ``parameters`` on the global ``train_data``,
    prints the best value found for each searched parameter, then predicts
    on the global ``dev_data`` and prints the error from ``get_RMSE`` along
    with the elapsed time.

    Parameters
    ----------
    registered_or_casual : str
        Name of the target column to model (the callers in this notebook
        pass 'casual' or 'registered').
    parameters : dict
        Parameter grid handed to GridSearchCV; its keys are also used to
        look up the best values for printing.

    Returns
    -------
    None
        Results are only printed.
    """
    print("Performing grid search...")
    t0 = time()
    # NOTE(review): candidates are ranked by (negative) mean squared error on
    # the raw target, while the dev-set figure printed below is computed on
    # log(x + 1) values -- confirm that this mismatch is intended.
    gs = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1, scoring='neg_mean_squared_error')
    # Every column of the frame being fit, except the three ride-count
    # columns, is used as a feature.
    features = [c for c in train_data.columns if c not in ['count', 'casual', 'registered']]
    gs.fit(train_data[features], train_data[registered_or_casual])
    print("Best parameters set:")
    best_param = gs.best_params_
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_param[param_name]))
    predicted_y = gs.predict(dev_data[features])
    rmse = get_RMSE(actual_values = dev_data[registered_or_casual], predicted_values = predicted_y)
    # Single-argument print() calls produce the same text on Python 2 and 3.
    print("RMSE:  %s" % rmse)
    print("Done in %0.3fs" % (time() - t0))
    print("")

In [4]:
# Split the labelled data by day of month: rows dated on or before the 16th
# are used for training, the remaining rows form the dev set.
day_of_month = pd.DatetimeIndex(train_df['datetime']).day
train_data = train_df[day_of_month <= 16]
dev_data = train_df[day_of_month > 16]

# Test for casual and registered separately
parameters = {
    'clf__n_estimators': (100,110,),
#     'clf__learning_rate': (0.05,),
#     'clf__max_depth': (10,),
#     'clf__min_samples_leaf': (20,),
}

# Single-argument print() calls behave the same on Python 2 and Python 3.
print("Casual rides")
train_dev_model_search('casual',parameters)

# NOTE(review): a stray commented-out entry, 'clf__n_estimators': (80,),
# sat here -- presumably an alternative grid once tried for registered rides.
print("Registered rides")
train_dev_model_search('registered',parameters)

Casual rides
Performing grid search...
Fitting 3 folds for each of 2 candidates, totalling 6 fits


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X['date'] = pd.DatetimeIndex(X['datetime']).strftime("%Y%m%d")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X['day'] = pd.DatetimeIndex(X['datetime']).strftime("%j")


KeyboardInterrupt: 