Import the required modules

In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from datetime import datetime
from pprint import pprint
from time import time
import logging
from sklearn.model_selection import train_test_split
# SK-learn libraries for learning.
from sklearn.pipeline import Pipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

#SK-learn libraries for transformation and pre-processing
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Custom classes for this assignment
from kaggle import feature_engineering as fe




Load in our data

In [3]:
# Raw Kaggle files: labelled training data and the unlabelled test set.
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')

# Every column that is not one of the three targets is treated as a predictor.
targets = ['count', 'casual', 'registered']
predictors = [name for name in train_df.columns if name not in targets]

# Each target is kept as a single-column DataFrame (not a Series).
y_count = train_df.loc[:, ['count']]
y_casual = train_df.loc[:, ['casual']]
y_registered = train_df.loc[:, ['registered']]

# One call splits the features and all three targets with the same row
# assignment; random_state pins the split. Note that the full train_df
# (target columns included) is what gets split into X_train / X_dev.
(
    X_train, X_dev,
    y_count_train, y_count_dev,
    y_casual_train, y_casual_dev,
    y_registered_train, y_registered_dev,
) = train_test_split(
    train_df,
    y_count,
    y_casual,
    y_registered,
    random_state=2,
)


We can define our pipeline here

In [7]:
# Column groups routed to the two branches of the FeatureUnion below.
categorical = ('season', 'holiday', 'workingday', )
# datetime isn't numerical, but needs to be in the numeric branch
numerical = ('datetime', 'weather', 'temp', 'atemp', 'humidity', 'windspeed',)
pipeline = Pipeline([
    # process cat & num separately, then join back together
    ('union', FeatureUnion([
        ('categorical', Pipeline([
            ('select_cat', fe.SelectCols(cols = categorical)),
            ('onehot', OneHotEncoder()),
        ])),
        ('numerical', Pipeline([
            ('select_num', fe.SelectCols(cols = numerical)),
            ('date', fe.DateFormatter()),
            # NOTE(review): ('datetime') is a plain str, not a 1-tuple like the
            # other `cols` arguments. Left unchanged because fe.SelectCols is
            # not visible here -- confirm how it treats a str before switching
            # to ('datetime',).
            ('drop_datetime', fe.SelectCols(cols = ('datetime'), invert = True)),
            ('temp', fe.ProcessNumerical(cols_to_square = ('temp', 'atemp', 'humidity'))),
            # ('bad_weather', fe.BinarySplitter(col = 'weather', threshold = 2)),
            # ('filter', fe.PassFilter(col='atemp', lb = 15, replacement_style = 'mean'))
            ('scale', StandardScaler()),
        ])),
    ])),
    # Seeded so repeated runs grow the same forest (same seed as the split).
    ('clf', RandomForestRegressor(n_estimators = 100, random_state = 2)),
])

# Hyper-parameter grid read by gs(). It must be defined (it was commented
# out), otherwise gs() raises NameError on a fresh kernel. Keys use the
# `<step>__<param>` form to address the 'clf' step of the pipeline.
parameters = {
    'clf__n_estimators': (100,),
}



In [8]:
def gs(y_train):
    """Grid-search the module-level ``pipeline`` for one target.

    Reads the globals ``pipeline``, ``parameters``, ``train_df`` and
    ``predictors``. The search is fitted on a copy of ``train_df[predictors]``
    (the whole training frame), prints timing, the best cross-validation score
    and the winning value of every parameter in the grid.

    Parameters
    ----------
    y_train : array-like
        Target values aligned row-for-row with ``train_df``. A single-column
        2-D object (e.g. ``train_df[['count']]``) is flattened to 1-d before
        fitting; anything else is passed through unchanged.

    Returns
    -------
    GridSearchCV
        The fitted search object (usable for ``.predict``).
    """
    # Regressors expect a 1-d target; a (n, 1) column vector makes sklearn
    # emit a conversion warning at the final estimator's fit. Only the
    # single-column case is flattened so multi-output targets are untouched.
    y_fit = y_train
    if getattr(y_train, 'ndim', 1) == 2 and y_train.shape[1] == 1:
        y_fit = np.ravel(y_train)

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=1, verbose=1)
    print("Performing grid search...")
    print("pipeline:", [name for name, _ in pipeline.steps])
    print("parameters:")
    pprint(parameters)
    t0 = time()
    # .copy() hands the pipeline its own frame, so custom transformers that
    # assign columns do not write into train_df.
    grid_search.fit(train_df[predictors].copy(), y_fit)
    print("done in %0.3fs" % (time() - t0))
    print()

    print("Best score: %0.3f" % grid_search.best_score_)
    print("Best parameters set:")
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print("\t%s: %r" % (param_name, best_parameters[param_name]))
    return grid_search

# Fit one search per target (total, casual, registered) and predict the
# Kaggle test set with each fitted search, in that order.
search_count = gs(y_count)
preds_count = search_count.predict(test_df)

search_casual = gs(y_casual)
preds_casual = search_casual.predict(test_df)

search_registered = gs(y_registered)
preds_registered = search_registered.predict(test_df)

# Index the test frame by timestamp so the CSVs are keyed on datetime.
test_df.index = pd.DatetimeIndex(test_df['datetime'])

# Two submissions: the directly modelled total, and the sum of the two
# component models. test_df['count'] is overwritten for each file in turn.
submissions = (
    (preds_count, 'data/zi_count_preds.csv'),
    (preds_casual + preds_registered, 'data/zi_combined_preds.csv'),
)
for predicted, out_path in submissions:
    test_df['count'] = predicted
    test_df[['count']].to_csv(out_path)




done in 188.048s
()
Best score: 0.792
Best parameters set:
	clf__n_estimators: 100


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.9min finished


Performing grid search...
('pipeline:', ['union', 'clf'])
parameters:
{'clf__n_estimators': (100,)}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
done in 164.813s
()
Best score: 0.715
Best parameters set:
	clf__n_estimators: 100


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.7min finished


Performing grid search...
('pipeline:', ['union', 'clf'])
parameters:
{'clf__n_estimators': (100,)}
Fitting 3 folds for each of 1 candidates, totalling 3 fits
done in 193.133s
()
Best score: 0.696
Best parameters set:
	clf__n_estimators: 100


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.0min finished
  self._final_estimator.fit(Xt, y, **fit_params)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X['date'] = pd.DatetimeIndex(X['datetime']).strftime("%Y%m%d")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  X['day'] = pd.DatetimeIndex(X['datetime']).strftime("%j")


Performing grid search...
('pipeline:', ['union', 'clf'])
parameters:
{'clf__n_estimators': (100,)}
Fitting 3 folds for each of 1 candidates, totalling 3 fits


In [0]:
# Kaggle submission scores:
#   count only: Your submission scored 0.48272
#   casual + registered: 0.47310
