In [30]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
from sklearn.linear_model import LinearRegression
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import MultinomialNB

# Set paths to data (downloaded from http://archive.ics.uci.edu/ml/machine-learning-databases/00275/)
# The data directory can be overridden with the BIKE_SHARING_DATA_DIR
# environment variable so the notebook can run on other machines; when the
# variable is unset the original local directory is used.
data_path = os.environ.get('BIKE_SHARING_DATA_DIR', '/Users/surabhigupta/Downloads/kaggle/')
daily_path = os.path.join(data_path, 'day.csv')
hourly_path = os.path.join(data_path, 'hour.csv')

In [31]:
# Load in data and select variables
daily_data = pd.read_csv(daily_path)
hourly_data = pd.read_csv(hourly_path)

# Feature matrices; the hourly set has the same columns plus the hour of day ('hr').
daily_X = daily_data[['season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday',
                      'weathersit', 'temp', 'atemp', 'hum', 'windspeed']]
# Targets are kept as single-column DataFrames (2-D), one per rider count.
daily_cnt, daily_casual, daily_registered = daily_data[['cnt']], daily_data[['casual']], daily_data[['registered']]

hourly_X = hourly_data[['season', 'yr', 'mnth', 'hr', 'holiday', 'weekday', 'workingday',
                        'weathersit', 'temp', 'atemp', 'hum', 'windspeed']]
hourly_cnt, hourly_casual, hourly_registered = hourly_data[['cnt']], hourly_data[['casual']], hourly_data[['registered']]

# Single-argument print(...) produces the same text on Python 2 and Python 3.
print('daily X shape: %s' % (daily_X.shape,))
print('daily y shape: %s' % (daily_cnt.shape,))
print('hourly X shape: %s' % (hourly_X.shape,))
print('hourly y shape: %s' % (hourly_cnt.shape,))

# Split data into training and testing data (80% train, 20% test).
# This is a positional split: the leading ceil(80%) of the rows are used for
# training and all remaining rows for testing. The sizes are derived from the
# data instead of being hard-coded; for the 731-row daily and 17379-row hourly
# files this gives 585/146 and 13904/3475 rows respectively.
# NOTE: this is NOT a split by day of month (first 19 days vs. the rest), as
# an earlier comment here claimed.
# Assumes the CSV rows are in date order, so the test rows come after the
# training rows in time -- TODO confirm.
daily_n_train = (4 * len(daily_data) + 4) // 5    # integer ceil(0.8 * n)
hourly_n_train = (4 * len(hourly_data) + 4) // 5  # integer ceil(0.8 * n)

daily_train_X, daily_train_cnt, daily_train_casual, daily_train_registered \
= daily_X[:daily_n_train], daily_cnt[:daily_n_train], daily_casual[:daily_n_train], daily_registered[:daily_n_train]
daily_test_X, daily_test_cnt, daily_test_casual, daily_test_registered \
= daily_X[daily_n_train:], daily_cnt[daily_n_train:], daily_casual[daily_n_train:], daily_registered[daily_n_train:]

hourly_train_X, hourly_train_cnt, hourly_train_casual, hourly_train_registered \
= hourly_X[:hourly_n_train], hourly_cnt[:hourly_n_train], hourly_casual[:hourly_n_train], hourly_registered[:hourly_n_train]
hourly_test_X, hourly_test_cnt, hourly_test_casual, hourly_test_registered \
= hourly_X[hourly_n_train:], hourly_cnt[hourly_n_train:], hourly_casual[hourly_n_train:], hourly_registered[hourly_n_train:]

print('daily train X shape: %s' % (daily_train_X.shape,))
print('daily test X shape: %s' % (daily_test_X.shape,))
print('hourly train X shape: %s' % (hourly_train_X.shape,))
print('hourly test X shape: %s' % (hourly_test_X.shape,))

daily X shape: (731, 11)
daily y shape: (731, 1)
hourly X shape: (17379, 12)
hourly y shape: (17379, 1)
daily train X shape: (585, 11)
daily test X shape: (146, 11)
hourly train X shape: (13904, 12)
hourly test X shape: (3475, 12)


In [47]:
# Create the linear regression estimator (with an intercept term). Nothing is
# fitted here: this single object is re-fit for each model below, so after any
# fit it only holds the coefficients of the most recently fitted model.
lr = LinearRegression(fit_intercept=True)

def getModelDescription(lr, data, actual, regressor_name, predictor):
    """Print a short evaluation report for an already fitted regression model.

    Parameters:
        lr             -- fitted estimator exposing ``coef_``, ``predict(data)``
                          and ``score(data, actual)``.
        data           -- feature rows to evaluate the model on.
        actual         -- observed target values for ``data``.
        regressor_name -- label for the data set, used in the report heading.
        predictor      -- label for the target, used in the report heading.

    Prints the heading, the coefficients, the mean squared error of the
    predictions and the value returned by ``lr.score``. Returns None.
    """
    print("%s predicting %s count:" % (regressor_name, predictor))
    print('Coefficients: \n%s' % (lr.coef_,))
    # Flatten both sides so the difference is element-wise and the mean is a
    # plain scalar, whatever container types (array / DataFrame) are passed in.
    errors = np.asarray(lr.predict(data)).ravel() - np.asarray(actual).ravel()
    # This value is the MEAN of the squared errors; it was previously
    # mislabelled as the "Residual sum of squares".
    print("Mean squared error: %.2f" % np.mean(errors ** 2))
    print('R-squared: %.2f' % lr.score(data, actual))
    print('_____________________\n')


# Models 1-6: re-fit the shared estimator on each training set and report its
# performance on the matching test set. The models run in the listed order, so
# `lr` ends up fitted on the hourly registered-count data.
# Each entry: (train X, train y, test X, test y, data label, target label).
model_specs = [
    # Model 1: daily data predicting total count
    (daily_train_X, daily_train_cnt, daily_test_X, daily_test_cnt, "Daily data", "Total"),
    # Model 2: daily data predicting casual count
    (daily_train_X, daily_train_casual, daily_test_X, daily_test_casual, "Daily data", "Casual"),
    # Model 3: daily data predicting registered count
    (daily_train_X, daily_train_registered, daily_test_X, daily_test_registered, "Daily data", "Registered"),
    # Model 4: hourly data predicting total count
    (hourly_train_X, hourly_train_cnt, hourly_test_X, hourly_test_cnt, "Hourly data", "Total"),
    # Model 5: hourly data predicting casual count
    (hourly_train_X, hourly_train_casual, hourly_test_X, hourly_test_casual, "Hourly data", "Casual"),
    # Model 6: hourly data predicting registered count
    (hourly_train_X, hourly_train_registered, hourly_test_X, hourly_test_registered, "Hourly data", "Registered"),
]

for fit_X, fit_y, eval_X, eval_y, data_label, target_label in model_specs:
    lr.fit(fit_X, fit_y)
    getModelDescription(lr, eval_X, eval_y, data_label, target_label)

# Model 7: using single variables to predict registered count
def fitSingleVariable(name):
    """Fit the shared linear model on one daily column and report test results.

    name -- column of ``daily_data`` used as the only predictor of the daily
            registered count.

    The model is trained on the rows of the daily training split and evaluated
    on the rows of the daily test split. Previously the hourly split indices
    (13904 / 3475) were applied to the much shorter daily data, so both the
    "train" and the "test" slice were the entire data set and the reported
    score was an in-sample score.
    """
    predictor = daily_data[[name]]
    # Select rows by the index labels of the existing daily split so this model
    # is always evaluated on exactly the same rows as the other daily models.
    train_X = predictor.loc[daily_train_registered.index]
    test_X = predictor.loc[daily_test_registered.index]
    lr.fit(train_X, daily_train_registered)
    getModelDescription(lr, test_X, daily_test_registered, name, "registered")

# Report a separate single-variable model for every daily input feature.
inputs = [
    'season', 'yr', 'mnth', 'holiday', 'weekday', 'workingday',
    'weathersit', 'temp', 'atemp', 'hum', 'windspeed',
]
for predictor in inputs:
    fitSingleVariable(predictor)



Daily data predicting Total count:
Coefficients: 
[[  296.41065098  2029.31441602    27.86155032  -375.25815253
     45.78506018    70.46160976  -502.69904618  -641.66976487
   6309.16194357 -1010.35711168 -2243.59684665]]
Residual sum of squares: 1367297.58
R-squared: 0.61
_____________________

Daily data predicting Casual count:
Coefficients: 
[[   47.61800943   280.02577655   -11.01262247  -223.01487209
     18.11031724  -799.3919419    -99.15141536  -257.27046488
   2455.52900149  -377.54168809  -871.78962617]]
Residual sum of squares: 158317.20
R-squared: 0.68
_____________________

Daily data predicting Registered count:
Coefficients: 
[[  248.79264154  1749.28863947    38.87417279  -152.24328044
     27.67474294   869.85355165  -403.54763082  -384.39929999
   3853.63294208  -632.81542359 -1371.80722048]]
Residual sum of squares: 1026218.07
R-squared: 0.59
_____________________

Hourly data predicting Total count:
Coefficients: 
[[   8.25532284   76.79238374    2.50427247    7.2