# Guided Practice: Multiple Regression Analysis using citi bike data 

In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import linear_model, metrics

import statsmodels.api as sm

%matplotlib inline

bike_data = pd.read_csv('https://github.com/ga-students/DAT-NYC-37/raw/master/lessons/lesson-07/assets/dataset/bikeshare.csv')
bike_data.head(3)

Unnamed: 0,instant,dteday,season,yr,mnth,hr,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,casual,registered,cnt
0,1,2011-01-01,1,0,1,0,0,6,0,1,0.24,0.2879,0.81,0.0,3,13,16
1,2,2011-01-01,1,0,1,1,0,6,0,1,0.22,0.2727,0.8,0.0,8,32,40
2,3,2011-01-01,1,0,1,2,0,6,0,1,0.22,0.2727,0.8,0.0,5,27,32


In [131]:
from sklearn import feature_selection, linear_model

# From last class...
def get_linear_model_metrics(X, y):
    model = linear_model.LinearRegression()   # Specify the model
    pvals, fscores = feature_selection.f_regression(X, y)  # Defining the model
    # get the pvalue of X given y. Ignore f-stat for now.
    
    # start with an empty linear regression object
    # .fit() runs the linear regression function on X and y
    model.fit(X, y)

    residuals = (y - model.predict(X)).values

    # print the necessary values
    # *TODO*: Describe in your words
    print 'F Scores:', fscores
    print 'Coefficients:', model.coef_
    print 'y-intercept:', model.intercept_
    print 'R-Squared:', model.score(X,y)
    print
    
    # keep the model
    return model

# Building a model to predict guest ridership
With a partner, complete this code together and visualize the correlations of all the numerical features built into the data set.

We want to:
- Id categorical variables
- Create dummies (weather situation is done for you in the starter code)
- Find at least two more features that are not correlated with current features, but could be strong indicators for predicting guest riders.

In [132]:
#create new names for our new dummy variables
weather = pd.get_dummies(bike_data.weathersit)
weather.columns = ['weather_' + str(i) for i in weather.columns]

season = pd.get_dummies(bike_data.season)
season.columns = ['season_' + str(i) for i in season.columns]

weekday = pd.get_dummies(bike_data.weekday)
weekday.columns = ['weekday_' + str(i) for i in weekday.columns]

hr = pd.get_dummies(bike_data.hr)
hr.columns = ['hr_' + str(i) for i in hr.columns]

month = pd.get_dummies(bike_data.mnth)
month.columns = ['month_' + str(i) for i in month.columns]

#join those new variables back into the larger dataset
bikemodel_data = pd.concat([bike_data, weather, season, month, hr, weekday], axis=1)

In [133]:
# -- my code starts here ---

y = bike_data['casual']
X = bikemodel_data.drop([
        'instant',   # This is a date column, and is neither categorical nor numerical
        'dteday',   # This is a date category
        'weather_4',  # (We discussed this in class)
        'season_4',  # (We discussed this in class)
        'weekday_6',  # (We discussed this in class)
        'month_11',  # (We discussed this in class)
        'hr_0',  # (We discussed this in class)
        'weathersit', 
        'season', 
        'weekday',
        'workingday',
        'hr',
        'mnth',
        'casual',
        'registered', 
        'temp', 
        'cnt'
    ], axis=1) # My features

# We've only kept these columns
print X.columns

# The baseline. 
get_linear_model_metrics(X, y)

Index([u'yr', u'holiday', u'atemp', u'hum', u'windspeed', u'weather_1',
       u'weather_2', u'weather_3', u'season_1', u'season_2', u'season_3',
       u'month_1', u'month_2', u'month_3', u'month_4', u'month_5', u'month_6',
       u'month_7', u'month_8', u'month_9', u'month_10', u'month_12', u'hr_1',
       u'hr_2', u'hr_3', u'hr_4', u'hr_5', u'hr_6', u'hr_7', u'hr_8', u'hr_9',
       u'hr_10', u'hr_11', u'hr_12', u'hr_13', u'hr_14', u'hr_15', u'hr_16',
       u'hr_17', u'hr_18', u'hr_19', u'hr_20', u'hr_21', u'hr_22', u'hr_23',
       u'weekday_0', u'weekday_1', u'weekday_2', u'weekday_3', u'weekday_4',
       u'weekday_5', u'weekday_6'],
      dtype='object')
F Scores: [  8.09908774e-080   3.15814032e-005   0.00000000e+000   0.00000000e+000
   8.66781628e-033   3.75616929e-073   3.43170021e-022   1.57718666e-055
   7.90819136e-239   1.70941845e-060   1.20544330e-119   7.20393520e-107
   6.04893370e-081   7.48894210e-006   9.89721401e-008   2.10345324e-034
   2.02298294e-036   1.8652

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [None]:
f_r


## Groups of 3: Building model to predict guest ridership 


#### Pay attention to:
* Which variables would make sense to dummy (because they are categorical, not continuous)? 
* the distribution of riders (should we rescale the data?)  
* checking correlations with variables and guest riders  
* having a feature space (our matrix) with low multicollinearity  
* the linear assumption -- given all feature values being 0, should we have no ridership? negative ridership? positive ridership?
* What features might explain ridership but aren't included in the data set? 

### You're done when:  
If your model has an r-squared above .4, this a relatively effective model for the data available. Kudos! Move on to the bonus!

In [None]:
# your code here...

bike_data.columns

In [None]:
# and here

In [None]:
# add as many cells as you need :) 

#### 1: What's the strongest predictor? 

Answer:

#### 2: How well did your model do? 

Answer:

#### 3: How can you improve it? 

Answer:

### Bonus:
    
We've completed a model that explains casual guest riders. Now it's your turn to build another model, using a different y (outcome) variable: registered riders.

**Bonus 1:** What's the strongest predictor? 

**Bonus 2:** How well did your model do? 

**Bonus 3:** How can you improve it? 

### Additional Resources:

- Good explanation of when to apply log scaling: http://stats.stackexchange.com/a/28007