# Class 7 - Solution Code

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns

%matplotlib inline

# Part 1: Mean Squared Error (MSE) Loss Function

API Docs for [sklearn.linear_model.LinearRegression](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

API Docs for [sklearn.metrics.mean_squared_error](http://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html)

### Create sample data and fit a model

In [None]:
# import scikit-learn linear model for Linear Regression
from sklearn import linear_model

# import scikit-learn metrics for MSE
from sklearn import metrics

# seed NumPy's global generator so the random jitter added further down in
# this cell (and any later draw from np.random) repeats on every fresh run
np.random.seed(0)

# generate synthetic data: 100 rows on the line y = x
df = pd.DataFrame({'x': range(100), 'y': range(100)})

# generate biased copy: rows labelled 0 through 20 get a constant y of 20
# (.loc slicing is label based and includes the end label)
biased_df = df.copy()
biased_df.loc[:20, 'y'] = 20

# add jitter to both
def append_jitter(series):
    """Return `series` with random noise added to every element.

    Parameters
    ----------
    series : sized numeric sequence (the notebook passes pandas Series)
        Values to perturb.

    Returns
    -------
    The element-wise sum of `series` and one draw per element from
    ``np.random.random_sample`` (uniform on [0, 1)).
    """
    # size follows the input, so the helper is not tied to 100-row data
    jitter = np.random.random_sample(size=len(series))
    return series + jitter

# jitter both columns of both frames; the random draws happen in the order
# df x, df y, biased_df x, biased_df y
for frame in (df, biased_df):
    for column in ('x', 'y'):
        frame[column] = append_jitter(frame[column])

In [None]:
# scatter plot of the unbiased data (fit_reg=False: no regression line drawn)
sns.lmplot(data=df, x='x', y='y', fit_reg=False);

In [None]:
# fit a linear regression to the unbiased data, then report its mean
# squared error on that same data
lm = linear_model.LinearRegression()
lm.fit(df[['x']], df['y'])
print(metrics.mean_squared_error(y_true=df['y'], y_pred=lm.predict(df[['x']])))

In [None]:
# scatter plot of the biased copy (fit_reg=False: no regression line drawn)
sns.lmplot(data=biased_df, x='x', y='y', fit_reg=False);

In [None]:
# fit a linear regression to the biased copy
lm = linear_model.LinearRegression()
lm.fit(biased_df[['x']], biased_df['y'])
# NOTE(review): the error is measured against df, not biased_df --
# presumably on purpose, to show how the biased fit does on the original
# data; confirm
print(metrics.mean_squared_error(y_true=df['y'], y_pred=lm.predict(df[['x']])))

# Part 2: Cross Validation

API Docs for [sklearn.cross_validation.KFold](http://scikit-learn.org/stable/modules/generated/sklearn.cross_validation.KFold.html)

### Intro to cross validation with bike share data from last time. We will be modeling casual ridership. 

In [None]:
# import cross validation
from sklearn import cross_validation

# read the bikeshare CSV from the dataset folder (relative path)
wd = '../../assets/dataset/bikeshare/'
bikeshare_csv = wd + 'bikeshare.csv'
bikeshare = pd.read_csv(bikeshare_csv)

### Create dummy variables and set outcome (dependent) variable

In [None]:
# one indicator column per distinct weathersit value, named weathersit_<value>
weather_dummies = pd.get_dummies(bikeshare['weathersit'], prefix='weathersit')

# feature matrix: temp and hum plus the weathersit indicators
X = bikeshare[['temp', 'hum']].join(weather_dummies)

# dependent variable: the casual column
y = bikeshare['casual']

In [None]:
# preview the first five rows of the feature matrix
X.head(n=5)

### Create a cross validation with 5 folds

In [None]:
# split the rows into 5 shuffled folds; a fixed random_state keeps the fold
# membership (and so the per-fold scores) the same from run to run
kf = cross_validation.KFold(len(X), n_folds=5, shuffle=True, random_state=0)

In [None]:
mse_values = []
scores = []
n= 0
print "~~~~ CROSS VALIDATION each fold ~~~~"
for train_index, test_index in kf:
    lm = linear_model.LinearRegression().fit(X.iloc[train_index], y.iloc[train_index])
    mse_values.append(metrics.mean_squared_error(y.iloc[test_index], lm.predict(X.iloc[test_index])))
    scores.append(lm.score(X, y))
    n+=1
    print 'Model', n
    print 'MSE:', mse_values[n-1]
    print 'R2:', scores[n-1]


print "~~~~ SUMMARY OF CROSS VALIDATION ~~~~"
print 'Mean of MSE for all folds:', np.mean(mse_values)
print 'Mean of R2 for all folds:', np.mean(scores)

In [None]:
lm = linear_model.LinearRegression().fit(X, y)
print "~~~~ Single Model ~~~~"
print 'MSE of single model:', metrics.mean_squared_error(y, lm.predict(X))
print 'R2: ', lm.score(X, y)

### Check
While the cross validated approach here generated more overall error, which of the two approaches would predict new data more accurately: the single model or the cross validated, averaged one? Why?

**Answer**: 

# Part 3: Regularization

API Docs for [sklearn.linear_model.Lasso](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)

API Docs for [sklearn.linear_model.Ridge](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)

### Effects of Regularization on MSE and R2

In [None]:
# fit linear regression using no regularization (OLS)
lm = linear_model.LinearRegression().fit(X, y)
print "~~~ No regularization (OLS) ~~~"
print 'OLS MSE: ', metrics.mean_squared_error(y, lm.predict(X))
print 'OLS R2:', lm.score(X, y)

# fit linear regression using L1 regularization (Lasso)
lm = linear_model.Lasso().fit(X, y)
print "~~~ L1 regularization (Lasso) ~~~"
print 'Lasso MSE: ', metrics.mean_squared_error(y, lm.predict(X))
print 'Lasso R2:', lm.score(X, y)

# fit linear regression using L2 regularization (Ridge)
lm = linear_model.Ridge().fit(X, y)
print "~~~ L2 regularization (Ridge) ~~~"
print 'Ridge MSE: ', metrics.mean_squared_error(y, lm.predict(X))
print 'Ridge R2:', lm.score(X, y)

### Figuring out the alphas can be done by "hand"

In [None]:
alphas = np.logspace(-10, 10, 21)
for a in alphas:
    print 'Alpha:', a
    lm = linear_model.Ridge(alpha=a)
    lm.fit(X, y)
    print metrics.mean_squared_error(y, lm.predict(X))

# Part 4: Grid Search

API Docs for [sklearn.grid_search.GridSearchCV](http://scikit-learn.org/stable/modules/generated/sklearn.grid_search.GridSearchCV.html)

### Or we can use grid search to find alpha

In [None]:
# import grid search
from sklearn import grid_search

# candidate alphas: 21 log-spaced values from 1e-10 to 1e10
alphas = np.logspace(start=-10, stop=10, num=21)

# grid search over those alphas for a Ridge model, using the
# 'mean_squared_error' scorer to compare candidates
ridge_param_grid = {'alpha': alphas}
gs = grid_search.GridSearchCV(
    linear_model.Ridge(),
    ridge_param_grid,
    scoring='mean_squared_error')
gs.fit(X, y)

### Get best estimator to use

In [None]:
# show the estimator the grid search selected
print(gs.best_estimator_)

### Get all estimators and their corresponding performance

In [None]:
# show the recorded score entries for every parameter setting tried
print(gs.grid_scores_)

# Part 5: Gradient Descent

API Docs for [sklearn.linear_model.SGDRegressor](http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html)

### Let's start off with our own implementation of the gradient descent algorithm

In [None]:
# Toy descent: starting from `start`, keep moving by whichever entry of
# `steps` brings the value closer to `num_to_approach`; finish once neither
# candidate is an improvement.
num_to_approach, start, steps, optimized = 6.2, 0., [-1, 1], False
while not optimized:
    # absolute distance: a signed difference goes negative once `start` is
    # above the target, and no candidate distance could then beat it
    current_distance = np.abs(num_to_approach - start)
    got_better = False
    next_steps = [start + i for i in steps]
    for n in next_steps:
        distance = np.abs(num_to_approach - n)
        if distance < current_distance:
            got_better = True
            # single-argument print(...) produces the same text under the
            # Python 2 print statement and the Python 3 print function
            print('%s %s %s' % (distance, 'is better than', current_distance))
            current_distance = distance
            start = n
    if got_better:
        print('%s %s' % ('found better solution! using', current_distance))
    else:
        optimized = True
        print('%s %s %s' % (start, 'is closest to', num_to_approach))


### Now let's add a stopping criterion to the algorithm

In [None]:
# Same toy descent as before, but capped: the loop gives up once more than
# 3 iterations have been started, whether or not it has converged.
num_to_approach, start, steps, optimized = 6.2, 0., [-1, 1], False
n_iter = 0
while not optimized:
    if n_iter > 3:
        print('stopping iterations')
        break
    n_iter += 1
    # absolute distance: a signed difference goes negative once `start` is
    # above the target, and no candidate distance could then beat it
    current_distance = np.abs(num_to_approach - start)
    got_better = False
    next_steps = [start + i for i in steps]
    for n in next_steps:
        distance = np.abs(num_to_approach - n)
        if distance < current_distance:
            got_better = True
            # single-argument print(...) produces the same text under the
            # Python 2 print statement and the Python 3 print function
            print('%s %s %s' % (distance, 'is better than', current_distance))
            current_distance = distance
            start = n
    if got_better:
        print('%s %s' % ('found better solution! using', current_distance))
    else:
        optimized = True
        print('%s %s %s' % (start, 'is closest to', num_to_approach))


### Finally, let's use the Stochastic Gradient Descent (SGD) class from scikit-learn

In [None]:
lm = linear_model.SGDRegressor()
lm.fit(X, y)
print "Gradient Descent MSE:", metrics.mean_squared_error(y, lm.predict(X))
print "Gradient Descent R2:", lm.score(X, y)

### Check:
Untuned, how well did SGD perform compared to OLS?

Previous Result for OLS (from above):
```
~~~~ SUMMARY OF CROSS VALIDATION ~~~~
Mean of MSE for all folds: 1780.97924083
Mean of R2 for all folds: 0.306643649561
```

**Answer**: 