# Boston Housing Dataset

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn

In [None]:
# Load the Boston housing dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn — confirm the pinned version.
from sklearn.datasets import load_boston
boston = load_boston()

### Overview

In [None]:
# List the fields available on the loaded dataset object.
boston.keys()

In [None]:
# Dimensions of the feature matrix.
boston.data.shape

In [None]:
# Print the description text that ships with the dataset.
print(boston.DESCR)

### Creating the DF

In [None]:
# Wrap the raw feature matrix in a DataFrame, labelling each column with
# the corresponding feature name from the dataset.
bos = pd.DataFrame(data=boston.data, columns=boston.feature_names)

In [None]:
# Store the target next to the features as a PRICE column.
bos['PRICE'] = boston.target

In [None]:
# Preview the first rows of the combined frame.
bos.head()

### Modeling

In [None]:
from sklearn.linear_model import LinearRegression

# Separate the response from the predictors. PRICE must not stay among the
# predictors, otherwise the model would be trained on the value it predicts.
y = bos['PRICE']
X = bos.drop(columns=['PRICE'])

# Unfitted ordinary least squares estimator; it is trained in a later cell.
lm = LinearRegression()

lm.fit() -> fits a linear model

lm.predict() -> Predict Y using the linear model with estimated coefficients

lm.score() -> Returns the coefficient of determination (R^2). A measure of how well observed outcomes are replicated by the model, as the proportion of total variation of outcomes explained by the model. 

.coef_ gives the estimated coefficients and .intercept_ gives the estimated intercept.

In [None]:
# Fit on the full dataset (no hold-out yet; see the Validation section).
lm.fit(X, y)

In [None]:
# Show the intercept term of the fitted model.
print('intercept: %s' % lm.intercept_)

In [None]:
# Pair each fitted coefficient with the name of the feature it belongs to.
pd.Series(data=lm.coef_, index=X.columns)

In [None]:
# Actual prices (x axis) against the model's predictions (y axis) on the
# same data the model was fitted on.
plt.scatter(y, lm.predict(X))
# Red reference diagonal from (0, 0) to (50, 50): points on it are predicted exactly.
plt.plot([0,50], [0,50], color='red')

### Validation

#### Split

In [None]:
# train_test_split lives in sklearn.model_selection; the former
# sklearn.cross_validation module was deprecated in 0.18 and removed in 0.20.
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for testing; the fixed random_state makes the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)

In [None]:
# Preview the first rows of the training features.
X_train.head()

In [None]:
# Dimensions of the training feature set after the split.
X_train.shape

In [None]:
# Start from a fresh estimator (rebinding lm) and fit it on the training
# split only, so the test split stays unseen.
lm = LinearRegression()
lm.fit(X_train, y_train)

In [None]:
# Predict prices for the held-out test features.
pred = lm.predict(X_test)

In [None]:
# Display the predictions.
pred

In [None]:
# Actual test prices (x axis) against predicted prices (y axis).
plt.scatter(y_test, pred)
# Red reference diagonal from (0, 0) to (50, 50): points on it are predicted exactly.
plt.plot([0,50], [0,50], color='red')

In [None]:
# Exercise 1:

# Explore the prediction vector

# 1. Build two histograms:
# - real prices (y_test)
# - predicted prices

# 2. Do you see something strange in the result? Propose a solution

#### Metric

In [None]:
from sklearn.metrics import mean_squared_error

# Score the test-set predictions with scikit-learn's MSE helper, then
# display the value.
mse_error = mean_squared_error(y_test, pred) # mean squared error
mse_error

In [None]:
# MSE computed by hand: the mean of the squared residuals. Squares are
# already non-negative, so no abs() is needed here.
((y_test - pred) ** 2).mean()

In [None]:
# Exercise 2:

# Calculate mean absolute error by formula and by sklearn function 

In [None]:
from sklearn.metrics import mean_absolute_error

# ...

In [None]:
# Exercise 3:

# Compare results with models Lasso and Ridge:

# 1. Train Lasso and Ridge models with default alpha. What mean absolute error do you calculate?

# 2. Train Lasso and Ridge with different alphas = 0.1, 1, 10, 100 (hint: use a for loop).
#    Which model gives the best (lowest) MAE?

In [None]:
# Starting point for Exercise 3. Both estimators are bound to the same
# name, so the Ridge assignment below replaces the Lasso instance.
from sklearn.linear_model import Lasso
model = Lasso(alpha=0.1)

from sklearn.linear_model import Ridge
model = Ridge(alpha=1.0)

# ...

### Cross-Validation

In [None]:
from sklearn import grid_search

parameters = {'alpha':[0.1, 1.0, 10.0]}

model = Lasso() # we don't specify alpha directly, we'll specify multiple alphas to try below

gs = grid_search.GridSearchCV(
    model, 
    parameters,
    cv=5,
    verbose=1,
    scoring='mean_absolute_error' 
    # full list of metrics 
    # http://scikit-learn.org/stable/modules/model_evaluation.html
)
gs.fit(X, y)

In [None]:
gs.best_estimator_

In [None]:
gs.best_score_

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

model = DecisionTreeRegressor()

parameters = {
    'max_depth':[5, 10, None],
    'min_samples_split': [2, 10]
    }

gs = grid_search.GridSearchCV(
    model, 
    parameters,
    cv=5,
    verbose=1,
    scoring='mean_absolute_error' 
)
gs.fit(X, y)

In [None]:
# Estimator configured with the best-scoring parameter combination.
gs.best_estimator_

In [None]:
# Mean cross-validated score achieved by that best combination.
gs.best_score_

In [None]:
# LAB:

# Train the following models in a similar manner:

#     Random Forest (try changing parameters max_depth, min_samples_split, max_features)
#     http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html
        
#     Gradient Boosting (try changing parameters max_depth, learning_rate, max_features)
#     http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html