In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
%matplotlib inline

Regression
============

Load the Boston housing dataset:

In [2]:
# Load the Boston housing dataset shipped with scikit-learn and list the
# fields available on the returned container.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2 — confirm the installed version still provides it before re-running.
from sklearn.datasets import load_boston
boston = load_boston()
boston.keys()

dict_keys(['target', 'data', 'DESCR', 'feature_names'])

In [3]:
# Print the dataset's built-in description to see what each attribute means;
# the task is to predict housing prices.
print(boston.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [4]:
# The features are stored as a NumPy array; inspect its shape.
boston.data.shape

(506, 13)

In [5]:
# Preview the first rows of the features as a table labelled with the
# dataset's feature names.
pd.DataFrame(
    boston.data,
    columns=boston.feature_names,
).head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98
1,0.02731,0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14
2,0.02729,0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
3,0.03237,0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
4,0.06905,0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33


In [6]:
# Shape of the target array (compare with the first dimension of boston.data).
boston.target.shape

(506,)

In [7]:
# Target values, expressed in thousands of dollars.
boston.target

array([ 24. ,  21.6,  34.7,  33.4,  36.2,  28.7,  22.9,  27.1,  16.5,
        18.9,  15. ,  18.9,  21.7,  20.4,  18.2,  19.9,  23.1,  17.5,
        20.2,  18.2,  13.6,  19.6,  15.2,  14.5,  15.6,  13.9,  16.6,
        14.8,  18.4,  21. ,  12.7,  14.5,  13.2,  13.1,  13.5,  18.9,
        20. ,  21. ,  24.7,  30.8,  34.9,  26.6,  25.3,  24.7,  21.2,
        19.3,  20. ,  16.6,  14.4,  19.4,  19.7,  20.5,  25. ,  23.4,
        18.9,  35.4,  24.7,  31.6,  23.3,  19.6,  18.7,  16. ,  22.2,
        25. ,  33. ,  23.5,  19.4,  22. ,  17.4,  20.9,  24.2,  21.7,
        22.8,  23.4,  24.1,  21.4,  20. ,  20.8,  21.2,  20.3,  28. ,
        23.9,  24.8,  22.9,  23.9,  26.6,  22.5,  22.2,  23.6,  28.7,
        22.6,  22. ,  22.9,  25. ,  20.6,  28.4,  21.4,  38.7,  43.8,
        33.2,  27.5,  26.5,  18.6,  19.3,  20.1,  19.5,  19.5,  20.4,
        19.8,  19.4,  21.7,  22.8,  18.8,  18.7,  18.5,  18.3,  21.2,
        19.2,  20.4,  19.3,  22. ,  20.3,  20.5,  17.3,  18.8,  21.4,
        15.7,  16.2,

Split the Boston dataset into training and test sets:

In [8]:
# Hold out part of the data so the models can be evaluated on unseen samples.
# train_test_split lives in sklearn.model_selection (added in scikit-learn
# 0.18); the old sklearn.cross_validation module was removed in 0.20, so it
# is only used as a fallback on old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
# No random_state is passed, so every run draws a different split and the
# scores reported further down will vary between runs.
X_train, X_test, y_train, y_test = train_test_split(boston.data, boston.target)

Learning a Regressor
===========

In [9]:
# Ridge is the regression model fitted in the following cells.
from sklearn.linear_model import Ridge

In [10]:
# Create the Ridge estimator with its default hyperparameters.
ridge = Ridge()

In [11]:
# Fit the Ridge model on the training split only.
ridge.fit(X_train, y_train)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, solver='auto', tol=0.001)

In [12]:
# Predict targets for the held-out test split and display the predictions.
pred_test = ridge.predict(X_test)
pred_test

array([ 15.54255867,  15.07990805,  34.01119137,  17.74422693,
        29.44660576,  32.67834015,  30.04300149,  34.46952419,
        20.98142504,  26.535641  ,   0.52775876,  23.44217677,
        16.03572595,  12.40144058,  32.55156155,  18.81551844,
        23.76838788,  15.79548077,  34.62498085,  19.80478224,
        18.07682707,  27.18774641,  27.68234217,   9.32506419,
        27.90661679,  19.42253956,  39.50039619,  10.25054104,
        23.35621039,  18.27136071,  14.62113189,  30.9616305 ,
        22.83060258,  19.99407541,  35.70380863,  17.76394298,
        18.28996867,  20.51406104,  32.11949069,  20.1932633 ,
        24.02978985,  22.20821977,  13.44066084,  22.72787111,
        20.23069111,  25.04882122,  19.81569963,  16.11541804,
        26.90342562,  24.65984656,  27.99612079,  33.73372205,
        25.06321991,  23.89624707,  25.37249022,  35.85733232,
        27.02195335,  20.53583912,  20.462544  ,  15.52779694,
        30.62672481,  21.53888717,  21.57294993,  27.47

Evaluate the result
(1) R² score:

In [13]:
# score() reports the R2 score of the Ridge model on the test split.
ridge.score(X_test, y_test)

0.75872508757364976

(2) Mean squared error (MSE), computed with `mean_squared_error` from `sklearn.metrics`:

In [14]:
# Mean squared error between the true test targets and the Ridge predictions.
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, pred_test)

20.557852174647998

Random Forest Regression
----------------------------

In [15]:
from sklearn.ensemble import RandomForestRegressor

In [16]:
# Random forest regressor with default settings; no random_state is set,
# so the fitted forest (and its scores) can differ from run to run.
rf = RandomForestRegressor()

In [17]:
# Fit the random forest on the same training split used for Ridge.
rf.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [18]:
# Score the random forest on the test split, for comparison with the Ridge score.
rf.score(X_test, y_test)

0.90473178087614325

In [19]:
# Mean squared error of the random forest's predictions on the test split.
mean_squared_error(
    y_true=y_test,
    y_pred=rf.predict(X_test),
)

8.1173377952755885