# Importing the Boston dataset

In [1]:
import pandas as pd
import numpy as np

In [3]:
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


•	CRIM: This is the per-capita crime rate by town  
•	ZN: This is the proportion of residential land zoned for lots over 25,000 sq.ft  
•	INDUS: This is the proportion of non-retail business acres per town  
•	CHAS: The is the Charles River dummy variable (1 if the tract bounds river; 0 otherwise)    
•	NOX: This is the nitric oxide concentration (parts per 10 million)  
•	RM: This is the average number of rooms per dwelling  
•	AGE: This is the proportion of owner-occupied units built prior to 1940  
•	DIS: This is the weighted distance to five Boston employment centers    
•	RAD: This is the index of accessibility to radial highways    
•	TAX: This is the full-value property tax rate per 10,000  
•	PTRATIO: This is the pupil-teacher ratio by town  
•	B: 1000(Bk - 0.63)^2: Here, Bk is the proportion of blacks by town  
•	LSTAT: This is the % of lower status of the population  
•	MEDV: This is the median value of owner-occupied homes in $1000s  


# Preparation for fitting a regression tree

In [4]:
colnames=data.columns.values.tolist()
predictors=colnames[:13]
target=colnames[13]
X=data[predictors]
Y=data[target]

In [5]:
X.head(), Y.head()

(      crim    zn  indus  chas    nox     rm   age     dis  rad  tax  ptratio  \
 0  0.00632  18.0   2.31     0  0.538  6.575  65.2  4.0900    1  296     15.3   
 1  0.02731   0.0   7.07     0  0.469  6.421  78.9  4.9671    2  242     17.8   
 2  0.02729   0.0   7.07     0  0.469  7.185  61.1  4.9671    2  242     17.8   
 3  0.03237   0.0   2.18     0  0.458  6.998  45.8  6.0622    3  222     18.7   
 4  0.06905   0.0   2.18     0  0.458  7.147  54.2  6.0622    3  222     18.7   
 
     black  lstat  
 0  396.90   4.98  
 1  396.90   9.14  
 2  392.83   4.03  
 3  394.63   2.94  
 4  396.90   5.33  , 0    24.0
 1    21.6
 2    34.7
 3    33.4
 4    36.2
 Name: medv, dtype: float64)

# Creating and fitting a regression tree

In [6]:
from sklearn.tree import DecisionTreeRegressor
regression_tree = DecisionTreeRegressor(min_samples_split=30,min_samples_leaf=10,random_state=0)
regression_tree.fit(X,Y)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=10,
           min_samples_split=30, min_weight_fraction_leaf=0.0,
           presort=False, random_state=0, splitter='best')

# Comparing actual and the predicted values from the Regression Tree

In [7]:
reg_tree_pred=regression_tree.predict(data[predictors])
data['pred']=reg_tree_pred
cols=['pred','medv']
data[cols].head(10)

Unnamed: 0,pred,medv
0,22.84,24.0
1,22.84,21.6
2,35.247826,34.7
3,35.247826,33.4
4,35.247826,36.2
5,24.058621,28.7
6,20.811111,22.9
7,20.020833,27.1
8,20.020833,16.5
9,20.020833,18.9


# Cross Validating the Regression Tree

In [8]:
from sklearn.cross_validation import KFold
from sklearn.cross_validation import cross_val_score
crossvalidation = KFold(n=X.shape[0], n_folds=10,shuffle=True, random_state=1)
score = np.mean(cross_val_score(regression_tree, X, Y,scoring='neg_mean_squared_error', cv=crossvalidation,n_jobs=1))
score



-20.107307036443846

# Calculating the feature importance

In [9]:
regression_tree.feature_importances_

array([ 0.03421203,  0.        ,  0.00116059,  0.        ,  0.01856163,
        0.6308568 ,  0.01725115,  0.00137451,  0.        ,  0.00236983,
        0.00933325,  0.        ,  0.28488021])

# Creating and fitting a Random Forest

In [22]:
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(n_jobs=2,oob_score=True,n_estimators=10)
rf.fit(X,Y)

  warn("Some inputs do not have OOB scores. "


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
           oob_score=True, random_state=None, verbose=0, warm_start=False)

# Out of Bag prediction

In [23]:
rf.oob_prediction_

array([ 28.125     ,  22.625     ,  33.1       ,  34.625     ,
        35.72      ,  23.975     ,  18.975     ,  16.88      ,
        17.01666667,  22.8       ,  19.52      ,  20.83333333,
        21.875     ,  20.73333333,  18.44      ,  19.975     ,
        21.26666667,  20.26666667,  18.32      ,  19.16666667,
        14.125     ,  18.16      ,  18.7       ,  16.2       ,
        15.55      ,  14.96      ,  17.8       ,  14.75      ,
        18.76      ,  24.38333333,  13.83333333,  19.86666667,
        14.925     ,  13.76666667,  12.9       ,  21.45      ,
        20.67142857,  22.36666667,  20.74      ,  28.        ,
        30.5       ,  31.4       ,  24.7       ,  25.3       ,
        20.96666667,  21.68      ,  20.74      ,  19.96      ,
        18.83333333,  20.24      ,  21.92      ,  22.36666667,
        25.45      ,  21.13333333,  21.56666667,  33.875     ,
        23.14      ,  31.5       ,  23.03333333,  21.5       ,
        19.08      ,  18.94      ,  23.8       ,  26.92

# Comparing actual values with Random Forest predictions

In [24]:
data['rf_pred']=rf.oob_prediction_
cols=['rf_pred','medv']
data[cols].head()

Unnamed: 0,rf_pred,medv
0,28.125,24.0
1,22.625,21.6
2,33.1,34.7
3,34.625,33.4
4,35.72,36.2


# Calculating the squared mean error

In [25]:
data['rf_pred']=rf.oob_prediction_
data['err']=(data['rf_pred']-data['medv'])**2
sum(data['err'])/506

17.285697902018171

The mean squared error comes out to be 17.285, which is less than 20.10 obtained from the regression tree with cross-validation

# Calculating the oob score

In [26]:
rf.oob_score_

0.79524059721373308