In [None]:
### Section 6, Lecture 12

In [5]:
%matplotlib inline
import matplotlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [6]:
from sklearn import datasets, linear_model ## imports datasets from scikit-learn
from sklearn.model_selection import train_test_split

# from sklearn import model_selection
# from sklearn.tree import DecisionTreeRegressor  #decision tree regression
# from sklearn.metrics import accuracy_score

In [7]:
data = datasets.load_boston() ## loads Boston dataset from datasets library 

In [8]:
# define the data/predictors as pre-set feature names  
X = pd.DataFrame(data.data, columns=data.feature_names)

In [12]:
X.head(6)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21


In [9]:
# Put the target (housing value -- MEDV) in another DataFrame
Y = pd.DataFrame(data.target, columns=["MEDV"])

In [13]:
Y.head(6) # Continuous numerical variable

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
5,28.7


In [14]:
#split the data
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = .2, random_state=25) # Use 20% for testing.

### Random forest regression

In [15]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression

In [16]:
# instantiate decision tree model (depth = 2)
regressor = RandomForestRegressor(max_depth=2, random_state=0)

In [17]:
# Fit regression model onto these stuffs.
regressor.fit(X_train, y_train) # Training predictors, Target training values

  """Entry point for launching an IPython kernel.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=2,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [20]:
importances = regressor.feature_importances_ # RF based predictor variable importance
importances

array([0.        , 0.        , 0.        , 0.        , 0.00729892,
       0.70021159, 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.29248949])

In [22]:
indices = np.argsort(importances)[::-1] # Nobody's explained what this [::-1] means!

for f in range(X.shape[1]):
    print('%d. feature %d (%f)' % (f + 1, indices[f], importances[indices[f]]))


1. feature 5 (0.700212)
2. feature 12 (0.292489)
3. feature 4 (0.007299)
4. feature 11 (0.000000)
5. feature 10 (0.000000)
6. feature 9 (0.000000)
7. feature 8 (0.000000)
8. feature 7 (0.000000)
9. feature 6 (0.000000)
10. feature 3 (0.000000)
11. feature 2 (0.000000)
12. feature 1 (0.000000)
13. feature 0 (0.000000)


In [23]:
X.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [24]:
# predict Y values- 20% training data
Y_pred = regressor.predict(X_test)

In [25]:
Y_pred

array([15.36462224, 15.36462224, 23.01660873, 14.54563994, 16.25055599,
       23.01660873, 24.08692209, 14.54563994, 14.54563994, 41.65039063,
       14.54563994, 14.54563994, 17.13054181, 23.01660873, 25.0774782 ,
       23.01660873, 23.01660873, 14.54563994, 14.54563994, 23.01660873,
       23.01660873, 14.54563994, 23.01660873, 31.98945675, 23.01660873,
       23.01660873, 45.0534172 , 23.01660873, 23.01660873, 23.01660873,
       23.01660873, 24.08692209, 23.01660873, 23.01660873, 14.54563994,
       23.01660873, 23.01660873, 23.01660873, 30.06063136, 23.01660873,
       23.01660873, 23.01660873, 14.54563994, 16.25055599, 14.54563994,
       23.01660873, 14.54563994, 23.01660873, 31.98945675, 14.54563994,
       14.54563994, 15.36462224, 25.9094525 , 23.01660873, 23.01660873,
       14.54563994, 31.98945675, 24.08692209, 14.54563994, 24.08692209,
       14.54563994, 22.1699973 , 23.01660873, 23.01660873, 24.08692209,
       14.54563994, 23.01660873, 24.08692209, 23.01660873, 23.01

In [26]:
from sklearn.metrics import mean_squared_error

In [27]:
mean_squared_error(y_test, Y_pred) # Actual house prices (these weren't used for model building), Predicted values of house prices

33.03367144178237

In [28]:
from sklearn.metrics import r2_score

In [30]:
r2_score(y_test, Y_pred) # Tests the association between the actual prices and the prediced prices.

0.5156731310597823

In [31]:
X.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03


In [33]:
# Let's work with our predictors from index 5 and 12
# See section on indices/importances
X1=X[["RM", "LSTAT"]]
X1

Unnamed: 0,RM,LSTAT
0,6.575,4.98
1,6.421,9.14
2,7.185,4.03
3,6.998,2.94
4,7.147,5.33
5,6.430,5.21
6,6.012,12.43
7,6.172,19.15
8,5.631,29.93
9,6.004,17.10


In [34]:
Y = pd.DataFrame(data.target, columns=["MEDV"]) # Y = house price

In [35]:
X1_train, X1_test, y_train, y_test = train_test_split(X1, Y, test_size = .2, random_state=25) # Use 20% for testing.

In [38]:
Y1_pred = regressor.predict(X1_test) # ???  regressor.predict(X1_test, y_test)

ValueError: Number of features of the model must match the input. Model n_features is 13 and input n_features is 2 

In [39]:
mean_squared_error(y_test, Y1_pred)

NameError: name 'Y1_pred' is not defined

In [40]:
print(regressor.feature_importances_)

[0.         0.         0.         0.         0.00729892 0.70021159
 0.         0.         0.         0.         0.         0.
 0.29248949]
