In [1]:
## Section 6, Lecture 12

In [2]:
%matplotlib inline
from warnings import simplefilter

import numpy as np
import pandas as pd
from sklearn import datasets  # imports datasets from scikit-learn
from sklearn.model_selection import train_test_split

simplefilter(action='ignore', category=FutureWarning)

In [3]:
# `simplefilter` is already imported, and the FutureWarning filter already
# installed, in the imports cell above, so neither is repeated here.
#
# NOTE: load_boston() was deprecated in scikit-learn 1.0 and removed in 1.2;
# this cell therefore requires scikit-learn < 1.2.
data = datasets.load_boston()  # loads Boston dataset from datasets library


In [4]:
# Predictors: wrap the raw feature matrix in a DataFrame whose columns carry
# the feature names that ship with the dataset.
X = pd.DataFrame(data=data.data, columns=data.feature_names)

# Preview the first six rows.
X.head(n=6)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21


In [5]:
# Target: the housing value (MEDV), held in its own single-column DataFrame.
Y = pd.DataFrame({"MEDV": data.target})

# MEDV is a continuous numerical variable; preview the first six rows.
Y.head(n=6)

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
5,28.7


In [6]:
# Split predictors and target into train/test pieces, holding out 20% of the
# rows for testing. The fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=20,
)

# Shapes of the four pieces, in the order they were returned.
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((404, 13), (102, 13), (404, 1), (102, 1))

## Random forest regression

In [7]:
from sklearn.ensemble import RandomForestRegressor

# Instantiate a random forest regressor (not a single decision tree) with
# max_depth=2; random_state=0 fixes the seed so results are repeatable.
regressor = RandomForestRegressor(max_depth=2, random_state=0)

In [8]:
# Fit the random forest on the training split.
#
# y_train is a single-column DataFrame, i.e. shape (n_samples, 1). Passing it
# as-is makes scikit-learn emit a DataConversionWarning asking for a 1-D
# target, so it is flattened to shape (n_samples,) before fitting.
regressor.fit(X_train, y_train.to_numpy().ravel())

  regressor.fit(X_train, y_train)


RandomForestRegressor(max_depth=2, random_state=0)

In [9]:
# Importance score the fitted random forest assigns to each predictor variable
# (one value per predictor column; 13 values here).
importance = regressor.feature_importances_  # RF based predictor variable importance

# Display the raw importance array.
importance

array([0.00431886, 0.        , 0.        , 0.        , 0.00521366,
       0.46593122, 0.        , 0.00202324, 0.        , 0.        ,
       0.        , 0.        , 0.52251302])

## Python sorting with [::-1]

```python
indices = np.argsort(importance)[::-1]
```

In Python, the `[::-1]` syntax is used to reverse a sequence, such as a list, tuple, or NumPy array. 

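`np.argsort(importance)` returns the indices that would sort the `importance` array in ascending order. The `[::-1]` slice does no sorting of its own; it simply reverses that index array. As a result, `indices` contains the positions of the elements of `importance` ordered from the largest importance to the smallest. Elements with equal values (for example, several importances of exactly 0) appear in whatever order the reversed `argsort` result leaves them.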

For example, if `importance` is the array `[0.2, 0.8, 0.5, 0.3]`, then `np.argsort(importance)` would return `[0, 3, 2, 1]`, which are the indices that would sort the array in ascending order.

Adding `[::-1]` at the end would reverse this result to `[1, 2, 3, 0]`, which are the indices that would sort the array in descending order. Therefore, `indices` will be `[1, 2, 3, 0]`.

In [10]:
# argsort yields ascending order; reversing it ranks the predictors from the
# most important to the least important.
indices = np.argsort(importance)[::-1]

# One line per predictor: 1-based rank, column position, importance score.
for rank, col in enumerate(indices, start=1):
    print('%d. feature %d (%f)' % (rank, col, importance[col]))


1. feature 12 (0.522513)
2. feature 5 (0.465931)
3. feature 4 (0.005214)
4. feature 0 (0.004319)
5. feature 7 (0.002023)
6. feature 11 (0.000000)
7. feature 10 (0.000000)
8. feature 9 (0.000000)
9. feature 8 (0.000000)
10. feature 6 (0.000000)
11. feature 3 (0.000000)
12. feature 2 (0.000000)
13. feature 1 (0.000000)


In [11]:
# Show the predictor columns again so the positional feature indices printed
# in the ranking can be matched to column names by counting from 0.
X.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [12]:
# Predict the target for the held-out test rows (the 20% test split, which was
# not passed to fit).
Y_pred = regressor.predict(X_test)

# Display the predictions.
Y_pred

array([21.23192259, 21.96976697, 20.43144265, 14.99328599, 14.81346832,
       25.00359162, 14.99328599, 14.10101839, 21.2364171 , 21.96976697,
       24.2325916 , 15.47975248, 25.77326956, 21.90407318, 22.90818122,
       25.77326956, 30.57431136, 15.01986941, 15.23500956, 18.71339913,
       44.27400035, 44.27400035, 24.83404009, 30.23740694, 21.2364171 ,
       25.77326956, 21.96976697, 25.77326956, 21.28739223, 25.77326956,
       20.45523696, 20.96582113, 20.35739514, 14.10101839, 25.77326956,
       21.96976697, 42.56928924, 25.77326956, 21.96976697, 32.79133163,
       14.46013135, 14.80404281, 22.83779739, 15.75162745, 24.77369793,
       23.80327233, 24.2325916 , 25.77326956, 14.17858175, 15.20875869,
       43.68071498, 31.5689634 , 14.17858175, 41.32758192, 14.99328599,
       21.61065402, 16.48012118, 18.17902065, 14.72650095, 18.71339913,
       25.85365319, 21.34804814, 21.28739223, 23.80327233, 16.28600593,
       15.27696858, 25.00359162, 25.23684492, 20.3263679 , 14.52

In [13]:
from sklearn.metrics import mean_squared_error

# Mean squared error between the actual test-set house prices (which were not
# used for model building) and the predicted house prices.
mean_squared_error(y_true=y_test, y_pred=Y_pred)

19.663880229161606

In [14]:
from sklearn.metrics import r2_score

# R^2 score of the predicted prices against the actual test-set prices.
r2_score(y_true=y_test, y_pred=Y_pred)

0.6946860227365617

In [15]:
# Quick look at the predictor columns before selecting a subset of them.
X.head(3)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03


In [16]:
# Keep only the predictors at column positions 5 and 12 (RM and LSTAT), the
# two that ranked highest in the importance listing.
X1 = X.loc[:, ["RM", "LSTAT"]]

# Preview the reduced predictor set.
X1.head()

Unnamed: 0,RM,LSTAT
0,6.575,4.98
1,6.421,9.14
2,7.185,4.03
3,6.998,2.94
4,7.147,5.33


In [17]:
# Rebuilds Y with the same expression used for the target earlier in the
# notebook, so its contents are unchanged by this cell.
Y = pd.DataFrame(data.target, columns=["MEDV"]) # Y = house price

In [21]:
# Split the two-predictor frame with the same test_size and random_state as
# the full-feature split, again holding out 20% for testing.
# Note that y_train and y_test are reassigned by this cell.
X1_train, X1_test, y_train, y_test = train_test_split(
    X1,
    Y,
    test_size=0.2,
    random_state=20,
)

# Shapes of the four pieces, in the order they were returned.
(X1_train.shape, X1_test.shape, y_train.shape, y_test.shape)


((404, 2), (102, 2), (404, 1), (102, 1))

In [23]:
# RandomForestRegressor
# NOTE(review): everything below is disabled, so this cell does nothing.
# `regressor` was fit on the 13-column X_train, whereas X1_test has only
# 2 columns (RM, LSTAT); a model would have to be fit on X1_train before
# these calls could be re-enabled.
# Y1_pred = regressor.predict(X1_test)

# mean_squared_error(y_test, Y1_pred)

# print(regressor.feature_importances_)
