In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
import matplotlib.pyplot as plt

In [3]:
model = RandomForestRegressor(criterion='mse',
                                 max_depth=12, # глубина дерева  
                                 min_samples_leaf=30, # минимальное кол-во наблюдений в листе дерева
                                 random_state=42,
                                 n_estimators=1000  # кол-во деревьев
                                 )

In [4]:
boston = load_boston()

In [5]:
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [6]:
target = boston.target

In [7]:
feature_names = boston.feature_names

In [8]:
data = boston.data

In [9]:
x = pd.DataFrame(data, columns = feature_names)

In [10]:
x.head(10)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33
5,0.02985,0.0,2.18,0.0,0.458,6.43,58.7,6.0622,3.0,222.0,18.7,394.12,5.21
6,0.08829,12.5,7.87,0.0,0.524,6.012,66.6,5.5605,5.0,311.0,15.2,395.6,12.43
7,0.14455,12.5,7.87,0.0,0.524,6.172,96.1,5.9505,5.0,311.0,15.2,396.9,19.15
8,0.21124,12.5,7.87,0.0,0.524,5.631,100.0,6.0821,5.0,311.0,15.2,386.63,29.93
9,0.17004,12.5,7.87,0.0,0.524,6.004,85.9,6.5921,5.0,311.0,15.2,386.71,17.1


In [11]:
y = pd.DataFrame(target, columns = ['price'])

In [12]:
y.head()

Unnamed: 0,price
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2


In [13]:
from sklearn.model_selection import train_test_split

In [14]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [15]:
model.fit(x_train, y_train.values[:, 0])

RandomForestRegressor(max_depth=12, min_samples_leaf=30, n_estimators=1000,
                      random_state=42)

In [16]:
y_pred = model.predict(x_test)

In [17]:
check_test = pd.DataFrame({'y_test': y_test['price'], 'y_pred': y_pred.flatten()}, columns = ['y_test', 'y_pred'])

In [18]:
check_test.head(10)

Unnamed: 0,y_test,y_pred
173,23.6,23.304054
274,32.4,29.4619
491,13.6,15.245442
72,22.8,26.354099
452,16.1,16.63843
76,20.0,21.835057
316,17.8,17.074106
140,14.0,13.675376
471,19.6,21.17999
500,16.8,20.765784


In [19]:
check_test['error'] = check_test['y_pred'] - check_test['y_test']

In [20]:
check_test.head(10)

Unnamed: 0,y_test,y_pred,error
173,23.6,23.304054,-0.295946
274,32.4,29.4619,-2.9381
491,13.6,15.245442,1.645442
72,22.8,26.354099,3.554099
452,16.1,16.63843,0.53843
76,20.0,21.835057,1.835057
316,17.8,17.074106,-0.725894
140,14.0,13.675376,-0.324624
471,19.6,21.17999,1.57999
500,16.8,20.765784,3.965784


In [21]:
from sklearn.metrics import r2_score

In [22]:
y_train = model.predict(x_train)

In [23]:
r2_score(y_train, y_train)

1.0

In [24]:
r2_score(y_test, y_pred)

0.7207632345916866

In [25]:
# модель RandomForestRegressor работает лучше, так как R2 выше !!!

In [27]:
model.feature_importances_

array([1.01352438e-02, 0.00000000e+00, 7.62625999e-05, 0.00000000e+00,
       1.94866375e-03, 4.35823646e-01, 1.85260267e-04, 1.41463161e-02,
       1.81632473e-04, 4.35630102e-04, 2.78545165e-03, 5.93040261e-04,
       5.33688853e-01])

In [31]:
importances = list(model.feature_importances_)
print(importances)

[0.010135243828523103, 0.0, 7.62625999333238e-05, 0.0, 0.0019486637456842387, 0.4358236459431709, 0.00018526026699111536, 0.014146316122115928, 0.00018163247343686131, 0.0004356301020946705, 0.0027854516514709857, 0.0005930402608090898, 0.5336888530057697]


In [32]:
ind_max_feature_1 = importances.index(max(importances))
print(ind_max_feature_1)

12


In [33]:
max_feature_1 = x.columns[ind_max_feature_1]
print(max_feature_1)

LSTAT


In [35]:
importances[ind_max_feature_1] = 0

In [36]:
print(ind_max_feature_1)

12


In [37]:
ind_max_feature_2 = importances.index(max(importances))
print(ind_max_feature_2)

5


In [39]:
max_feature_2 = x.columns[ind_max_feature_2]
print(max_feature_2)

RM


In [40]:
print(f'Сумма всех показателей важности = {sum(importances)}, \nПризнаки имеющие максимальную важность {max_feature_1, max_feature_2}')

Сумма всех показателей важности = 0.46631114699423026, 
Признаки имеющие максимальную важность ('LSTAT', 'RM')
