In [135]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline

## Загрузка и знакомство с данными

In [136]:
from sklearn.datasets import load_boston

In [137]:
X, y = load_boston(return_X_y = True)

In [138]:
load_boston()['feature_names']

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [139]:
data = pd.DataFrame(X, columns=load_boston()['feature_names'])
data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [140]:
print(load_boston()['DESCR'])

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [141]:
print(data.shape)
print(y.shape)

(506, 13)
(506,)


In [142]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null float64
TAX        506 non-null float64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      506 non-null float64
dtypes: float64(13)
memory usage: 51.5 KB


## Подготовка данных

In [156]:
from sklearn.model_selection import train_test_split

In [157]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [158]:
print(X_train.shape)
print(X_test.shape)

(404, 13)
(102, 13)


In [159]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)
X_test_sc = scaler.transform(X_test)

# Алгоритмы

In [160]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [186]:
def huber(y_true, y_pred, d=1):
    return (d * d * (np.sqrt(1 + ((y_true - y_pred)/d) ** 2) - 1)).mean()

### Линейная регрессия

In [162]:
from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [163]:
lr.fit(X_train_sc, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [164]:
pred_lr = lr.predict(X_test_sc)

In [165]:
from sklearn.metrics import mean_squared_error
mse_lr = mean_squared_error(y_test,pred_lr)
mse_lr

27.982856065347175

In [166]:
from sklearn.metrics import mean_absolute_error
mae_lr = mean_absolute_error(y_test,pred_lr)
mae_lr

3.709440443250344

In [167]:
huber_lr = huber(y_test,pred_lr)
huber_lr

3.006868237935676

In [168]:
score_lr = lr.score(X_test_sc, y_test)
score_lr

0.7547223967831923

### GradientBoostingRegressor

In [169]:
from sklearn.ensemble import GradientBoostingRegressor

In [170]:
gbr = GradientBoostingRegressor()

params = dict(loss=['ls', 'lad', 'huber', 'quantile'], 
              learning_rate=[0.001, 0.01, 0.1, 0.2, 0.3],
            n_estimators=[10,50,100, 200],
             max_depth=[2, 3, 4])
params

{'loss': ['ls', 'lad', 'huber', 'quantile'],
 'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3],
 'n_estimators': [10, 50, 100, 200],
 'max_depth': [2, 3, 4]}

In [171]:
rs_gbr = RandomizedSearchCV(gbr, params, n_iter=10, cv=10, random_state=5)

In [172]:
rs_gbr.fit(X_train_sc, y_train)

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'loss': ['ls', 'lad', 'huber', 'quantile'], 'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3], 'n_estimators': [10, 50, 100, 200], 'max_depth': [2, 3, 4]},
          pre_dispatch='2*n_jobs', random_state=5, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [173]:
print(rs_gbr.best_params_)

{'n_estimators': 50, 'max_depth': 4, 'loss': 'huber', 'learning_rate': 0.2}


In [174]:
print(rs_gbr.best_score_)

0.8465783117599719


In [175]:
rs_gbr.cv_results_['mean_test_score']

array([-0.07085747,  0.82694491,  0.83666377,  0.64241935, -1.49241446,
        0.81672448,  0.78301462,  0.05387367,  0.84657831,  0.79992745])

In [176]:
pred_gbr_RS = rs_gbr.best_estimator_.predict(X_test_sc)
mse_gbr_RS = mean_squared_error(y_test, pred_gbr_RS)
mae_gbr_RS = mean_absolute_error(y_test, pred_gbr_RS)
print('mse = ', mse_gbr_RS, ', mae = ', mae_gbr_RS)

mse =  9.269638162977976 , mae =  2.130305880019844


In [177]:
params_for_grid = dict(loss=['ls', 'lad', 'huber', 'quantile'], 
              learning_rate=[0.001, 0.01, 0.1, 0.2, 0.3])
grid_gbr = GridSearchCV(gbr, params_for_grid, cv=10)

In [178]:
grid_gbr.fit(X_train_sc, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=None,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'loss': ['ls', 'lad', 'huber', 'quantile'], 'learning_rate': [0.001, 0.01, 0.1, 0.2, 0.3]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [179]:
print(grid_gbr.best_params_)
print(grid_gbr.best_score_)

{'learning_rate': 0.1, 'loss': 'ls'}
0.868806821978534


In [185]:
pred_gbr_grid = grid_gbr.best_estimator_.predict(X_test_sc)
mse_gbr_grid = mean_squared_error(y_test, pred_gbr_grid)
mae_gbr_grid = mean_absolute_error(y_test, pred_gbr_grid)
print('mse = ', mse_gbr_grid, ', mae = ', mae_gbr_grid)

mse =  9.135036656191973 , mae =  2.3164636696112737


In [188]:
huber_gbr_grid = huber(y_test, pred_gbr_grid)
huber_gbr_grid 

1.6418091091371114

In [189]:
score_gbr = grid_gbr.best_estimator_.score(X_test_sc, y_test)
score_gbr 

0.9199288346015853

### RandomForestRegressor

In [190]:
from sklearn.ensemble import RandomForestRegressor
rfr = RandomForestRegressor()

In [191]:
params_rfr = dict(min_samples_leaf=[1,2,4,6,8],
            n_estimators=[10, 20 ,50, 100],
             max_depth=[2, 5, 8, 11, 14, 17, 20, 23, 26, 29])
params_rfr 

{'min_samples_leaf': [1, 2, 4, 6, 8],
 'n_estimators': [10, 20, 50, 100],
 'max_depth': [2, 5, 8, 11, 14, 17, 20, 23, 26, 29]}

In [192]:
rs_rfr = RandomizedSearchCV(rfr, params_rfr, n_iter=10, cv=10, random_state=5)

In [193]:
rs_rfr.fit(X_train_sc, y_train)

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'min_samples_leaf': [1, 2, 4, 6, 8], 'n_estimators': [10, 20, 50, 100], 'max_depth': [2, 5, 8, 11, 14, 17, 20, 23, 26, 29]},
          pre_dispatch='2*n_jobs', random_state=5, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [194]:
print(rs_rfr.best_params_)
print(rs_rfr.best_score_)

{'n_estimators': 50, 'min_samples_leaf': 1, 'max_depth': 20}
0.838139466361607


In [195]:
for i in zip(rs_rfr.cv_results_['mean_test_score'], rs_rfr.cv_results_['params']):
    print(i)

(0.7804393937048366, {'n_estimators': 100, 'min_samples_leaf': 8, 'max_depth': 17})
(0.7658166569109117, {'n_estimators': 20, 'min_samples_leaf': 8, 'max_depth': 11})
(0.7908438935490243, {'n_estimators': 10, 'min_samples_leaf': 4, 'max_depth': 23})
(0.8142497378072705, {'n_estimators': 20, 'min_samples_leaf': 4, 'max_depth': 23})
(0.7900691682579993, {'n_estimators': 50, 'min_samples_leaf': 6, 'max_depth': 23})
(0.8047998359535132, {'n_estimators': 100, 'min_samples_leaf': 4, 'max_depth': 23})
(0.838139466361607, {'n_estimators': 50, 'min_samples_leaf': 1, 'max_depth': 20})
(0.6789294159097917, {'n_estimators': 50, 'min_samples_leaf': 2, 'max_depth': 2})
(0.7776688702933497, {'n_estimators': 10, 'min_samples_leaf': 4, 'max_depth': 5})
(0.8003271005563415, {'n_estimators': 100, 'min_samples_leaf': 4, 'max_depth': 11})


In [196]:
params_rfr_improved = dict(min_samples_leaf=[1,4,6],
            n_estimators=[20 ,50, 100],
             max_depth=[ 5, 11, 17, 20, 23])
grid_rfr = GridSearchCV(rfr, params_rfr_improved, cv=10)

In [197]:
grid_rfr.fit(X_train_sc, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'min_samples_leaf': [1, 4, 6], 'n_estimators': [20, 50, 100], 'max_depth': [5, 11, 17, 20, 23]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [198]:
print(grid_rfr.best_params_)
print(grid_rfr.best_score_)

{'max_depth': 11, 'min_samples_leaf': 1, 'n_estimators': 20}
0.8458407984063812


In [199]:
pred_rfr_grid = grid_rfr.best_estimator_.predict(X_test_sc)
mse_rfr_grid = mean_squared_error(y_test, pred_rfr_grid)
mae_rfr_grid = mean_absolute_error(y_test, pred_rfr_grid)
huber_rfr_grid = huber(y_test, pred_rfr_grid)
print(f'mse = {mse_rfr_grid}, mae = {mae_rfr_grid}, huber = {huber_rfr_grid}')

mse = 11.061373238894408, mae = 2.392897091422077, huber = 1.7227160768161203


In [200]:
score_rfr = grid_rfr.best_estimator_.score(X_test_sc, y_test)
score_rfr

0.9030439526977962

### SVR

In [201]:
from sklearn.svm import SVR
svr = SVR()

In [202]:
params_svr = dict(C=[0.001, 0.01, 0.1, 0.2, 0.25, 0.5, 0.7, 0.9] , kernel=['linear', 'poly', 'rbf', 'sigmoid'] )
params_svr

{'C': [0.001, 0.01, 0.1, 0.2, 0.25, 0.5, 0.7, 0.9],
 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']}

In [203]:
rs_svr = RandomizedSearchCV(svr, params_svr, n_iter=10, cv=10, random_state=5)
rs_svr.fit(X_train_sc, y_train)

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'C': [0.001, 0.01, 0.1, 0.2, 0.25, 0.5, 0.7, 0.9], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
          pre_dispatch='2*n_jobs', random_state=5, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [204]:
print(rs_svr.best_params_)
print(rs_svr.best_score_)

{'kernel': 'linear', 'C': 0.9}
0.6608041139107277


In [205]:
for i in zip(rs_svr.cv_results_['mean_test_score'], rs_svr.cv_results_['params']):
    print(i)

(0.22384012999057118, {'kernel': 'rbf', 'C': 0.1})
(0.4684664590305557, {'kernel': 'poly', 'C': 0.25})
(0.5376949174397804, {'kernel': 'sigmoid', 'C': 0.9})
(0.1184467540142053, {'kernel': 'poly', 'C': 0.01})
(0.6583896974801192, {'kernel': 'linear', 'C': 0.7})
(0.6590416857148257, {'kernel': 'linear', 'C': 0.5})
(0.44991009294378354, {'kernel': 'poly', 'C': 0.2})
(0.5348445276550965, {'kernel': 'poly', 'C': 0.7})
(-0.023956467134977465, {'kernel': 'rbf', 'C': 0.001})
(0.6608041139107277, {'kernel': 'linear', 'C': 0.9})


In [206]:
params_svr_impruve = dict(C=[0.25, 0.5, 0.7, 0.8, 0.9] , kernel=['linear', 'poly', 'rbf', 'sigmoid'] )
grid_svr = GridSearchCV(svr, params_svr_impruve, cv=10)
grid_svr.fit(X_train_sc, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'C': [0.25, 0.5, 0.7, 0.8, 0.9], 'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [207]:
print(grid_svr.best_params_)
print(grid_svr.best_score_)

{'C': 0.9, 'kernel': 'linear'}
0.6608041139107277


In [208]:
pred_svr_grid = grid_svr.best_estimator_.predict(X_test_sc)
mse_svr_grid = mean_squared_error(y_test, pred_svr_grid)
mae_svr_grid = mean_absolute_error(y_test, pred_svr_grid)
huber_svr_grid = huber(y_test, pred_svr_grid)
print(f'mse = {mse_svr_grid}, mae = {mae_svr_grid}, huber = {huber_svr_grid}')
score_svr = grid_svr.best_estimator_.score(X_test_sc, y_test)
print(score_svr)

mse = 31.41906972645089, mae = 3.6912818943637173, huber = 2.9828436224997494
0.7246030176544804


###  BayesianRidge

In [209]:
from sklearn.linear_model import BayesianRidge
br = BayesianRidge()

In [210]:
params_br = dict(n_iter= [100, 200, 300, 400, 500], 
                 alpha_1=[1.e-7, 1.e-6, 1.e-5, 1.e-4],
                 alpha_2=[1.e-7, 1.e-6, 1.e-5, 1.e-4],
                 lambda_1=[1.e-7, 1.e-6, 1.e-5, 1.e-4],
                 lambda_2=[1.e-7, 1.e-6, 1.e-5, 1.e-4],
                   )
params_br

{'n_iter': [100, 200, 300, 400, 500],
 'alpha_1': [1e-07, 1e-06, 1e-05, 0.0001],
 'alpha_2': [1e-07, 1e-06, 1e-05, 0.0001],
 'lambda_1': [1e-07, 1e-06, 1e-05, 0.0001],
 'lambda_2': [1e-07, 1e-06, 1e-05, 0.0001]}

In [211]:
rs_br = RandomizedSearchCV(br, params_br, n_iter=10, cv=10, random_state=5)
rs_br.fit(X_train_sc, y_train)

RandomizedSearchCV(cv=10, error_score='raise',
          estimator=BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False),
          fit_params=None, iid=True, n_iter=10, n_jobs=1,
          param_distributions={'n_iter': [100, 200, 300, 400, 500], 'alpha_1': [1e-07, 1e-06, 1e-05, 0.0001], 'alpha_2': [1e-07, 1e-06, 1e-05, 0.0001], 'lambda_1': [1e-07, 1e-06, 1e-05, 0.0001], 'lambda_2': [1e-07, 1e-06, 1e-05, 0.0001]},
          pre_dispatch='2*n_jobs', random_state=5, refit=True,
          return_train_score='warn', scoring=None, verbose=0)

In [212]:
print(rs_br.best_params_)
print(rs_br.best_score_)

{'n_iter': 400, 'lambda_2': 1e-05, 'lambda_1': 0.0001, 'alpha_2': 1e-07, 'alpha_1': 1e-07}
0.6825755054989603


In [213]:
grid_br = GridSearchCV(br, params_br, cv=10)
grid_br.fit(X_train_sc, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=BayesianRidge(alpha_1=1e-06, alpha_2=1e-06, compute_score=False, copy_X=True,
       fit_intercept=True, lambda_1=1e-06, lambda_2=1e-06, n_iter=300,
       normalize=False, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'n_iter': [100, 200, 300, 400, 500], 'alpha_1': [1e-07, 1e-06, 1e-05, 0.0001], 'alpha_2': [1e-07, 1e-06, 1e-05, 0.0001], 'lambda_1': [1e-07, 1e-06, 1e-05, 0.0001], 'lambda_2': [1e-07, 1e-06, 1e-05, 0.0001]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [214]:
print(grid_br.best_params_)
print(grid_br.best_score_)

{'alpha_1': 1e-07, 'alpha_2': 1e-07, 'lambda_1': 0.0001, 'lambda_2': 0.0001, 'n_iter': 100}
0.6825755057455419


In [215]:
pred_br_grid = grid_br.best_estimator_.predict(X_test_sc)
mse_br_grid = mean_squared_error(y_test, pred_br_grid)
mae_br_grid = mean_absolute_error(y_test, pred_br_grid)
huber_br_grid = huber(y_test, pred_br_grid)
print(f'mse = {mse_br_grid}, mae = {mae_br_grid}, huber = {huber_br_grid}')
score_br = grid_br.best_estimator_.score(X_test_sc, y_test)
print(score_br)

mse = 28.002539391566696, mae = 3.6973801237693764, huber = 2.9866131539343646
0.754549866893206


## Result

In [216]:
print(f'LinearRegression: \n score- {score_lr}, mse- {mse_lr}, mae- {mae_lr} , huber- {huber_lr}')
print(f'GradientBoostingRegressor: \n score- {score_gbr}, mse- {mse_gbr_grid}, mae- {mae_gbr_grid} , huber- {huber_gbr_grid}')
print(f'RandomForestRegressor: \n score- {score_rfr}, mse- {mse_rfr_grid}, mae- {mae_rfr_grid} , huber- {huber_rfr_grid}')
print(f'SVR: \n score- {score_svr}, mse- {mse_svr_grid}, mae- {mae_svr_grid} , huber- {huber_svr_grid}')
print(f'BayesianRidge: \n score- {score_br}, mse- {mse_br_grid}, mae- {mae_br_grid} , huber- {huber_br_grid}')

LinearRegression: 
 score- 0.7547223967831923, mse- 27.982856065347175, mae- 3.709440443250344 , huber- 3.006868237935676
GradientBoostingRegressor: 
 score- 0.9199288346015853, mse- 9.135036656191973, mae- 2.3164636696112737 , huber- 1.6418091091371114
RandomForestRegressor: 
 score- 0.9030439526977962, mse- 11.061373238894408, mae- 2.392897091422077 , huber- 1.7227160768161203
SVR: 
 score- 0.7246030176544804, mse- 31.41906972645089, mae- 3.6912818943637173 , huber- 2.9828436224997494
BayesianRidge: 
 score- 0.754549866893206, mse- 28.002539391566696, mae- 3.6973801237693764 , huber- 2.9866131539343646
