<a href="https://colab.research.google.com/github/vlad98rus/-netology_pyda/blob/HomeWork-1/DZ_ka4estvo_modeli.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Домашнее задание по теме «Улучшение качества модели. Продвинутые алгоритмы классификации»

Для выполнения домашнего задания необходимо взять boston house-prices datase (sklearn.datasets.load_boston) и сделать тоже самое для задачи регрессии (попробовать разные алгоритмы, поподбирать параметры, вывести итоговое качество).

In [0]:
from sklearn import datasets
from matplotlib import pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso, Ridge, HuberRegressor, ElasticNet
from sklearn.tree import DecisionTreeRegressor
%matplotlib inline
import numpy as np
import pandas as pd
import random

In [0]:
# обеспечиваем воспроизводимость результата
random.seed(42)

In [8]:
boston = datasets.load_boston()
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [9]:
boston.data.shape

(506, 13)

In [10]:
df = pd.DataFrame(boston.data, columns=boston.feature_names)
df.head(5)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [0]:
X, y = boston['data'], boston['target']

In [0]:
# На валидацию откладываем 20%
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2)

In [0]:
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_valid = sc.transform(X_valid)

## Лассо регрессия

In [0]:
lasso_reg = Lasso()

In [15]:
# Для лассо только один параметр можно подобрать - альфа
lasso_params = {
    'alpha': np.logspace(-7, 2, 1000)
}
grid_lasso = GridSearchCV(lasso_reg, lasso_params, cv=10, verbose=2, n_jobs=-1)
grid_lasso.fit(X_train, y_train)

Fitting 10 folds for each of 1000 candidates, totalling 10000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 296 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 4168 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:   20.8s finished


GridSearchCV(cv=10, error_score=nan,
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': array([1.00000000e-07, 1.02096066e-07, 1.04236067e-07, 1.06420924e-07,
       1.08651577e-07, 1...
       6.74262224e+01, 6.88395207e+01, 7.02824426e+01, 7.17556092e+01,
       7.32596543e+01, 7.47952252e+01, 7.63629826e+01, 7.79636013e+01,
       7.95977700e+01, 8.12661920e+01, 8.29695852e+01, 8.47086827e+01,
       8.64842328e+01, 8.82969996e+01, 9.01477631e+01, 9.20373200e+01,
       9.39664831e+01, 9.59360829e+01, 9.79469667e+01, 1.00000000e+02])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [16]:
print(grid_lasso.best_params_)
print(grid_lasso.best_score_)
print(grid_lasso.best_estimator_)

{'alpha': 0.04546295469532399}
0.7236336045381833
Lasso(alpha=0.04546295469532399, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)


## Ридж-регрессия

In [0]:
rige_reg = Ridge()

In [18]:
# Для ридж-регрессии можно еще перебрать оптимизаторы
rige_params = {
    'alpha': np.logspace(-7, 2, 1000),
    'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}
grid_rige = GridSearchCV(rige_reg, rige_params, cv=10, verbose=2, n_jobs=-1)
grid_rige.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 10 folds for each of 6000 candidates, totalling 60000 fits


[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 4540 tasks      | elapsed:   11.5s
[Parallel(n_jobs=-1)]: Done 11036 tasks      | elapsed:   27.7s
[Parallel(n_jobs=-1)]: Done 20092 tasks      | elapsed:   50.4s
[Parallel(n_jobs=-1)]: Done 31772 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 46012 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 60000 out of 60000 | elapsed:  2.5min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': array([1.00000000e-07, 1.02096066e-07, 1.04236067e-07, 1.06420924e-07,
       1.08651577e-07, 1.10928986e-07, 1.13254132e-07, 1.15628013e-07,
       1.18051653e...
       7.32596543e+01, 7.47952252e+01, 7.63629826e+01, 7.79636013e+01,
       7.95977700e+01, 8.12661920e+01, 8.29695852e+01, 8.47086827e+01,
       8.64842328e+01, 8.82969996e+01, 9.01477631e+01, 9.20373200e+01,
       9.39664831e+01, 9.59360829e+01, 9.79469667e+01, 1.00000000e+02]),
                         'solver': ['svd', 'cholesky', 'lsqr', 'sparse_cg',
                                    'sag', 'saga']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=No

In [19]:
print(grid_rige.best_params_)
print(grid_rige.best_score_)
print(grid_rige.best_estimator_)

{'alpha': 11.092898648952227, 'solver': 'sparse_cg'}
0.7242371188047144
Ridge(alpha=11.092898648952227, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='sparse_cg', tol=0.001)


## Регрессия Хьюберта

In [0]:
huber_reg = HuberRegressor()

In [21]:
# Регрессия не шибко быстрая, поэтому сделаем параметров поменьше
huber_params = {
    'alpha': np.logspace(-7, 2, 100),
    'epsilon': np.linspace(1.35, 2, 50)
}
grid_huber = GridSearchCV(huber_reg, huber_params, cv=10, verbose=2, n_jobs=-1)
grid_huber.fit(X_train, y_train)

Fitting 10 folds for each of 5000 candidates, totalling 50000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 228 tasks      | elapsed:    2.1s
[Parallel(n_jobs=-1)]: Done 1196 tasks      | elapsed:   10.8s
[Parallel(n_jobs=-1)]: Done 2820 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 5084 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 8004 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 11564 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 15780 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 20636 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 26148 tasks      | elapsed:  3.9min
[Parallel(n_jobs=-1)]: Done 32300 tasks      | elapsed:  4.8min
[Parallel(n_jobs=-1)]: Done 39108 tasks      | elapsed:  5.8min
[Parallel(n_jobs=-1)]: Done 46556 tasks      | elapsed:  6.8min
[Parallel(n_jobs=-1)]: Done 50000 out of 50000 | elapsed:  7.2min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=HuberRegressor(alpha=0.0001, epsilon=1.35,
                                      fit_intercept=True, max_iter=100,
                                      tol=1e-05, warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': array([1.00000000e-07, 1.23284674e-07, 1.51991108e-07, 1.87381742e-07,
       2.31012970e-07, 2.84803587e-07, 3.51119173e-07, 4.32876128e-07,
       5.33669923e-07, 6.57933225e-07,...
       1.68163265, 1.69489796, 1.70816327, 1.72142857, 1.73469388,
       1.74795918, 1.76122449, 1.7744898 , 1.7877551 , 1.80102041,
       1.81428571, 1.82755102, 1.84081633, 1.85408163, 1.86734694,
       1.88061224, 1.89387755, 1.90714286, 1.92040816, 1.93367347,
       1.94693878, 1.96020408, 1.97346939, 1.98673469, 2.        ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [22]:
print(grid_huber.best_params_)
print(grid_huber.best_score_)
print(grid_huber.best_estimator_)

{'alpha': 0.43287612810830617, 'epsilon': 2.0}
0.7203924187810251
HuberRegressor(alpha=0.43287612810830617, epsilon=2.0, fit_intercept=True,
               max_iter=100, tol=1e-05, warm_start=False)


## ElasticNet регрессия
Как и регрессия Хьюберта объединяет l1 и l2 регуляризации

In [0]:
elast_reg = ElasticNet()

In [24]:
elast_params = {
    'alpha': np.logspace(-7, 2, 200),
    'l1_ratio': np.linspace(0, 1, 50)
}
grid_elast = GridSearchCV(elast_reg, elast_params, cv=10, verbose=2, n_jobs=-1)
grid_elast.fit(X_train, y_train)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


Fitting 10 folds for each of 10000 candidates, totalling 100000 fits


[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed:    1.8s
[Parallel(n_jobs=-1)]: Done 4540 tasks      | elapsed:   10.9s
[Parallel(n_jobs=-1)]: Done 11036 tasks      | elapsed:   25.5s
[Parallel(n_jobs=-1)]: Done 20092 tasks      | elapsed:   45.6s
[Parallel(n_jobs=-1)]: Done 31772 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 46012 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 62876 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done 91516 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 100000 out of 100000 | elapsed:  3.4min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True,
                                  l1_ratio=0.5, max_iter=1000, normalize=False,
                                  positive=False, precompute=False,
                                  random_state=None, selection='cyclic',
                                  tol=0.0001, warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'alpha': array([1.00000000e-07, 1.10975250e-07, 1.23155060e-07, 1.36671636e-0...
       0.51020408, 0.53061224, 0.55102041, 0.57142857, 0.59183673,
       0.6122449 , 0.63265306, 0.65306122, 0.67346939, 0.69387755,
       0.71428571, 0.73469388, 0.75510204, 0.7755102 , 0.79591837,
       0.81632653, 0.83673469, 0.85714286, 0.87755102, 0.89795918,
       0.91836735, 0.93877551, 0.95918367, 0.97959184, 1.        ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [25]:
print(grid_elast.best_params_)
print(grid_elast.best_score_)
print(grid_elast.best_estimator_)

{'alpha': 0.032929712550971546, 'l1_ratio': 0.36734693877551017}
0.7240196499231726
ElasticNet(alpha=0.032929712550971546, copy_X=True, fit_intercept=True,
           l1_ratio=0.36734693877551017, max_iter=1000, normalize=False,
           positive=False, precompute=False, random_state=None,
           selection='cyclic', tol=0.0001, warm_start=False)


## DecisionTree

In [0]:
tree_reg = DecisionTreeRegressor()

In [27]:
tree_params = {
    'max_depth': range(1, 11),
    'splitter': ['best', 'random'],
    'criterion': ['mse', 'mae', 'friedman_mse'],
    'min_samples_leaf': [1, 2, 4, 8, 16]
}
grid_tree = GridSearchCV(tree_reg, tree_params, cv=10, verbose=2, n_jobs=-1)
grid_tree.fit(X_train, y_train)

Fitting 10 folds for each of 300 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 1084 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:   12.4s finished


GridSearchCV(cv=10, error_score=nan,
             estimator=DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse',
                                             max_depth=None, max_features=None,
                                             max_leaf_nodes=None,
                                             min_impurity_decrease=0.0,
                                             min_impurity_split=None,
                                             min_samples_leaf=1,
                                             min_samples_split=2,
                                             min_weight_fraction_leaf=0.0,
                                             presort='deprecated',
                                             random_state=None,
                                             splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'criterion': ['mse', 'mae', 'friedman_mse'],
                         'max_depth': range(1, 11),
                         'min_samp

In [28]:
print(grid_tree.best_params_)
print(grid_tree.best_score_)
print(grid_tree.best_estimator_)

{'criterion': 'mse', 'max_depth': 8, 'min_samples_leaf': 2, 'splitter': 'random'}
0.802412590102431
DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=8,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=2, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='random')


#### На трейн данных пока лучше всех себя показал DecisionTree

## Сравниваем на валидационной выборке

In [0]:
estimators = {
    'lasso': grid_lasso,
    'rige': grid_rige,
    'huber': grid_huber,
    'elasticNet': grid_elast,
    'tree': grid_tree
}

In [30]:
for k in estimators:
    v = estimators[k]
    print(k, "CV R^2:", v.best_score_, "Validation R^2:", v.best_estimator_.score(X_valid, y_valid))

lasso CV R^2: 0.7236336045381833 Validation R^2: 0.6633760189790991
rige CV R^2: 0.7242371188047144 Validation R^2: 0.6671545436571897
huber CV R^2: 0.7203924187810251 Validation R^2: 0.6759996706983933
elasticNet CV R^2: 0.7240196499231726 Validation R^2: 0.666480650986705
tree CV R^2: 0.802412590102431 Validation R^2: 0.6941224054431695


#### Лучше всего показало себя DecisionTree

## Выводим ниши результаты

In [0]:
y_pred = grid_tree.best_estimator_.predict(X_valid)

In [32]:
compare = pd.DataFrame(y_pred, columns=['Предсказание'])
compare['Реальность'] = y_valid
compare['Разница'] = compare['Реальность'] - compare['Предсказание']
compare['Разница, %'] = 100*compare['Разница']/compare['Реальность']
compare.head(15)

Unnamed: 0,Предсказание,Реальность,Разница,"Разница, %"
0,19.434694,17.5,-1.934694,-11.055394
1,19.434694,15.6,-3.834694,-24.581371
2,7.2,13.8,6.6,47.826087
3,24.419444,23.9,-0.519444,-2.173408
4,9.72,16.3,6.58,40.368098
5,23.866667,27.0,3.133333,11.604938
6,19.434694,19.8,0.365306,1.84498
7,30.9,35.4,4.5,12.711864
8,21.9625,21.1,-0.8625,-4.087678
9,21.9625,24.4,2.4375,9.989754


In [33]:
(compare['Разница, %'].max(), compare['Разница, %'].min())

(56.65, -87.6802096985583)