См. материалы к занятию.

Для выполнения домашнего задания необходимо взять boston house-prices datase (sklearn.datasets.load_boston) и сделать тоже самое для задачи регрессии (попробовать разные алгоритмы, поподбирать параметры, вывести итоговое качество).

In [1]:
from sklearn.datasets import load_boston
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import ElasticNet, SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from datetime import datetime
import pandas as pd
import random

In [2]:
import warnings
warnings.filterwarnings("ignore")

Загружаем данные и нормализуем

In [3]:
data = load_boston()

In [4]:
scaler = StandardScaler()
X = scaler.fit_transform(data.data)
y = data.target
X.shape, y.shape

((506, 13), (506,))

In [5]:
random.seed(42)

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2)

In [7]:
models = [
    (ElasticNet(), dict(
        alpha=[0.2, 0.5, 1.0, 5.0],
        l1_ratio=[0., 0.2, 0.5, 0.8, 1.],
        max_iter=[100, 500, 1000]
    )),
    (SGDRegressor(max_iter=10000), dict(
        loss=["squared_loss", 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
        penalty=['none', 'l2', 'l1', 'elasticnet'],
        alpha=[0.00001, 0.0001, 0.001, 0.01, 0.1],
        eta0=[0.01, 0.1]
    )),
    (LinearSVR(max_iter=10000), dict(
        C=[0.5, 1.0, 2.0],
        loss=['epsilon_insensitive', 'squared_epsilon_insensitive']
    )),
    (KNeighborsRegressor(), dict(
        n_neighbors=[5,10],
        weights=['uniform', 'distance'],
        algorithm=['ball_tree', 'kd_tree', 'brute'],
        p=[1,2]
    )),
    (DecisionTreeRegressor(), dict(
        criterion=['mse', 'friedman_mse'],
        splitter = ['best', 'random'],
        max_depth = [None, 100, 500, 1000],
        min_samples_split = [2, 4, 8],
        min_samples_leaf = [1, 2, 3]
    )),
    (AdaBoostRegressor(), dict(
        n_estimators=[50, 100, 200, 500],
        learning_rate=[1.0, 5.0, 10.],
        loss=['linear', 'square', 'exponential']
    )),
    (BaggingRegressor(), dict(
        n_estimators=[50, 100, 200, 500],
        bootstrap=[False, True]
    )),
]

In [8]:
X_train.shape, y_train.shape

((404, 13), (404,))

In [9]:
%%time

results = []
for estimator, params in models:
    start = datetime.now()
    gs = GridSearchCV(estimator, params, cv=5, scoring='r2', n_jobs=-1)
    gs.fit(X_train, y_train)
    end = datetime.now()
    y_pred = gs.predict(X_val)
    r2, mse = r2_score(y_val, y_pred), mean_squared_error(y_val, y_pred)
    results.append((estimator.__class__.__name__, gs.best_params_, r2, mse, end - start))

Wall time: 37.6 s


In [10]:
pd.DataFrame(results, columns=['Estimator', 'Best params', 'R2', 'MSE', 'Time']).sort_values(by=['R2'], ascending=False)

Unnamed: 0,Estimator,Best params,R2,MSE,Time
5,AdaBoostRegressor,"{'learning_rate': 1.0, 'loss': 'square', 'n_es...",0.867738,11.47351,00:00:18.852000
6,BaggingRegressor,"{'bootstrap': True, 'n_estimators': 50}",0.859031,12.228897,00:00:11.323000
3,KNeighborsRegressor,"{'algorithm': 'ball_tree', 'n_neighbors': 5, '...",0.796968,17.61277,00:00:00.271000
4,DecisionTreeRegressor,"{'criterion': 'mse', 'max_depth': 100, 'min_sa...",0.761653,20.676255,00:00:00.828000
1,SGDRegressor,"{'alpha': 0.0001, 'eta0': 0.01, 'loss': 'squar...",0.669809,28.64366,00:00:03.363000
2,LinearSVR,"{'C': 0.5, 'loss': 'squared_epsilon_insensitive'}",0.66607,28.967939,00:00:00.301000
0,ElasticNet,"{'alpha': 0.2, 'l1_ratio': 0.2, 'max_iter': 100}",0.656368,29.809602,00:00:02.608000


Ансамбли деревьев по итогу дают лучший результат