In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.datasets import make_friedman1
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np

`make_regression` — генерирует случайную линейную зависимость.


`make_friedman1` — нелинейная зависимость: $y(X) = 10 \sin(\pi X_{0} X_{1}) + 20 (X_{2} - 0.5)^2 + 10 X_{3} + 5 X_{4} + \mathrm{noise} \cdot \mathcal{N}(0, 1)$, где $X_j$ — $j$-й столбец матрицы признаков `X`.

## Линейная зависимость

In [2]:
# Synthetic regression problem: 1000 samples, 10 features, noise=100.
# The seed is fixed so the same data is generated on every run.
X_data, y_data = make_regression(
    n_samples=1000,
    n_features=10,
    noise=100,
    random_state=42
)

In [3]:
# Mean 5-fold CV score (negated MSE) of a tree capped at depth 1.
cross_val_score(
    DecisionTreeRegressor(max_depth=1),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-23463.506568526143

In [4]:
# Mean 5-fold CV score (negated MSE) of a tree capped at depth 5.
cross_val_score(
    DecisionTreeRegressor(max_depth=5),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-17926.96207880936

In [5]:
# Mean 5-fold CV score (negated MSE) of a tree capped at depth 10.
cross_val_score(
    DecisionTreeRegressor(max_depth=10),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-23455.492286802135

In [6]:
# Mean 5-fold CV score (negated MSE) of a tree with min_samples_leaf=2.
cross_val_score(
    DecisionTreeRegressor(min_samples_leaf=2),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-23300.957824612145

In [7]:
# Mean 5-fold CV score (negated MSE) of a tree with min_samples_leaf=10.
cross_val_score(
    DecisionTreeRegressor(min_samples_leaf=10),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-18519.987737253417

In [8]:
# Mean 5-fold CV score (negated MSE) of a tree with min_samples_leaf=20.
cross_val_score(
    DecisionTreeRegressor(min_samples_leaf=20),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-17860.475149960232

In [9]:
# Exhaustive search over the split criterion and the minimum leaf size of a
# decision tree, scored by negated MSE on the current X_data / y_data.
# NOTE(review): `cv` is not passed, so GridSearchCV falls back to its library
# default fold count, while the cross_val_score cells use cv=5 -- confirm the
# scores are comparable before reading them side by side.
# NOTE(review): 'mse' / 'mae' are the criterion names of older scikit-learn
# releases; newer releases call them 'squared_error' / 'absolute_error'.
gs = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid={
        'criterion': ['mse', 'mae'],
        # Only the leaf size is tuned here; max_depth keeps its default.
        'min_samples_leaf': range(1, 21),
    },
    scoring='neg_mean_squared_error'
)
gs.fit(X_data, y_data)

# Single-argument parenthesised print behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(gs.best_params_)
print(gs.best_score_)

{'criterion': 'mae', 'min_samples_leaf': 17}
-16327.247147


In [10]:
# Mean 5-fold CV score (negated MSE) of ordinary least squares on the same data.
cross_val_score(
    LinearRegression(),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-9551.7114056519895

## Нелинейная зависимость

In [11]:
# Non-linear benchmark data: 1000 samples, 10 features, noise=10.
# random_state is fixed (as in the make_regression cells) so that the dataset,
# and therefore every score computed from it below, is reproducible on
# Restart & Run All instead of changing with each execution.
X_data, y_data = make_friedman1(
    n_samples=1000,
    n_features=10,
    noise=10,
    random_state=42
)

In [12]:
# Mean 5-fold CV score (negated MSE) of a tree capped at depth 1.
cross_val_score(
    DecisionTreeRegressor(max_depth=1),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-116.55416188633458

In [13]:
# Mean 5-fold CV score (negated MSE) of a tree capped at depth 5.
cross_val_score(
    DecisionTreeRegressor(max_depth=5),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-133.35898500343731

In [14]:
# Mean 5-fold CV score (negated MSE) of a tree capped at depth 10.
cross_val_score(
    DecisionTreeRegressor(max_depth=10),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-202.55259034522948

In [15]:
# Mean 5-fold CV score (negated MSE) of a tree with min_samples_leaf=2.
cross_val_score(
    DecisionTreeRegressor(min_samples_leaf=2),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-213.20340984410888

In [16]:
# Mean 5-fold CV score (negated MSE) of a tree with min_samples_leaf=10.
cross_val_score(
    DecisionTreeRegressor(min_samples_leaf=10),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-145.72558785958594

In [17]:
# Mean 5-fold CV score (negated MSE) of a tree with min_samples_leaf=20.
cross_val_score(
    DecisionTreeRegressor(min_samples_leaf=20),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-121.34725896594507

In [18]:
# Exhaustive search over the split criterion and the maximum depth of a
# decision tree, scored by negated MSE on the current X_data / y_data.
# NOTE(review): `cv` is not passed, so GridSearchCV falls back to its library
# default fold count, while the cross_val_score cells use cv=5 -- confirm the
# scores are comparable before reading them side by side.
# NOTE(review): 'mse' / 'mae' are the criterion names of older scikit-learn
# releases; newer releases call them 'squared_error' / 'absolute_error'.
gs = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid={
        'criterion': ['mse', 'mae'],
        # Only the depth is tuned here; min_samples_leaf keeps its default.
        'max_depth': range(1, 21),
    },
    scoring='neg_mean_squared_error'
)
gs.fit(X_data, y_data)

# Single-argument parenthesised print behaves the same on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(gs.best_params_)
print(gs.best_score_)

{'criterion': 'mae', 'max_depth': 1}
-116.803644753


In [19]:
# Mean 5-fold CV score (negated MSE) of ordinary least squares on the same data.
cross_val_score(
    LinearRegression(),
    X_data,
    y_data,
    scoring='neg_mean_squared_error',
    cv=5
).mean()

-108.89047594746562

## Оценка времени работы

In [20]:
# Larger synthetic problem for the timing cells: 100000 samples, 30 features,
# noise=1000, generated with a fixed seed.
X_data, y_data = make_regression(
    n_samples=100000,
    n_features=30,
    noise=1000,
    random_state=42
)

In [21]:
%%timeit
DecisionTreeRegressor(max_depth=1).fit(X_data, y_data)

1 loop, best of 3: 464 ms per loop


In [22]:
%%timeit
DecisionTreeRegressor(max_depth=2).fit(X_data, y_data)

1 loop, best of 3: 927 ms per loop


In [23]:
%%timeit
DecisionTreeRegressor(max_depth=4).fit(X_data, y_data)

1 loop, best of 3: 1.75 s per loop


In [24]:
%%timeit
DecisionTreeRegressor(max_depth=10).fit(X_data, y_data)

1 loop, best of 3: 3.96 s per loop


In [25]:
%%timeit
LinearRegression().fit(X_data, y_data)

10 loops, best of 3: 171 ms per loop
