In [45]:
import pandas as pd
import numpy as np
from sklearn import linear_model, tree, model_selection, preprocessing, pipeline

# Регрессия

### Подготовка данных

In [46]:
# Считываем данные из файла и перемешиваем, так как автомобили отсортированы по году выпуска
mpg = pd.read_csv("../data/auto_mpg_preprocessed.csv")
mpg = mpg.drop(columns="Unnamed: 0").sample(frac=1)

In [47]:
# Отделяем ключевой атрибут и наименования автомобилей 
X = mpg.drop(columns=['mpg','car name']).values
y = mpg['mpg'].values

In [48]:
# Объявляем критерии оценки
scores = ['r2', 'neg_mean_squared_error', 'neg_root_mean_squared_error', 'neg_mean_absolute_error']
# Объявляем скейлер
scaler = preprocessing.StandardScaler()

### Линейная регрессия

In [63]:
%%time
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html
LR = linear_model.LinearRegression()
lr_pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('LR', LR)])
# Проводим кросс-валидацию с разбиением на 3 отрезка
lr_cv = model_selection.cross_validate(lr_pipe, X, y, scoring=scores, cv=3)
# Выводим средние результаты
print('R2 score: ', lr_cv['test_r2'].mean())
print('MSE score: ', lr_cv['test_neg_mean_squared_error'].mean())
print('RMSE score: ', lr_cv['test_neg_root_mean_squared_error'].mean())
print('MAE score: ', lr_cv['test_neg_mean_absolute_error'].mean())

R2 score:  0.8094554723789109
MSE score:  -11.61747979203624
RMSE score:  -3.3931547886480495
MAE score:  -2.570631049260393
Wall time: 11 ms


У линейной регрессии нет гиперпараметров, поэтому подбор не проводится

### Регрессия дерева решений

In [50]:
# https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html
tree_regressor = tree.DecisionTreeRegressor()
tree_pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('tree', tree_regressor)])
# Проводим кросс-валидацию
tree_cv = model_selection.cross_validate(tree_pipe, X, y, scoring=scores, cv=3)

# Выводим результаты
print('R2 score: ', tree_cv['test_r2'].mean())
print('MSE score: ', tree_cv['test_neg_mean_squared_error'].mean())
print('RMSE score: ', tree_cv['test_neg_root_mean_squared_error'].mean())
print('MAE score: ', tree_cv['test_neg_mean_absolute_error'].mean())

R2 score:  0.7422165967422627
MSE score:  -15.141692699158346
RMSE score:  -3.8767423903834817
MAE score:  -2.61819534155412


#### Подбор гиперпараметров для дерева решений
Гиперпараметры дерева решений:
- criterion - функция определения качества разделения вершин
- splitter - стратегия разделения вершин
- max_depth - максимальная глубина дерева
- min_samples_leaf - минимальное кол-во объектов в листе

In [51]:
# Подбираем гиперпараметры с помощью GridSearchCV
tree_param_grid = dict(tree__criterion=['mse', 'friedman_mse', 'mae', 'poisson'], tree__splitter=['best', 'random'], tree__min_samples_leaf=[1, 3, 5, 10, 15], tree__max_depth=[5, 10, 20, 50, 100, None])

grid_tree = model_selection.GridSearchCV(tree_pipe, tree_param_grid, cv=5)
grid_tree.fit(X, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('tree', DecisionTreeRegressor())]),
             param_grid={'tree__criterion': ['mse', 'friedman_mse', 'mae',
                                             'poisson'],
                         'tree__max_depth': [5, 10, 20, 50, 100, None],
                         'tree__min_samples_leaf': [1, 3, 5, 10, 15],
                         'tree__splitter': ['best', 'random']})

In [52]:
%%time
tree_grid_cv = model_selection.cross_validate(grid_tree.best_estimator_, X, y, scoring=scores, cv=3)

# Выводим результаты
print('R2 score: ', tree_grid_cv['test_r2'].mean())
print('MSE score: ', tree_grid_cv['test_neg_mean_squared_error'].mean())
print('RMSE score: ', tree_grid_cv['test_neg_root_mean_squared_error'].mean())
print('MAE score: ', tree_grid_cv['test_neg_mean_absolute_error'].mean())

R2 score:  0.8023507820845799
MSE score:  -12.057305064938092
RMSE score:  -3.4560996936380985
MAE score:  -2.4883875979830177
Wall time: 11 ms


### Lasso регрессия

In [53]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html
lasso = linear_model.Lasso()
lasso_pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('lasso', lasso)])
# Проводим кросс-валидацию
lasso_cv = model_selection.cross_validate(lasso_pipe, X, y, scoring=scores, cv=3)

# Выводим результаты
print('R2 score: ', lasso_cv['test_r2'].mean())
print('MSE score: ', lasso_cv['test_neg_mean_squared_error'].mean())
print('RMSE score: ', lasso_cv['test_neg_root_mean_squared_error'].mean())
print('MAE score: ', lasso_cv['test_neg_mean_absolute_error'].mean())

R2 score:  0.7819015658874232
MSE score:  -13.408621675972988
RMSE score:  -3.630662168024465
MAE score:  -2.6899074265196354


#### Подбор гиперпараметров для Lasso регрессии
Гиперпараметры lasso:
alpha - штрафующий коэффициент

In [54]:
# Подбираем гиперпараметры с помощью GridSearchCV
lasso_param_grid = dict(lasso__alpha=np.arange(0.05, 1, 0.05))

grid_lasso = model_selection.GridSearchCV(lasso_pipe, lasso_param_grid, cv=5)
grid_lasso.fit(X, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('lasso', Lasso())]),
             param_grid={'lasso__alpha': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95])})

In [55]:
%%time
lasso_grid_cv = model_selection.cross_validate(grid_lasso.best_estimator_, X, y, scoring=scores, cv=3)

# Выводим результаты
print('R2 score: ', lasso_grid_cv['test_r2'].mean())
print('MSE score: ', lasso_grid_cv['test_neg_mean_squared_error'].mean())
print('RMSE score: ', lasso_grid_cv['test_neg_root_mean_squared_error'].mean())
print('MAE score: ', lasso_grid_cv['test_neg_mean_absolute_error'].mean())

R2 score:  0.8090798288094131
MSE score:  -11.664365191858975
RMSE score:  -3.3970543697650712
MAE score:  -2.5547807829731934
Wall time: 12 ms


### Гребневая регрессия

In [56]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html
ridge = linear_model.Ridge()
ridge_pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('ridge', ridge)])
# Проводим кросс-валидацию
ridge_cv = model_selection.cross_validate(ridge_pipe, X, y, scoring=scores, cv=3)

# Выводим результаты
print('R2 score: ', ridge_cv['test_r2'].mean())
print('MSE score: ', ridge_cv['test_neg_mean_squared_error'].mean())
print('RMSE score: ', ridge_cv['test_neg_root_mean_squared_error'].mean())
print('MAE score: ', ridge_cv['test_neg_mean_absolute_error'].mean())

R2 score:  0.8096588253919706
MSE score:  -11.60701653118852
RMSE score:  -3.39129140327991
MAE score:  -2.566253229917146


#### Подбор гиперпараметров для Ridge регрессии
Гиперпараметры ridge:
alpha - штрафующий коэффициент

In [57]:
# Подбираем гиперпараметры с помощью GridSearchCV
ridge_param_grid = dict(ridge__alpha=np.arange(0.05, 1, 0.05))

grid_ridge = model_selection.GridSearchCV(ridge_pipe, ridge_param_grid, cv=5)
grid_ridge.fit(X, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('ridge', Ridge())]),
             param_grid={'ridge__alpha': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95])})

In [58]:
%%time
ridge_grid_cv = model_selection.cross_validate(grid_ridge.best_estimator_, X, y, scoring=scores, cv=3)

# Выводим результаты
print('R2 score: ', ridge_grid_cv['test_r2'].mean())
print('MSE score: ', ridge_grid_cv['test_neg_mean_squared_error'].mean())
print('RMSE score: ', ridge_grid_cv['test_neg_root_mean_squared_error'].mean())
print('MAE score: ', ridge_grid_cv['test_neg_mean_absolute_error'].mean())

R2 score:  0.809656247707327
MSE score:  -11.607071120402955
RMSE score:  -3.3913169214226038
MAE score:  -2.5663579485007784
Wall time: 9.02 ms


### Elastic Net регрессия

In [59]:
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ElasticNet.html
elastic = linear_model.ElasticNet()
elastic_pipe = pipeline.Pipeline(steps=[('scaler', scaler), ('elastic', elastic)])
# Проводим кросс-валидацию
elastic_cv = model_selection.cross_validate(elastic_pipe, X, y, scoring=scores, cv=3)

# Выводим результаты
print('R2 score: ', elastic_cv['test_r2'].mean())
print('MSE score: ', elastic_cv['test_neg_mean_squared_error'].mean())
print('RMSE score: ', elastic_cv['test_neg_root_mean_squared_error'].mean())
print('MAE score: ', elastic_cv['test_neg_mean_absolute_error'].mean())

R2 score:  0.7569951188406066
MSE score:  -14.894662777770245
RMSE score:  -3.830226611790979
MAE score:  -2.8642537360595544


#### Подбор гиперпараметров для Elastic Net регрессии
Гиперпараметры Elastic Net:
alpha - штрафующий коэффициент
l1_ratio - коэффициент смешивания, т.е. l1_ratio = 0 будет означать использование только l2 регуляризации, а l1_ratio - только l1

In [60]:
# Подбираем гиперпараметры с помощью GridSearchCV
elastic_param_grid = dict(elastic__alpha=np.arange(0.05, 1, 0.05), elastic__l1_ratio=np.arange(0.1, 0.9, 0.05))

grid_elastic = model_selection.GridSearchCV(elastic_pipe, elastic_param_grid, cv=5)
grid_elastic.fit(X, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('scaler', StandardScaler()),
                                       ('elastic', ElasticNet())]),
             param_grid={'elastic__alpha': array([0.05, 0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55,
       0.6 , 0.65, 0.7 , 0.75, 0.8 , 0.85, 0.9 , 0.95]),
                         'elastic__l1_ratio': array([0.1 , 0.15, 0.2 , 0.25, 0.3 , 0.35, 0.4 , 0.45, 0.5 , 0.55, 0.6 ,
       0.65, 0.7 , 0.75, 0.8 , 0.85])})

In [61]:
%%time
elastic_grid_cv = model_selection.cross_validate(grid_elastic.best_estimator_, X, y, scoring=scores, cv=3)

# Выводим результаты
print('R2 score: ', elastic_grid_cv['test_r2'].mean())
print('MSE score: ', elastic_grid_cv['test_neg_mean_squared_error'].mean())
print('RMSE score: ', elastic_grid_cv['test_neg_root_mean_squared_error'].mean())
print('MAE score: ', elastic_grid_cv['test_neg_mean_absolute_error'].mean())

R2 score:  0.808664497322139
MSE score:  -11.681395157744467
RMSE score:  -3.400346544240081
MAE score:  -2.55690266712776
Wall time: 14.8 ms


In [62]:
# Конечные результаты моделей
print('LR R2 score: ', lr_cv['test_r2'].mean())
print('Tree Regressor R2 score: ', tree_grid_cv['test_r2'].mean())
print('Lasso R2 score: ', lasso_grid_cv['test_r2'].mean())
print('Ridge R2 score: ', ridge_grid_cv['test_r2'].mean())
print('Elastic Net R2 score: ', elastic_grid_cv['test_r2'].mean())

LR R2 score:  0.8094554723789109
Tree Regressor R2 score:  0.8023507820845799
Lasso R2 score:  0.8090798288094131
Ridge R2 score:  0.809656247707327
Elastic Net R2 score:  0.808664497322139


### Выводы
В целом, все модели после подбора параметров выдали практически одинаковые результаты<br>
Скорее всего, это связано с тем, что зависимость ключевого атрибута от большинства параметров является близкой к линейной.<br>