# ML
Применение линейной регрессии, деревьев решений, случайного леса для предсказания рейтинга статей (`votes`).

In [1]:
import pandas as pd
import numpy as np
import csv, json
import re
import time

# предобработка данных
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer, TransformedTargetRegressor
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler, PolynomialFeatures

# ML-модели
from sklearn.linear_model import ElasticNet, Lasso, LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# оптимизация гиперпараметров
from sklearn.model_selection import GridSearchCV

# оценивание моделей
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error as MAE

# сохранение модели
import dill as pickle

In [2]:
# Load the article dataset; the relative path is resolved against the current working directory.
df = pd.read_csv('Habr_lemmatized.csv')

In [3]:
# по времени публикации добавляем признак night/morning/afternoon/evening
# предварительно были добавлены признаки: день публикации, месяц публикации, год, день недели

def time_category(x):
    """Map an hour value to a coarse time-of-day label.

    Hours in [0, 6) are 'night', [6, 12) 'morning' and [12, 18) 'afternoon'.
    Every other value falls through to 'evening'; that includes 18-23, but also
    negative numbers and NaN, because none of the range checks match them.
    """
    if 0 <= x < 6:
        return 'night'
    if 6 <= x < 12:
        return 'morning'
    if 12 <= x < 18:
        return 'afternoon'
    return 'evening'

# Parse the publication timestamps, then label every article with the time of day
# derived from the hour of publication.
df['time_published'] = pd.to_datetime(df['time_published'])
df['time_category'] = df['time_published'].dt.hour.map(time_category)

# Continue on an independent copy of the prepared frame.
df1 = df.copy()

In [4]:
# Hold out 30% of the rows for testing; `votes` is the regression target and the
# fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df1.drop(columns='votes'),
    df1['votes'],
    test_size=0.3,
    random_state=42,
)

In [14]:
# Numeric predictors (standardised further down).
num_cols = [
    'bookmarks', 'comments_count', 'views', 'reading_time', 'karma',
    'karma_votes', 'rating', 'year', 'month', 'day',
]
# Categorical predictors (one-hot encoded further down).
cat_cols = [
    'is_corporative',
    'posttype',
    'time_category',
    'weekday',
]

In [15]:
# One-hot encoder for the categorical columns: the first level of every feature is
# dropped and the result is returned as a dense pandas DataFrame.
ohe = OneHotEncoder(drop='first', sparse_output=False)
ohe.set_output(transform='pandas')

# Categories are learned from the training split only and reused for the test split.
X_train_cat = ohe.fit_transform(X_train[cat_cols])
X_test_cat = ohe.transform(X_test[cat_cols])

# Numeric columns followed by the encoded categorical columns, aligned on the row index.
X_train_ohe = X_train[num_cols].join(X_train_cat)
X_test_ohe = X_test[num_cols].join(X_test_cat)

In [16]:
# Standardise the numeric columns; scaling statistics come from the training split only.
scaler = StandardScaler()
scaler.set_output(transform='pandas')

X_train_scaled = scaler.fit_transform(X_train_ohe[num_cols])
X_test_scaled = scaler.transform(X_test_ohe[num_cols])

# Swap the raw numeric columns for their standardised versions
# (encoded categorical columns first, scaled numeric columns last).
X_train_ohe_scaled = X_train_ohe.drop(columns=num_cols).join(X_train_scaled)
X_test_ohe_scaled = X_test_ohe.drop(columns=num_cols).join(X_test_scaled)

## Linear Regression

In [332]:
# Registry of evaluation results shared by every model cell in this notebook:
# model label -> [r2_train, mae_train, r2_test, mae_test].
# It was previously used without ever being defined, so the notebook could not run
# on a fresh kernel. NOTE: re-running this cell resets the registry.
metrics_dict = {}

# Column labels for the summary table built from `metrics_dict`: after
# `.T.reset_index()` the model labels live in 'index' and the metrics in columns 0..3.
rename_metrics_dict = {
    'index': 'model',
    0: 'r2_train',
    1: 'mae_train',
    2: 'r2_test',
    3: 'mae_test',
}

# Linear regression with default parameters.
lr = LinearRegression()

# 10-fold cross-validation on the training split; the mean fold scores are stored as
# the "train" metrics.
cv_results = cross_validate(
    lr,
    X_train_ohe_scaled,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=10,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Fit on the whole training split.
lr.fit(X_train_ohe_scaled, y_train)

# Predict the held-out test split.
preds_test = lr.predict(X_test_ohe_scaled)

# Test-set metrics.
r2_test = r2_score(y_test, preds_test)
mae_test = MAE(y_test, preds_test)

# Record the metrics for this model.
metrics_dict['lr_default'] = [r2_train, mae_train, r2_test, mae_test]

# Display the comparison table collected so far.
pd.DataFrame(metrics_dict).T.reset_index().rename(columns=rename_metrics_dict)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.402952,10.918145,0.405505,10.89326


In [333]:
# Lasso regression with default parameters.
lasso = Lasso()

# 10-fold cross-validation on the training split; mean fold scores are the "train" metrics.
cv_results = cross_validate(
    lasso,
    X_train_ohe_scaled,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=10,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Fit on the whole training split, then predict the held-out test split.
preds_test = lasso.fit(X_train_ohe_scaled, y_train).predict(X_test_ohe_scaled)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['lasso_default'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.402952,10.918145,0.405505,10.89326
1,lasso_default,0.38177,10.993096,0.38442,10.984352


In [334]:
# Candidate regularisation strengths for Lasso: 25 log-spaced values from 1e-4 to 1e4.
lasso_param_grid = dict(alpha=np.logspace(-4, 4, num=25))

# Exhaustive search over alpha, scored by R^2 with 3-fold cross-validation; the search
# object is fitted on the training split.
lasso_grid = GridSearchCV(
    lasso,
    param_grid=lasso_param_grid,
    scoring='r2',
    cv=3,
    verbose=1,
).fit(X_train_ohe_scaled, y_train)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


In [335]:
# 3-fold cross-validation of the tuned Lasso on the training split; mean fold scores
# are stored as the "train" metrics.
cv_results = cross_validate(
    lasso_grid.best_estimator_,
    X_train_ohe_scaled,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Predict the held-out test split with the best estimator found by the search.
preds_test = lasso_grid.best_estimator_.predict(X_test_ohe_scaled)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['lasso_gs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.402952,10.918145,0.405505,10.89326
1,lasso_default,0.38177,10.993096,0.38442,10.984352
2,lasso_gs,0.40293,10.913044,0.405497,10.884462


In [336]:
# Coefficients of the tuned Lasso model, labelled with the feature names seen during
# fitting and ordered from the largest to the smallest value.
pd.DataFrame(
    {'lasso_coef': lasso_grid.best_estimator_.coef_},
    index=lasso_grid.best_estimator_.feature_names_in_,
).sort_values(by='lasso_coef', ascending=False)


Unnamed: 0,lasso_coef
comments_count,10.813093
bookmarks,8.111799
karma,7.208136
is_corporative_1,4.749254
year,1.139036
weekday_6,0.906093
reading_time,0.865572
weekday_5,0.355248
time_category_night,0.312793
views,0.238392


In [337]:
# ElasticNet estimator with the iteration cap set to 5000.
enet = ElasticNet(max_iter=5000)

# Search space: 25 log-spaced alphas from 1e-4 to 1e4 crossed with L1 ratios
# generated in steps of 0.1 starting at 0.1 (upper bound 1.0 excluded).
enet_param_grid = dict(
    alpha=np.logspace(-4, 4, num=25),
    l1_ratio=np.arange(0.1, 1.0, 0.1),
)

In [338]:
# Grid search over the ElasticNet space, scored by R^2 with 3-fold cross-validation
# and fitted on the training split.
enet_grid = GridSearchCV(
    enet,
    param_grid=enet_param_grid,
    scoring='r2',
    cv=3,
).fit(X_train_ohe_scaled, y_train)

In [339]:
# 3-fold cross-validation of the tuned ElasticNet on the training split; mean fold
# scores are stored as the "train" metrics.
cv_results = cross_validate(
    enet_grid.best_estimator_,
    X_train_ohe_scaled,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Predict the held-out test split with the best estimator found by the search.
preds_test = enet_grid.best_estimator_.predict(X_test_ohe_scaled)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['enet_gs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.402952,10.918145,0.405505,10.89326
1,lasso_default,0.38177,10.993096,0.38442,10.984352
2,lasso_gs,0.40293,10.913044,0.405497,10.884462
3,enet_gs,0.402937,10.911618,0.405521,10.883339


In [340]:
# Show the hyperparameter combination selected by the ElasticNet grid search.
enet_grid.best_params_

{'alpha': 0.01, 'l1_ratio': 0.7000000000000001}

In [341]:
# Ridge regression estimator with default parameters.
ridge = Ridge()

# Candidate regularisation strengths for Ridge: 25 log-spaced values from 1e-4 to 1e4.
ridge_param_grid = dict(alpha=np.logspace(-4, 4, num=25))

In [342]:
# Grid search over the Ridge alphas, scored by R^2 with 3-fold cross-validation and
# fitted on the training split.
ridge_grid = GridSearchCV(
    ridge,
    param_grid=ridge_param_grid,
    cv=3,
    scoring='r2',
).fit(X_train_ohe_scaled, y_train)

In [343]:
# 3-fold cross-validation of the tuned Ridge on the training split; mean fold scores
# are stored as the "train" metrics.
cv_results = cross_validate(
    ridge_grid.best_estimator_,
    X_train_ohe_scaled,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Predict the held-out test split with the best estimator found by the search.
preds_test = ridge_grid.best_estimator_.predict(X_test_ohe_scaled)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['ridge_gs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)
    

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.402952,10.918145,0.405505,10.89326
1,lasso_default,0.38177,10.993096,0.38442,10.984352
2,lasso_gs,0.40293,10.913044,0.405497,10.884462
3,enet_gs,0.402937,10.911618,0.405521,10.883339
4,ridge_gs,0.40293,10.913857,0.405534,10.888201


In [344]:
# Show the regularisation strength selected by the Ridge grid search.
ridge_grid.best_params_

{'alpha': 464.1588833612773}

**Добавлю в модель информацию о хабах.**

In [22]:
from sklearn.preprocessing import MultiLabelBinarizer

# Replace missing hub strings with the 'None' placeholder so they can be split below.
# Only the `hubs` column is filled: a blanket fillna would write the string 'None'
# into every column that has gaps (numeric ones included). New frames are bound
# instead of mutating the split outputs in place, which keeps the cell re-runnable.
X_train = X_train.fillna({'hubs': 'None'})
X_test = X_test.fillna({'hubs': 'None'})

def hubs_lists(df):
    """Split the comma-separated `hubs` string of every row into a list of hub names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `hubs` column holding comma-separated hub names.

    Returns
    -------
    list[list[str]]
        One list of hub names per row, in row order. A missing value yields
        ``['None']``, the same placeholder the notebook uses for absent hubs.
    """
    # Iterate over the values rather than looking rows up by index label: label lookup
    # returns a Series (not a string) when the index contains duplicate labels.
    return [['None'] if pd.isna(hubs) else hubs.split(",") for hubs in df['hubs']]

   
# Multi-hot encoder for the per-article hub lists; output is kept as a sparse matrix.
mlb = MultiLabelBinarizer(sparse_output=True)

# The hub vocabulary is learned from the training split only and reused for the test split.
hubs_train = mlb.fit_transform(hubs_lists(X_train))
hubs_test = mlb.transform(hubs_lists(X_test))




In [23]:
# Densify the sparse hub indicators and append them, one column per hub, to the
# encoded-and-scaled feature tables. The hub frames reuse the feature tables' index
# so the join lines the rows up.
X_train_ohe_scaled1 = X_train_ohe_scaled.join(
    pd.DataFrame(
        hubs_train.toarray(),
        index=X_train_ohe_scaled.index,
        columns=mlb.classes_,
    )
)
X_test_ohe_scaled1 = X_test_ohe_scaled.join(
    pd.DataFrame(
        hubs_test.toarray(),
        index=X_test_ohe_scaled.index,
        columns=mlb.classes_,
    )
)

In [348]:
# Linear regression with default parameters, now on the feature set that includes hubs.
lr = LinearRegression()

# 3-fold cross-validation on the training split; mean fold scores are the "train" metrics.
# NOTE(review): the recorded CV R^2 for this model is about -4.4e17 while its test R^2
# is about 0.43, which suggests unregularised least squares is unstable on some folds
# with the many hub indicator columns -- treat these CV figures with caution.
cv_results = cross_validate(
    lr,
    X_train_ohe_scaled1,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Fit on the whole training split, then predict the held-out test split.
preds_test = lr.fit(X_train_ohe_scaled1, y_train).predict(X_test_ohe_scaled1)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['lr_default_with_hubs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119


In [352]:
# Lasso regression with default parameters on the feature set that includes hubs.
lasso = Lasso()

# 3-fold cross-validation on the training split; mean fold scores are the "train" metrics.
cv_results = cross_validate(
    lasso,
    X_train_ohe_scaled1,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Fit on the whole training split, then predict the held-out test split.
preds_test = lasso.fit(X_train_ohe_scaled1, y_train).predict(X_test_ohe_scaled1)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['lasso_default_with_hubs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352


In [371]:
# Wider alpha range for the hub-aware Lasso: 25 log-spaced values from 1e-6 to 1e6.
lasso_param_grid = dict(alpha=np.logspace(-6, 6, num=25))

# Grid search scored by R^2 with 3-fold cross-validation, fitted on the training
# split that includes the hub indicators.
lasso_grid = GridSearchCV(
    lasso,
    param_grid=lasso_param_grid,
    scoring='r2',
    cv=3,
    verbose=1,
).fit(X_train_ohe_scaled1, y_train)

# 3-fold cross-validation of the tuned model; mean fold scores are the "train" metrics.
cv_results = cross_validate(
    lasso_grid.best_estimator_,
    X_train_ohe_scaled1,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Predict the held-out test split with the best estimator found by the search.
preds_test = lasso_grid.best_estimator_.predict(X_test_ohe_scaled1)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['lasso_gs_with_hubs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Fitting 3 folds for each of 25 candidates, totalling 75 fits


Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487


In [372]:
# Show the Lasso estimator selected by the hub-aware grid search.
lasso_grid.best_estimator_

In [365]:
# Feature names in the exact column order of the matrix the hub-aware models were
# fitted on (base features followed by one column per hub). Taking them straight from
# that matrix avoids dropping a 'hubs_vec' column, which is never created in this
# notebook and made `Index.drop` raise a KeyError.
column_names = list(X_train_ohe_scaled1.columns)

In [378]:
# Coefficients of the tuned hub-aware Lasso model for all features, sorted from the
# largest to the smallest value. The labels come from the estimator's own
# `feature_names_in_` (as in the earlier coefficient table), so they match the
# coefficient order without relying on a separately assembled name list.
lassogridsearch_coef = pd.DataFrame(
    lasso_grid.best_estimator_.coef_,
    index=lasso_grid.best_estimator_.feature_names_in_,
    columns=['lasso_coef'],
).sort_values(by='lasso_coef', ascending=False)

# Keep only the features whose coefficient magnitude exceeds 5.
lassogridsearch_coef[lassogridsearch_coef['lasso_coef'].abs() > 5]

Unnamed: 0,lasso_coef
brainfuck,16.041631
reverse-engineering,15.655071
bughunters,15.304084
tarantool,13.852256
circuit-design,11.211446
comments_count,10.517816
yii,9.875466
crazydev,9.805693
rust,9.359811
bookmarks,8.419824


In [388]:
# Show the tuned Lasso estimator (and with it the selected alpha).
lasso_grid.best_estimator_

Lasso(alpha=0.001)

In [379]:
# Repeat the ElasticNet grid search on the feature set that includes hubs
# (R^2 scoring, 3-fold cross-validation).
enet_grid = GridSearchCV(
    enet,
    param_grid=enet_param_grid,
    scoring='r2',
    cv=3,
).fit(X_train_ohe_scaled1, y_train)

In [380]:
# 3-fold cross-validation of the tuned hub-aware ElasticNet on the training split;
# mean fold scores are stored as the "train" metrics.
cv_results = cross_validate(
    enet_grid.best_estimator_,
    X_train_ohe_scaled1,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Predict the held-out test split with the best estimator found by the search.
preds_test = enet_grid.best_estimator_.predict(X_test_ohe_scaled1)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['enet_gs_with_hubs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407


In [386]:
# Show the ElasticNet estimator selected by the hub-aware grid search.
enet_grid.best_estimator_

(alpha=0.00046415888336127773, l1_ratio=0.1, max_iter=5000)

In [381]:
# Repeat the Ridge grid search on the feature set that includes hubs
# (R^2 scoring, 3-fold cross-validation).
ridge_grid = GridSearchCV(
    ridge,
    param_grid=ridge_param_grid,
    cv=3,
    scoring='r2',
).fit(X_train_ohe_scaled1, y_train)

# 3-fold cross-validation of the tuned model; mean fold scores are the "train" metrics.
cv_results = cross_validate(
    ridge_grid.best_estimator_,
    X_train_ohe_scaled1,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Predict the held-out test split with the best estimator found by the search.
preds_test = ridge_grid.best_estimator_.predict(X_test_ohe_scaled1)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['ridge_gs_with_hubs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)
    

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


In [382]:
# Show the Ridge estimator selected by the hub-aware grid search.
ridge_grid.best_estimator_

Ridge(alpha=46.41588833612773)

## Decision Tree Regressor

In [439]:
# Decision tree regressor with default parameters and a fixed seed.
tree = DecisionTreeRegressor(random_state=42)

# 3-fold cross-validation on the training split; mean fold scores are the "train" metrics.
cv_results = cross_validate(
    tree,
    X_train_ohe_scaled,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Fit on the whole training split, then predict the held-out test split.
preds_test = tree.fit(X_train_ohe_scaled, y_train).predict(X_test_ohe_scaled)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['tree_default'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


In [440]:
# Decision tree regressor with default parameters and a fixed seed, trained on the
# feature set that includes hubs.
tree = DecisionTreeRegressor(random_state=42)

# 3-fold cross-validation on the training split; mean fold scores are the "train" metrics.
cv_results = cross_validate(
    tree,
    X_train_ohe_scaled1,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Fit on the whole training split, then predict the held-out test split.
preds_test = tree.fit(X_train_ohe_scaled1, y_train).predict(X_test_ohe_scaled1)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['tree_default_with_hubs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


In [456]:
# Hyperparameter grid for the decision-tree grid search.
parameters = {
    "splitter": ["best", "random"],
    "max_depth": [1, 4, 6, 8, 10, 12],
    "min_samples_leaf": list(range(1, 11)),
    "min_weight_fraction_leaf": [0.0, 0.25, 0.5],
    "max_features": ["log2", "sqrt"],
}

In [453]:
# Grid search over the decision-tree grid, scored by R^2 with 3-fold cross-validation
# and fitted on the training split.
tree_grid = GridSearchCV(
    tree,
    param_grid=parameters,
    cv=3,
    scoring='r2',
).fit(X_train_ohe_scaled, y_train)

# 3-fold cross-validation of the tuned tree; mean fold scores are the "train" metrics.
cv_results = cross_validate(
    tree_grid.best_estimator_,
    X_train_ohe_scaled,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Predict the held-out test split with the best estimator found by the search.
preds_test = tree_grid.best_estimator_.predict(X_test_ohe_scaled)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['tree_gs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


In [457]:
# Show the decision tree selected by the grid search.
tree_grid.best_estimator_

DecisionTreeRegressor(max_depth=12, max_features='log2', min_samples_leaf=7,
                      random_state=42)

In [458]:
# Repeat the decision-tree grid search on the feature set that includes hubs
# (R^2 scoring, 3-fold cross-validation).
tree_grid = GridSearchCV(
    tree,
    param_grid=parameters,
    cv=3,
    scoring='r2',
).fit(X_train_ohe_scaled1, y_train)

# 3-fold cross-validation of the tuned tree; mean fold scores are the "train" metrics.
cv_results = cross_validate(
    tree_grid.best_estimator_,
    X_train_ohe_scaled1,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Predict the held-out test split with the best estimator found by the search.
preds_test = tree_grid.best_estimator_.predict(X_test_ohe_scaled1)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['tree_gs_with_hubs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


In [462]:
# Show the decision tree selected by the hub-aware grid search.
tree_grid.best_estimator_

DecisionTreeRegressor(max_depth=10, max_features='sqrt', min_samples_leaf=5,
                      random_state=42)

**Добавление логарифмированных признаков в модель**

In [25]:
# Columns that additionally receive a log-transformed copy.
log_cols = ['bookmarks', 'comments_count', 'views']
# Remaining numeric columns (standardised only).
scale_cols = ['reading_time', 'rating','year','karma','karma_votes','month','day']
# Categorical columns (one-hot encoded).
cat_cols = ['is_corporative', 'posttype', 'time_category', 'weekday']


def add_log_features(frame):
    """Return log(1 + x) of every column, with each column renamed to ``log_<name>``.

    ColumnTransformer runs its transformers side by side on the ORIGINAL columns, not
    one after another. The 'scale' step below therefore also emits the `log_cols`
    columns, and with `verbose_feature_names_out=False` the two outputs would share
    identical names (recent scikit-learn raises on that; older releases silently
    return duplicate column labels). The prefix keeps the names unique.

    `frame` is expected to be the pandas DataFrame slice that ColumnTransformer passes
    for `log_cols`.
    """
    return np.log1p(frame).add_prefix('log_')


# Preprocessing: log copies of the skew-prone counters, standardised numeric columns,
# one-hot encoded categoricals; any other column is passed through unchanged.
# A named function is used instead of a lambda so the fitted pipeline is easier to pickle.
preprocessing = ColumnTransformer([
    ('log', FunctionTransformer(add_log_features), log_cols),
    ('scale', StandardScaler(), log_cols+scale_cols),
    ('ohe', OneHotEncoder(sparse_output=False), cat_cols)
], remainder='passthrough', verbose_feature_names_out=False).set_output(transform='pandas')




In [26]:
# End-to-end model: the preprocessing ColumnTransformer followed by a default
# linear regression.
log_features_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('estimator', LinearRegression()),
])

In [120]:

# 3-fold cross-validation of the log-feature pipeline on the raw training columns;
# mean fold scores are stored as the "train" metrics.
cv_results = cross_validate(
    log_features_pipeline,
    X_train[log_cols+scale_cols+cat_cols],
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Fit on the whole training split, then predict the held-out test split.
preds_test = (
    log_features_pipeline
    .fit(X_train[log_cols+scale_cols+cat_cols], y_train)
    .predict(X_test[log_cols+scale_cols+cat_cols])
)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['lr_default_log_features'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


In [121]:
# Raw model columns plus one dense indicator column per hub; the hub frames reuse the
# split's index so the join lines the rows up.
X_train1 = X_train[log_cols+scale_cols+cat_cols].join(
    pd.DataFrame(
        hubs_train.toarray(),
        index=X_train.index,
        columns=mlb.classes_,
    )
)
X_test1 = X_test[log_cols+scale_cols+cat_cols].join(
    pd.DataFrame(
        hubs_test.toarray(),
        index=X_test.index,
        columns=mlb.classes_,
    )
)

In [122]:

# 3-fold cross-validation of the log-feature pipeline on the hub-augmented training
# data; mean fold scores are stored as the "train" metrics.
cv_results = cross_validate(
    log_features_pipeline,
    X_train1,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Fit on the whole training split, then predict the held-out test split.
preds_test = log_features_pipeline.fit(X_train1, y_train).predict(X_test1)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics and display the comparison table collected so far.
metrics_dict['lr_default_log_features_with_hubs'] = [r2_train, mae_train, r2_test, mae_test]
(
    pd.DataFrame(metrics_dict)
    .T
    .reset_index()
    .rename(columns=rename_metrics_dict)
)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


## Random Forest

In [134]:
# Random Forest with 100 trees on the base feature set.
# The seed is fixed (as for the other tree and forest models in this notebook) so the
# recorded metrics can be reproduced on a re-run.
random_forest = RandomForestRegressor(n_estimators=100, random_state=42)

# 3-fold cross-validation on the training split; mean fold scores are the "train" metrics.
cv_results = cross_validate(random_forest, X_train_ohe_scaled, y_train, scoring=['r2', 'neg_mean_absolute_error'], cv=3)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Fit on the whole training split.
random_forest.fit(X_train_ohe_scaled, y_train)

# Predict the held-out test split.
preds_test = random_forest.predict(X_test_ohe_scaled)

# Test-set metrics.
r2_test = r2_score(y_test, preds_test)
mae_test = MAE(y_test, preds_test)

# Record the metrics for this model.
metrics_dict['rf_default'] = [r2_train, mae_train, r2_test, mae_test]

# Display the comparison table collected so far.
pd.DataFrame(metrics_dict).T.reset_index().rename(columns=rename_metrics_dict)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


In [170]:
# Random Forest with default parameters and a fixed seed, trained on the feature set
# that includes hubs.
random_forest = RandomForestRegressor(random_state=42)

# 3-fold cross-validation on the training split; mean fold scores are the "train" metrics.
cv_results = cross_validate(
    random_forest,
    X_train_ohe_scaled1,
    y_train,
    scoring=['r2', 'neg_mean_absolute_error'],
    cv=3,
)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Fit on the whole training split, then predict the held-out test split.
preds_test = random_forest.fit(X_train_ohe_scaled1, y_train).predict(X_test_ohe_scaled1)

# Test-set metrics.
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# Record the metrics for this model (the table is not displayed in this cell).
metrics_dict['rf_default_with_hubs'] = [r2_train, mae_train, r2_test, mae_test]

In [172]:
# Inspect the full hyperparameter set of the current random forest.
random_forest.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'criterion': 'squared_error',
 'max_depth': None,
 'max_features': 1.0,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}

In [None]:
# Random Forest tuned with RandomizedSearchCV.
# (RandomizedSearchCV is already imported in the imports cell at the top of the
# notebook, so it is not re-imported here.)

# Candidate values for each hyperparameter.
n_estimators = [int(x) for x in np.linspace(start = 50, stop = 200, num = 4)]
max_features = ['log2', 'sqrt']
max_depth = [int(x) for x in np.linspace(start = 1, stop = 15, num = 15)]
min_samples_split = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
min_samples_leaf = [int(x) for x in np.linspace(start = 2, stop = 50, num = 10)]
bootstrap = [True, False]

param_dist = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

# Randomised search: 100 sampled configurations, 3-fold cross-validation, all cores.
# Both the sampler and the forest itself are seeded; seeding only the search leaves the
# individual forests random and the reported scores non-reproducible. Scoring is stated
# explicitly to match the grid searches above (R^2 is also the regressor default).
rf_randsearch = RandomizedSearchCV(RandomForestRegressor(random_state=42),
                        param_dist,
                        n_iter = 100,
                        scoring = 'r2',
                        cv = 3,
                        verbose = 1,
                        n_jobs=-1,
                        random_state=42).fit(X_train_ohe_scaled, y_train)

# 3-fold cross-validation of the best forest; mean fold scores are the "train" metrics.
cv_results = cross_validate(rf_randsearch.best_estimator_, X_train_ohe_scaled, y_train, scoring=['r2', 'neg_mean_absolute_error'], cv=3)
r2_train = cv_results['test_r2'].mean()
# The scorer reports negated MAE, so keep the magnitude of the mean.
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# Predict the held-out test split with the best estimator found by the search.
preds_test = rf_randsearch.best_estimator_.predict(X_test_ohe_scaled)

# Test-set metrics.
r2_test = r2_score(y_test, preds_test)
mae_test = MAE(y_test, preds_test)

# Record the metrics for this model.
metrics_dict['rf_randsearch'] = [r2_train, mae_train, r2_test, mae_test]

# Display the comparison table collected so far.
pd.DataFrame(metrics_dict).T.reset_index().rename(columns=rename_metrics_dict)

In [154]:
# Rank the sampled configurations by CV score and keep only the
# hyperparameter columns plus the mean score and rank.
rs_df = (
    pd.DataFrame(rf_randsearch.cv_results_)
    .sort_values('rank_test_score')
    .reset_index(drop=True)
    .drop(
        columns=[
            'mean_fit_time',
            'std_fit_time',
            'mean_score_time',
            'std_score_time',
            'params',
            'split0_test_score',
            'split1_test_score',
            'split2_test_score',
            'std_test_score',
        ]
    )
)
rs_df.head(10)

Unnamed: 0,param_n_estimators,param_min_samples_split,param_min_samples_leaf,param_max_features,param_max_depth,param_bootstrap,mean_test_score,rank_test_score
0,200,7,7,log2,15,False,0.596482,1
1,150,18,7,sqrt,15,False,0.59356,2
2,200,28,2,sqrt,14,False,0.591814,3
3,100,7,2,sqrt,14,True,0.591651,4
4,100,18,7,sqrt,15,True,0.586453,5
5,100,18,18,log2,15,False,0.582368,6
6,200,2,7,sqrt,12,False,0.581474,7
7,50,7,12,log2,15,True,0.580422,8
8,150,50,12,sqrt,13,False,0.578647,9
9,200,39,7,sqrt,12,False,0.578049,10


In [None]:
# Random forest tuned with an exhaustive GridSearchCV.

parameters = { 
    'n_estimators': [50, 100, 150, 200],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [None, 4,6,8,10,12,14,16],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# hyperparameter search.
# GridSearchCV has no `random_state` parameter (the grid is enumerated exhaustively),
# so passing one raises TypeError; determinism comes from the seeded estimator.
# NOTE(review): the original fitted on X_train_ohe_scaled1 but predicted on X_test_ohe_scaled.
# Train and test now use the same feature set; the non-"1" matrices were chosen because the
# metrics key 'rf_gs' carries no '_with_hubs' suffix — confirm this is the intended variant.
rf_grid = GridSearchCV(RandomForestRegressor(random_state=42), param_grid=parameters, cv=3, scoring='r2').fit(X_train_ohe_scaled, y_train)

# 3-fold cross-validated metrics of the best configuration on the training split
cv_results = cross_validate(rf_grid.best_estimator_, X_train_ohe_scaled, y_train, scoring=['r2', 'neg_mean_absolute_error'], cv=3)
r2_train = cv_results['test_r2'].mean()
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())

# predictions on the hold-out split, using the same feature set the model was fitted on
preds_test = rf_grid.best_estimator_.predict(X_test_ohe_scaled)

# hold-out metrics
r2_test = r2_score(y_test, preds_test)
mae_test = MAE(y_test, preds_test)

# store the metrics under this experiment's name
metrics_dict['rf_gs'] = [r2_train, mae_train, r2_test, mae_test]

# display the accumulated metrics table
pd.DataFrame(metrics_dict).T.reset_index().rename(columns=rename_metrics_dict)

In [158]:
# show the accumulated metrics, one row per model key in metrics_dict
# (column labels come from rename_metrics_dict, defined outside this section)
pd.DataFrame(metrics_dict).T.reset_index().rename(columns=rename_metrics_dict)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


In [161]:
# Random forest tuned with GridSearchCV over a reduced grid.
parameters = dict(
    n_estimators=[100, 125],
    max_features=['sqrt', 'log2'],
    max_depth=[4, 6, 8, 10, 12, 14],
)

# exhaustive search over the grid, 3-fold CV, scored by R^2
rf_grid = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=parameters,
    scoring='r2',
    cv=3,
)
rf_grid.fit(X_train_ohe_scaled, y_train)

# 3-fold cross-validated metrics of the best configuration on the training split
cv_results = cross_validate(
    rf_grid.best_estimator_,
    X_train_ohe_scaled,
    y_train,
    cv=3,
    scoring=['r2', 'neg_mean_absolute_error'],
)
r2_train = np.mean(cv_results['test_r2'])
mae_train = abs(np.mean(cv_results['test_neg_mean_absolute_error']))

# hold-out predictions and metrics
preds_test = rf_grid.best_estimator_.predict(X_test_ohe_scaled)
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# record the four metrics under this experiment's name
metrics_dict['rf_gs_1'] = [r2_train, mae_train, r2_test, mae_test]

# display the accumulated metrics table
pd.DataFrame(metrics_dict).T.reset_index().rename(columns=rename_metrics_dict)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


In [None]:
# Random forest tuned with GridSearchCV on the "...1" feature matrices
# (presumably the variant with hub features, judging by the metrics key — confirm).
parameters = dict(
    n_estimators=[100, 150],
    max_features=['sqrt', 'log2'],
    max_depth=[4, 8, 10, 14, 16, 18],
)

# exhaustive search over the grid, 3-fold CV, scored by R^2
rf_grid1 = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid=parameters,
    scoring='r2',
    cv=3,
)
rf_grid1.fit(X_train_ohe_scaled1, y_train)

# 3-fold cross-validated metrics of the best configuration on the training split
cv_results = cross_validate(
    rf_grid1.best_estimator_,
    X_train_ohe_scaled1,
    y_train,
    cv=3,
    scoring=['r2', 'neg_mean_absolute_error'],
)
r2_train = np.mean(cv_results['test_r2'])
mae_train = abs(np.mean(cv_results['test_neg_mean_absolute_error']))

# hold-out predictions and metrics
preds_test = rf_grid1.best_estimator_.predict(X_test_ohe_scaled1)
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# record the four metrics under this experiment's name
metrics_dict['rf_gs_with_hubs'] = [r2_train, mae_train, r2_test, mae_test]

# display the accumulated metrics table
pd.DataFrame(metrics_dict).T.reset_index().rename(columns=rename_metrics_dict)

In [168]:
# number of trees in the best estimator selected by the grid search
rf_grid1.best_estimator_.n_estimators

100

In [165]:
# display the best estimator selected by the grid search
rf_grid1.best_estimator_

In [34]:
# show the accumulated metrics, one row per model key in metrics_dict
# (column labels come from rename_metrics_dict, defined outside this section)
pd.DataFrame(metrics_dict).T.reset_index().rename(columns=rename_metrics_dict)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


In [36]:
# Random-forest pipeline: the shared `preprocessing` transformer followed by a seeded forest.
log_features_pipeline_rf = Pipeline(
    steps=[
        ('preprocessing', preprocessing),
        ('estimator', RandomForestRegressor(random_state=42)),
    ]
)

In [37]:
# Random forest on the log/scale/categorical feature columns, preprocessed inside the pipeline.

# 3-fold cross-validated R^2 and MAE on the training split
cv_results = cross_validate(
    log_features_pipeline_rf,
    X_train[log_cols + scale_cols + cat_cols],
    y_train,
    cv=3,
    scoring=['r2', 'neg_mean_absolute_error'],
)
r2_train = np.mean(cv_results['test_r2'])
# the scorer returns negated MAE, so take the absolute value of the mean
mae_train = abs(np.mean(cv_results['test_neg_mean_absolute_error']))

# fit on the whole training split, then predict the hold-out split
preds_test = (
    log_features_pipeline_rf
    .fit(X_train[log_cols + scale_cols + cat_cols], y_train)
    .predict(X_test[log_cols + scale_cols + cat_cols])
)

# hold-out metrics
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# record the four metrics under this experiment's name
metrics_dict['rf_default_log_features'] = [r2_train, mae_train, r2_test, mae_test]

In [38]:
# Same random-forest pipeline, refitted on X_train1 / X_test1
# (presumably the frames that also carry hub columns, judging by the metrics key — confirm).

# 3-fold cross-validated R^2 and MAE on the training split
cv_results = cross_validate(
    log_features_pipeline_rf,
    X_train1,
    y_train,
    cv=3,
    scoring=['r2', 'neg_mean_absolute_error'],
)
r2_train = np.mean(cv_results['test_r2'])
# the scorer returns negated MAE, so take the absolute value of the mean
mae_train = abs(np.mean(cv_results['test_neg_mean_absolute_error']))

# fit on the whole training split, then predict the hold-out split
preds_test = log_features_pipeline_rf.fit(X_train1, y_train).predict(X_test1)

# hold-out metrics
r2_test, mae_test = r2_score(y_test, preds_test), MAE(y_test, preds_test)

# record the four metrics under this experiment's name
metrics_dict['rf_default_log_features_with_hubs'] = [r2_train, mae_train, r2_test, mae_test]

In [39]:
# show the accumulated metrics, one row per model key in metrics_dict
# (column labels come from rename_metrics_dict, defined outside this section)
pd.DataFrame(metrics_dict).T.reset_index().rename(columns=rename_metrics_dict)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


**Добавление полиномов 2 степени в модель.**

In [99]:
# Preprocessing with degree-2 polynomial features (PolynomialFeatures default degree)
# on log_cols, standardization of the numeric columns and one-hot encoding of the categorical ones.
# NOTE(review): log_cols are routed to both 'poly' and 'scale' with
# verbose_feature_names_out=False, so the two outputs may share column names;
# some scikit-learn versions may reject duplicated names — confirm on the target version.
preprocessing_poly = ColumnTransformer([
    ('poly', PolynomialFeatures(), log_cols),
    ('scale', StandardScaler(), log_cols+scale_cols),
    ('ohe', OneHotEncoder(sparse_output=False), cat_cols)
], remainder='passthrough', verbose_feature_names_out=False)

preprocessing_poly.set_output(transform='pandas')

# model pipeline; the forest is seeded like the other random-forest pipelines in this
# notebook so that the reported metrics are reproducible across runs
rf_poly_pipeline = Pipeline([
    ('poly_preprocessing', preprocessing_poly),
    ('estimator', RandomForestRegressor(random_state=42))
])

# Random Forest
# 3-fold cross-validated R^2 and MAE on the training split
cv_results = cross_validate(rf_poly_pipeline, X_train[log_cols+scale_cols+cat_cols], y_train, scoring=['r2', 'neg_mean_absolute_error'], cv=3)
r2_train = cv_results['test_r2'].mean()
# the scorer returns negated MAE, so take the absolute value of the mean
mae_train = abs(cv_results['test_neg_mean_absolute_error'].mean())


# fit on the whole training split
rf_poly_pipeline.fit(X_train[log_cols+scale_cols+cat_cols], y_train)

# predictions on the hold-out split
preds_test = rf_poly_pipeline.predict(X_test[log_cols+scale_cols+cat_cols])
 
# hold-out metrics
r2_test = r2_score(y_test, preds_test)
mae_test = MAE(y_test, preds_test)

# store the metrics under this experiment's name
metrics_dict['rf_default_poly_features'] = [r2_train, mae_train, r2_test, mae_test]

In [103]:
# show the accumulated metrics, one row per model key in metrics_dict
# (column labels come from rename_metrics_dict, defined outside this section)
pd.DataFrame(metrics_dict).T.reset_index().rename(columns=rename_metrics_dict)

Unnamed: 0,model,r2_train,mae_train,r2_test,mae_test
0,lr_default,0.4029515,10.91814,0.405505,10.89326
1,lasso_default,0.3817695,10.9931,0.38442,10.984352
2,lasso_gs,0.4029305,10.91304,0.405497,10.884462
3,enet_gs,0.4029366,10.91162,0.405521,10.883339
4,ridge_gs,0.4029297,10.91386,0.405534,10.888201
5,lr_default_with_hubs,-4.403919e+17,50709220.0,0.425725,10.711119
6,lasso_default_with_hubs,0.3816205,10.99437,0.38442,10.984352
7,lasso_gs_with_hubs,0.4209478,10.72902,0.42577,10.695487
8,enet_gs_with_hubs,0.4212766,10.72376,0.425687,10.691407
9,ridge_gs_with_hubs,0.4212603,10.72096,0.425725,10.692329


Были добавлены дополнительные признаки, полученные из даты: день `day`, месяц `month`, год `year`, день недели публикации `weekday`, время суток публикации `time_category`. Произведено логарифмирование/полиномизация 2 степени признаков `bookmarks`, `comments_count`, `views`, кодирование `one-hot-encoding` категориальных признаков и стандартизация числовых признаков. 

Из всех экспериментов применение случайного леса позволило сильнее всего повысить качество модели (по сравнению с моделями линейной регрессии и деревьев решений). При этом добавление в модель информации о хабах, указанных в статье, как ожидалось, не способствовало улучшению качества модели.

Сохраню модель случайного леса с логарифмированными признаками `rf_default_log_features`, при применении которой на тесте R^2 = 0.63, MAE = 8.22

Доля дисперсии зависимой переменной, объясняемая моделью, во всех экспериментах заметно меньше 1; в дальнейшем планируется учесть влияние других, пока не использованных признаков, например текстов публикаций.

## Подготовка к деплойменту

In [70]:
# show the three column-name lists that are combined to select the final model's features
log_cols, scale_cols, cat_cols

(['bookmarks', 'comments_count', 'views'],
 ['reading_time', 'rating', 'year', 'karma', 'karma_votes', 'month', 'day'],
 ['is_corporative', 'posttype', 'time_category', 'weekday'])

In [71]:
# Restrict both splits to the columns consumed by the final pipeline
# (log-transformed, scaled and categorical features, in that order).
X_train_final, X_test_final = (
    split[log_cols + scale_cols + cat_cols] for split in (X_train, X_test)
)

In [82]:
# preview the first rows of the final training features
X_train_final.head()

Unnamed: 0,bookmarks,comments_count,views,reading_time,rating,year,karma,karma_votes,month,day,is_corporative,posttype,time_category,weekday
64064,20,0,9548,2,0.0,2019,293,415,9,8,1,1,morning,6
94864,78,13,6963,4,0.0,2021,22,52,3,22,0,0,afternoon,0
132630,29,2,9705,6,0.0,2022,6,6,8,5,0,0,night,4
153836,3,28,4330,2,23.6,2023,74,186,4,10,0,1,morning,0
38587,13,17,8136,3,0.0,2017,307,1893,6,16,0,0,morning,4


In [109]:
# Preprocessing + model pipeline that is persisted for deployment.

# np.log1p computes log(1 + x), the same transform as the previous `lambda x: np.log(x+1)`.
# A named NumPy function is used instead of a lambda because this pipeline is serialized
# below: scikit-learn documents that a FunctionTransformer built from a lambda is not
# picklable with the standard pickler, which makes the saved artifact fragile to load.
# NOTE(review): log_cols are routed to both 'log' and 'scale' with
# verbose_feature_names_out=False, so the output frame contains those column names twice
# (log-transformed and standardized); some scikit-learn versions may reject duplicated
# names — confirm on the deployment version.
preprocessing = ColumnTransformer([
    ('log', FunctionTransformer(np.log1p), log_cols),
    ('scale', StandardScaler(), log_cols+scale_cols),
    ('ohe', OneHotEncoder(sparse_output=False), cat_cols)
], remainder='passthrough', verbose_feature_names_out=False)

# make every transformer return pandas DataFrames
preprocessing.set_output(transform='pandas')

# final model: preprocessing followed by a seeded random forest
final_pipeline = Pipeline([
    ('preprocessing', preprocessing),
    ('estimator', RandomForestRegressor(random_state=42))
])



In [110]:
# fit the preprocessing transformer on the final training features and preview the transformed frame
preprocessing.fit_transform(X_train_final).head()

Unnamed: 0,bookmarks,comments_count,views,bookmarks.1,comments_count.1,views.1,reading_time,rating,year,karma,...,time_category_evening,time_category_morning,time_category_night,weekday_0,weekday_1,weekday_2,weekday_3,weekday_4,weekday_5,weekday_6
64064,3.044522,0.0,9.164192,-0.301817,-0.402385,-0.178431,-0.69762,-0.516373,-0.357342,0.384287,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
94864,4.369448,2.639057,8.848509,0.529184,-0.223633,-0.251909,-0.335469,-0.516373,0.462243,-0.530741,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
132630,3.401197,1.098612,9.1805,-0.172869,-0.374885,-0.173968,0.026681,-0.516373,0.872035,-0.584765,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
153836,1.386294,3.367296,8.373554,-0.545387,-0.017381,-0.326752,-0.69762,-0.291809,1.281827,-0.355164,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
38587,2.639057,2.890372,9.004177,-0.402111,-0.168632,-0.218567,-0.516545,-0.516373,-1.176926,0.431558,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [111]:
# Fit the final pipeline on the training split, then score it on the hold-out split.
final_pipeline.fit(X_train_final, y_train)
pred_test = final_pipeline.predict(X_test_final)

print(f'r2_test = {r2_score(y_test, pred_test)} , MAE_test = {MAE(y_test, pred_test)}')

r2_test = 0.629968654610109 , MAE_test = 8.223088954623668


In [112]:
# Persist the fitted pipeline to disk in pickle format.
# `pickle` here is the dill module (imported as `import dill as pickle` in the first cell).
with open('final_model_pipeline.pkl', 'wb') as file:
    pickle.dump(final_pipeline, file)