In [2]:
import os
import sys
import pathlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve, auc
import seaborn as sns
import json
import pathlib
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
from catboost import CatBoostClassifier, CatBoostRegressor
from xgboost import XGBClassifier, XGBRegressor
from scipy.stats import pearsonr
import copy

In [None]:
# Cross-validated comparison of the candidate regressors. For each model we print
# the mean Pearson correlation returned by the notebook's cvPearson helper
# (second argument is presumably the fold count -- TODO confirm against its
# definition) and the mean R^2 from cross_val_score on X, y with cv=5.
for model in (AbRFR, extra_trees_model, xgb_model, catboost_model):
    model_name = type(model).__name__

    mean_pearson = cvPearson(model, 5)
    print(f"Средний коэффициент Пирсона для {model_name}: {mean_pearson:.3f}")

    r2_per_fold = cross_val_score(model, X, y, scoring='r2', cv=5)
    mean_r2 = np.mean(r2_per_fold)
    print(f"Средний R-squared для {model_name}: {mean_r2:.3f}")

Средний коэффициент Пирсона для RandomForestRegressor: 0.724
Средний R-squared для RandomForestRegressor: 0.480
Средний коэффициент Пирсона для ExtraTreesRegressor: 0.642
Средний R-squared для ExtraTreesRegressor: 0.299
Средний коэффициент Пирсона для XGBRegressor: 0.898
Средний R-squared для XGBRegressor: 0.964
Средний коэффициент Пирсона для CatBoostRegressor: 0.778
Средний R-squared для CatBoostRegressor: 0.632

In [38]:
# Collect the per-feature importance vector exposed by each fitted model.
abrfr_imp, extra_trees_imp, xgb_imp = (
    fitted.feature_importances_
    for fitted in (AbRFR, extra_trees_model, xgb_model)
)
catboost_imp = catboost_model.get_feature_importance()

# Print, feature by feature, the mean importance followed by each model's own value.
# The mean is taken over AbRFR, ExtraTrees and XGBoost only; CatBoost is reported
# but deliberately left out of the average (NOTE(review): presumably because its
# importances are on a different scale -- confirm).
per_feature_rows = zip(feature_names, abrfr_imp, extra_trees_imp, xgb_imp, catboost_imp)
for name, rf_score, et_score, xgb_score, cb_score in per_feature_rows:
    mean_imp = np.mean([rf_score, et_score, xgb_score])  # CatBoost excluded
    report_lines = (
        f"Признак {name}:",
        f"Средняя важность: {mean_imp:.3f}",
        f" AbRFR: {rf_score:.3f}",
        f" ExtraTreesRegressor: {et_score:.3f}",
        f" XGBRegressor: {xgb_score:.3f}",
        f" CatBoostRegressor: {cb_score:.3f}",
    )
    print(*report_lines, sep="\n")

Признак Hyd_Hyd_4:
Средняя важность: 0.003
 AbRFR: 0.005
 ExtraTreesRegressor: 0.004
 XGBRegressor: 0.001
 CatBoostRegressor: 0.320
Признак Hyd_Pos_4:
Средняя важность: 0.002
 AbRFR: 0.002
 ExtraTreesRegressor: 0.003
 XGBRegressor: 0.001
 CatBoostRegressor: 0.086
Признак Hyd_Neg_4:
Средняя важность: 0.001
 AbRFR: 0.001
 ExtraTreesRegressor: 0.002
 XGBRegressor: 0.001
 CatBoostRegressor: 0.219
Признак Hyd_Acc_4:
Средняя важность: 0.003
 AbRFR: 0.004
 ExtraTreesRegressor: 0.003
 XGBRegressor: 0.002
 CatBoostRegressor: 0.088
Признак Hyd_Don_4:
Средняя важность: 0.004
 AbRFR: 0.003
 ExtraTreesRegressor: 0.004
 XGBRegressor: 0.004
 CatBoostRegressor: 0.558
Признак Hyd_Aro_4:
Средняя важность: 0.005
 AbRFR: 0.005
 ExtraTreesRegressor: 0.006
 XGBRegressor: 0.003
 CatBoostRegressor: 0.283
Признак Hyd_Sul_4:
Средняя важность: 0.002
 AbRFR: 0.002
 ExtraTreesRegressor: 0.003
 XGBRegressor: 0.002
 CatBoostRegressor: 0.150
Признак Hyd_Neu_4:
Средняя важность: 0.005
 AbRFR: 0.006
 ExtraTreesRegresso

In [12]:
# Exhaustive hyperparameter search for the AbRFR regressor, scored by R^2 with
# 5-fold cross-validation on the training split. The grid below has
# 5 * 6 * 1 * 1 * 2 = 60 combinations, i.e. 300 cross-validation fits.
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
    'max_depth': [None, 10, 20, 30, 40, 50],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'bootstrap': [True, False]
}
grid_search = GridSearchCV(estimator=AbRFR, param_grid=param_grid, cv=5, scoring='r2', n_jobs=-1)
grid_search.fit(X_train, y_train)

# best_score_ is the mean cross-validated R^2 of the winning parameter combination.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Лучшие параметры: {best_params}")
print(f"Лучшая оценка: {best_score}")

# Evaluate on the held-out test split. With GridSearchCV's default refit=True,
# predict() delegates to the best estimator refitted on the whole training split.
y_pred = grid_search.predict(X_test)
test_r2 = r2_score(y_test, y_pred)
# The value was previously stored under the misleading name `mse` although it is
# an R^2 score, not a mean squared error. The old name is kept as an alias in case
# later cells still read it; prefer `test_r2`.
mse = test_r2
print(f"R2 на тестовом наборе: {test_r2}")


Лучшие параметры: {'bootstrap': False, 'max_depth': 50, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100}
Лучшая оценка: 0.7376744814162841
R2 на тестовом наборе: 0.695120170393292
