In [23]:
import re
import os
import sys
import json
from pathlib import Path
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import PCA
import warnings

warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler, MinMaxScaler
PROJECT_DIR = Path(os.getcwd())
sys.path.append(str(Path(PROJECT_DIR, 'utils')))
import make_full_pipeline
import hyperparameters_tuning
import importlib
importlib.reload(make_full_pipeline)
importlib.reload(hyperparameters_tuning)
from make_full_pipeline import Pipeline_log_regression, PipelineCatBoostClassifier
from hyperparameters_tuning import custom_grid_search, custom_grid_search_catboost
#import make_full_pipeline

# Загрузка данных

In [60]:
X_train = pd.read_csv(str(Path(PROJECT_DIR, 'ds_problem', 'problem_train.csv')), low_memory=False)
labels = pd.read_csv(str(Path(PROJECT_DIR, 'ds_problem', 'problem_labels.csv')))
X_test = pd.read_csv(str(Path(PROJECT_DIR, 'ds_problem', 'problem_test.csv')), low_memory=False)

In [30]:
labels.iloc[:, 1]

0       1
1       0
2       0
3       0
4       0
       ..
7995    1
7996    0
7997    1
7998    1
7999    1
Name: service_a, Length: 8000, dtype: int64

In [38]:
X_train.shape

(8000, 1379)

In [21]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Columns: 1379 entries, id to c_1377
dtypes: float64(345), int64(9), object(1025)
memory usage: 84.2+ MB


In [22]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 1379 entries, id to c_1377
dtypes: float64(457), int64(9), object(913)
memory usage: 21.0+ MB


In [38]:
cat_features_train = [col for col in X_train.columns if X_train[col].dtype == 'object']
cat_features_test = [col for col in X_test.columns if X_test[col].dtype == 'object']
int_feat = set(cat_features_train) - set(cat_features_test)
X_test[list(int_feat)]

Unnamed: 0,c_0754,c_0973,c_0741,c_1013,c_0879,c_0429,c_0641,c_0769,c_0999,c_0505,...,c_0821,c_0481,c_0697,c_1102,c_1207,c_0532,c_1108,c_0857,c_0603,c_1349
0,,,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,,,,,,,,,,,...,,,,,,,,,,
1996,,,,,,,,,,,...,,,,,,,,,,
1997,,,,,,,,,,,...,,,,,,,,,,
1998,,,,,,,,,,,...,,,,,,,,,,


# Логистическая регрессия

# Пробный запуск на произвольных параметрах

In [64]:
feature_selection_params = {'min_filled_ratio': 0.8}
my_transformer_params = {
    'strategy_cat': 'most_frequent',
    'strategy_num': 'mean',
    'scaler': StandardScaler(),
    'use_pca': True,
    'pca_percentage': 0.90
}
logistic_regression_params = {'solver': 'saga', 'max_iter': 1000, 'penalty': 'l2', 'verbose': 1}

pipeline_log_regression = Pipeline_log_regression(feature_selection_params=feature_selection_params,
                                                                     my_transformer_params=my_transformer_params, 
                                                                     logistic_regression_params=logistic_regression_params)

In [69]:
select = make_full_pipeline.FeatureSelection(**feature_selection_params)
X_new = select.fit_transform(X_train)
X_new

Unnamed: 0,id,release,n_0002,n_0005,n_0019,n_0038,n_0067,n_0078,n_0083,n_0108,...,c_1223,c_1227,c_1236,c_1244,c_1252,c_1259,c_1286,c_1316,c_1348,c_1372
0,11193,a,0.025449,0.368421,0.0,0.193548,0.928571,0.800000,1.000000,0.800000,...,c,a,c,d,b,n,b,b,b,a
1,11382,a,0.031297,0.315789,0.0,0.177419,0.928571,0.666667,0.000000,0.666667,...,a,a,c,d,b,e,b,b,b,a
2,16531,a,0.024475,0.342105,0.0,0.290323,0.428571,0.833333,1.000000,0.833333,...,c,a,a,d,b,w,b,b,b,a
3,1896,a,0.041694,0.447368,0.0,0.370968,0.571429,0.566667,0.833333,0.566667,...,c,a,c,d,b,e,b,a,b,a
4,18262,c,0.038120,0.315789,0.0,0.177419,0.928571,0.600000,0.666667,0.600000,...,c,a,c,d,b,e,b,b,b,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,10898,a,0.053931,0.394737,0.0,0.209677,0.000000,0.166667,0.000000,0.166667,...,c,b,c,a,b,i,a,a,c,a
7996,16664,a,0.031731,0.394737,0.0,0.290323,0.714286,0.733333,0.083333,0.733333,...,c,a,c,d,b,w,b,b,b,a
7997,5334,c,0.033463,0.394737,0.0,0.177419,1.000000,0.833333,1.000000,0.833333,...,c,a,c,d,b,n,b,b,b,a
7998,7905,c,0.047109,0.289474,0.2,0.177419,0.357143,0.400000,1.000000,0.400000,...,c,a,a,a,b,n,b,b,b,a


In [70]:
trans = make_full_pipeline.MyTransformer(**my_transformer_params)
X_newnew = trans.fit_transform(X_new, labels.iloc[:, 1])
X_newnew

Unnamed: 0,PC1,release,c_0368,c_0401,c_0426,c_0444,c_0456,c_0461,c_0466,c_0500,...,c_1227,c_1236,c_1244,c_1252,c_1259,c_1286,c_1316,c_1348,c_1372,o_0264
0,0.879940,0.066935,0.427676,0.240086,0.732389,0.532052,0.249269,0.578505,-0.451807,0.070930,...,-0.556174,0.558100,-0.921129,0.267172,1.511940,-0.326579,-0.490214,0.258657,0.268512,1.078807
1,0.800597,0.066935,0.427676,0.240086,-1.365395,0.532052,0.249269,0.578505,-0.451807,-1.623603,...,-0.556174,0.558100,-0.921129,0.267172,-1.017859,-0.326579,-0.490214,0.258657,0.268512,1.078807
2,-0.570815,0.066935,0.427676,0.240086,0.732389,0.532052,0.249269,-1.728594,2.431598,0.070930,...,-0.556174,-1.791794,-0.921129,0.267172,-0.977762,-0.326579,-0.490214,0.258657,0.268512,-2.049304
3,-0.725495,0.066935,0.427676,0.240086,0.732389,0.532052,0.249269,0.578505,2.431598,-1.623603,...,-0.556174,0.558100,-0.921129,0.267172,-1.017859,-0.326579,2.039924,0.258657,0.268512,-0.261812
4,0.352902,-1.728382,0.427676,0.240086,-1.365395,0.532052,0.249269,-1.728594,-0.373489,0.804289,...,-0.556174,0.558100,-0.921129,0.267172,-1.017859,-0.326579,-0.490214,0.258657,0.268512,0.631934
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,-1.641592,0.066935,0.427676,0.240086,0.732389,0.532052,0.249269,-1.728594,-0.373489,-1.623603,...,1.798000,0.558100,0.813607,0.267172,1.413996,3.062047,2.039924,-3.844358,0.268512,0.185061
7996,0.069806,0.066935,0.427676,0.240086,0.732389,-1.879515,0.249269,-1.728594,2.431598,-1.623603,...,-0.556174,0.558100,-0.921129,0.267172,-0.977762,-0.326579,-0.490214,0.258657,0.268512,0.185061
7997,1.442170,-1.728382,0.427676,0.240086,-1.365395,0.532052,0.249269,0.578505,-0.451807,0.070930,...,-0.556174,0.558100,-0.921129,0.267172,1.511940,-0.326579,-0.490214,0.258657,0.268512,1.525680
7998,-1.097215,-1.728382,0.427676,0.240086,0.732389,0.532052,-4.011735,0.578505,-0.373489,0.804289,...,-0.556174,-1.791794,0.813607,0.267172,1.511940,-0.326579,-0.490214,0.258657,0.268512,-0.261812


In [4]:
feature_selection_params = {'min_filled_ratio': 0.8}
my_transformer_params = {
    'strategy_cat': 'most_frequent',
    'strategy_num': 'mean',
    'scaler': StandardScaler(),
    'pca_percentage': 0.95
}
logistic_regression_params = {'solver': 'saga', 'max_iter': 1000, 'penalty': 'l2', 'verbose': 1}
log_loss_scores = []
roc_auc_scores = []
for i in range(1, labels.shape[1]):
    y = labels.iloc[:, i]
    X_train_, X_test_, y_train, y_test = train_test_split(
        X_train, y, test_size=0.2, random_state=42, stratify=y
    )
    pipeline_log_regression = Pipeline_log_regression(feature_selection_params=feature_selection_params,
                                                                     my_transformer_params=my_transformer_params, 
                                                                     logistic_regression_params=logistic_regression_params)

    pipeline_log_regression.fit(X_train_, y_train)
    # y_pred_proba = pipeline_log_regression.predict_proba(X_test_)
    # cur_log_loss = log_loss(y_test, y_pred_proba[:, 1])
    dct = pipeline_log_regression.evaluate_metrics(X_test_, y_test)
    cur_log_loss, cur_roc_auc = dct['logloss'], dct['roc_auc']
    print(f'LogLoss and ROC auc on {i} label {cur_log_loss} and {cur_roc_auc}')
    log_loss_scores.append(cur_log_loss)
    roc_auc_scores.append(cur_roc_auc)

print(f'Mean LogLoss is equal {sum(log_loss_scores) / len(log_loss_scores)}', 
     f'Mean LogLoss is equal {sum(roc_auc_scores) / len(roc_auc_scores)}', sep='\n')

convergence after 432 epochs took 4 seconds
LogLoss and ROC auc on 1 label 0.422373571348277 and 0.7951330381284533
convergence after 364 epochs took 3 seconds
LogLoss and ROC auc on 2 label 0.5178408721121552 and 0.6994905869324475
convergence after 311 epochs took 3 seconds
LogLoss and ROC auc on 3 label 0.5095554825897006 and 0.5587463801357613
max_iter reached after 9 seconds




LogLoss and ROC auc on 4 label 0.05122431873121796 and 0.5721668903487086
convergence after 594 epochs took 5 seconds
LogLoss and ROC auc on 5 label 0.18151838700170025 and 0.5044929495253602
convergence after 684 epochs took 6 seconds
LogLoss and ROC auc on 6 label 0.0662948185766507 and 0.6115479990683783
convergence after 526 epochs took 5 seconds
LogLoss and ROC auc on 7 label 0.1818698717906144 and 0.5107298012087903
max_iter reached after 8 seconds




LogLoss and ROC auc on 8 label 0.487025991572253 and 0.6650760233918128
convergence after 694 epochs took 6 seconds
LogLoss and ROC auc on 9 label 0.08159117145664802 and 0.4993646759847522
convergence after 393 epochs took 4 seconds
LogLoss and ROC auc on 10 label 0.3903840050625679 and 0.5549429721213885
convergence after 307 epochs took 3 seconds
LogLoss and ROC auc on 11 label 0.46886696644468356 and 0.5832663382188232
convergence after 301 epochs took 3 seconds
LogLoss and ROC auc on 12 label 0.25138735428997516 and 0.6032563565586886
convergence after 355 epochs took 3 seconds
LogLoss and ROC auc on 13 label 0.21726858218092185 and 0.6047885391912518
convergence after 334 epochs took 3 seconds
LogLoss and ROC auc on 14 label 0.35625552454750364 and 0.5634447335067708
Mean LogLoss is equal 0.2988183512646336
Mean LogLoss is equal 0.5947462345943848


In [71]:
np.logspace(-3, 2, 10)

array([1.00000000e-03, 3.59381366e-03, 1.29154967e-02, 4.64158883e-02,
       1.66810054e-01, 5.99484250e-01, 2.15443469e+00, 7.74263683e+00,
       2.78255940e+01, 1.00000000e+02])

# Подбор гиперпараметров для кастомного пайплайна с лог регрессией

Будем использовать обычную дискретную сетку параметров и подбирать с помощью GridSearchCV, так как зачастую на моей практике обычный поиск по сетке дает лучше качество чем замороченные подборы из optuna (Байесовская оптимизация и тд). Плюс он намного быстрее чем его более сложный аналог.

In [78]:
param_distributions = {
    'feature_selection_params__min_filled_ratio': [0.95, 0.8, 0.6],
    'my_transformer_params__strategy_cat': ['most_frequent'],
    'my_transformer_params__strategy_num': ['mean'],
    'my_transformer_params__scaler': [StandardScaler(), MinMaxScaler()],
    'my_transformer_params__use_pca': [False, True],
    'my_transformer_params__pca_percentage': [0.90],
    'logistic_regression_params__penalty': ['l2'],
    'logistic_regression_params__C': [0.01, 0.1, 1, 10],
    'logistic_regression_params__solver': ['saga', 'lbfgs'],
    'logistic_regression_params__max_iter': [500, 1000]
}
best_params_for_all_labels = {i:{} for i in range(1, labels.shape[1])}
all_params = []
for i in range(1, labels.shape[1]):
    y = labels.iloc[:, i].values
    
    pipeline = Pipeline_log_regression()
    results = custom_grid_search(X_train, y, pipeline, param_distributions, cv_splits=3)

    # Лучшие параметры
    best_result = results[0]
    best_params_for_all_labels[i] = best_result
    all_params.append(results)
    
    print(f"Лучшие параметры для {i}-го labels:", best_result["params"])
    print(f"Лучший LogLoss для {i}-го labels:", best_result["logloss"])
    print(f"Лучший ROC AUC для {i}-го labels:", best_result["roc_auc"])




Лучшие параметры для 1-го labels: {'feature_selection_params__min_filled_ratio': 0.6, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': StandardScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.01, 'logistic_regression_params__solver': 'lbfgs', 'logistic_regression_params__max_iter': 500}
Лучший LogLoss для 1-го labels: 0.3903035684545575
Лучший ROC AUC для 1-го labels: 0.8334167877665979




Лучшие параметры для 2-го labels: {'feature_selection_params__min_filled_ratio': 0.6, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': MinMaxScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.01, 'logistic_regression_params__solver': 'lbfgs', 'logistic_regression_params__max_iter': 500}
Лучший LogLoss для 2-го labels: 0.5177541004691174
Лучший ROC AUC для 2-го labels: 0.6832664169253077




Лучшие параметры для 3-го labels: {'feature_selection_params__min_filled_ratio': 0.8, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': MinMaxScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.01, 'logistic_regression_params__solver': 'saga', 'logistic_regression_params__max_iter': 1000}
Лучший LogLoss для 3-го labels: 0.5220802481410182
Лучший ROC AUC для 3-го labels: 0.5483847500762287




Лучшие параметры для 4-го labels: {'feature_selection_params__min_filled_ratio': 0.6, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': MinMaxScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.1, 'logistic_regression_params__solver': 'saga', 'logistic_regression_params__max_iter': 500}
Лучший LogLoss для 4-го labels: 0.04907697189780335
Лучший ROC AUC для 4-го labels: 0.6197597611318596




Лучшие параметры для 5-го labels: {'feature_selection_params__min_filled_ratio': 0.6, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': StandardScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.01, 'logistic_regression_params__solver': 'saga', 'logistic_regression_params__max_iter': 500}
Лучший LogLoss для 5-го labels: 0.18356338931543334
Лучший ROC AUC для 5-го labels: 0.5058295364850062




Лучшие параметры для 6-го labels: {'feature_selection_params__min_filled_ratio': 0.8, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': MinMaxScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.1, 'logistic_regression_params__solver': 'saga', 'logistic_regression_params__max_iter': 500}
Лучший LogLoss для 6-го labels: 0.06990270509682352
Лучший ROC AUC для 6-го labels: 0.5841505925515196




Лучшие параметры для 7-го labels: {'feature_selection_params__min_filled_ratio': 0.8, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': StandardScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.01, 'logistic_regression_params__solver': 'lbfgs', 'logistic_regression_params__max_iter': 500}
Лучший LogLoss для 7-го labels: 0.17798877524041964
Лучший ROC AUC для 7-го labels: 0.5032190604288744




Лучшие параметры для 8-го labels: {'feature_selection_params__min_filled_ratio': 0.8, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': MinMaxScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.01, 'logistic_regression_params__solver': 'saga', 'logistic_regression_params__max_iter': 1000}
Лучший LogLoss для 8-го labels: 0.4743981912163322
Лучший ROC AUC для 8-го labels: 0.6689736457423757




Лучшие параметры для 9-го labels: {'feature_selection_params__min_filled_ratio': 0.8, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': MinMaxScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.01, 'logistic_regression_params__solver': 'lbfgs', 'logistic_regression_params__max_iter': 500}
Лучший LogLoss для 9-го labels: 0.07287468791771955
Лучший ROC AUC для 9-го labels: 0.5




Лучшие параметры для 10-го labels: {'feature_selection_params__min_filled_ratio': 0.8, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': MinMaxScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.01, 'logistic_regression_params__solver': 'lbfgs', 'logistic_regression_params__max_iter': 500}
Лучший LogLoss для 10-го labels: 0.38292228208923734
Лучший ROC AUC для 10-го labels: 0.5654191349972997




Лучшие параметры для 11-го labels: {'feature_selection_params__min_filled_ratio': 0.95, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': MinMaxScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 1, 'logistic_regression_params__solver': 'saga', 'logistic_regression_params__max_iter': 1000}
Лучший LogLoss для 11-го labels: 0.4764691340237448
Лучший ROC AUC для 11-го labels: 0.5660837002236407




Лучшие параметры для 12-го labels: {'feature_selection_params__min_filled_ratio': 0.6, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': StandardScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.01, 'logistic_regression_params__solver': 'lbfgs', 'logistic_regression_params__max_iter': 500}
Лучший LogLoss для 12-го labels: 0.23945948715959134
Лучший ROC AUC для 12-го labels: 0.6243772090338636




Лучшие параметры для 13-го labels: {'feature_selection_params__min_filled_ratio': 0.6, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': StandardScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.01, 'logistic_regression_params__solver': 'lbfgs', 'logistic_regression_params__max_iter': 500}
Лучший LogLoss для 13-го labels: 0.20667891269199523
Лучший ROC AUC для 13-го labels: 0.5974325259274443




Лучшие параметры для 14-го labels: {'feature_selection_params__min_filled_ratio': 0.8, 'my_transformer_params__strategy_cat': 'most_frequent', 'my_transformer_params__strategy_num': 'mean', 'my_transformer_params__scaler': MinMaxScaler(), 'my_transformer_params__use_pca': False, 'my_transformer_params__pca_percentage': 0.9, 'logistic_regression_params__penalty': 'l2', 'logistic_regression_params__C': 0.1, 'logistic_regression_params__solver': 'lbfgs', 'logistic_regression_params__max_iter': 500}
Лучший LogLoss для 14-го labels: 0.3637229951277641
Лучший ROC AUC для 14-го labels: 0.5713727081378027


In [87]:
best_params_for_all_labels[1]

{'params': {'feature_selection_params__min_filled_ratio': 0.6,
  'my_transformer_params__strategy_cat': 'most_frequent',
  'my_transformer_params__strategy_num': 'mean',
  'my_transformer_params__scaler': StandardScaler(),
  'my_transformer_params__use_pca': False,
  'my_transformer_params__pca_percentage': 0.9,
  'logistic_regression_params__penalty': 'l2',
  'logistic_regression_params__C': 0.01,
  'logistic_regression_params__solver': 'lbfgs',
  'logistic_regression_params__max_iter': 500},
 'logloss': 0.3903035684545575,
 'roc_auc': 0.8334167877665979}

# Средние значения метрик на лучших параметрах по всем таргетам

In [86]:
log_ = 0
roc_ = 0
for key in best_params_for_all_labels:
    log_ += best_params_for_all_labels[key]['logloss']
    roc_ += best_params_for_all_labels[key]['roc_auc']
print(f'Mean metrics LogLoss: {log_/len(best_params_for_all_labels)}, roc auc: {roc_/len(best_params_for_all_labels)}')

Mean metrics LogLoss: 0.2947996749172542, roc auc: 0.5979775592448443


In [88]:
y_test = pd.DataFrame()
for i in range(1, labels.shape[1]):
    y = labels.iloc[:, i]
    params = best_params_for_all_labels[i]['params']
    pipeline = Pipeline_log_regression()
    pipeline.set_params(**params)
    pipeline.fit(X_train, y)

    y_pred_proba = pipeline.predict_proba(X_test)[:, 1]  # Берем вероятности для класса 1
    
    y_test[f'label_{i}'] = y_pred_proba
y_test



Unnamed: 0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10,label_11,label_12,label_13,label_14
0,0.986220,0.553300,0.231167,0.000169,0.007305,0.000203,0.018712,0.079730,0.012906,0.915204,0.900588,0.001799,0.001455,0.107867
1,0.014209,0.050486,0.056619,0.092232,0.028761,0.000091,0.002987,0.142159,0.000829,0.886626,0.929991,0.021327,0.016646,0.072718
2,0.584944,0.771090,0.399411,0.000089,0.027439,0.000847,0.067183,0.204110,0.010951,0.641996,0.658927,0.007627,0.005653,0.252083
3,0.151878,0.052318,0.246243,0.000908,0.021560,0.000565,0.125253,0.327878,0.043074,0.849342,0.737922,0.333220,0.202433,0.159786
4,0.892496,0.559289,0.371619,0.000011,0.006919,0.001644,0.092282,0.173180,0.008613,0.695299,0.484684,0.025068,0.014902,0.008614
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.102323,0.097502,0.085248,0.000193,0.044334,0.063220,0.016387,0.230980,0.012979,0.921451,0.835476,0.135333,0.119962,0.003006
1996,0.863646,0.218510,0.371108,0.001226,0.004629,0.202698,0.063378,0.233760,0.015992,0.841090,0.801543,0.054647,0.064907,0.167379
1997,0.364852,0.220862,0.268962,0.000854,0.043374,0.002189,0.054556,0.257434,0.033537,0.874723,0.680260,0.036381,0.032753,0.312377
1998,0.256029,0.205566,0.226538,0.000005,0.014791,0.000693,0.023515,0.181057,0.029157,0.950734,0.855394,0.026312,0.022477,0.004689


# Сохранение первого результата

In [89]:
results_dir = Path(PROJECT_DIR, 'results')
y_test.to_csv(results_dir / 'y_log_reg.csv', index=False)

# Эксперимент со вторым пайплайном (CatBoost)

Теперь возьмем за основу более продвинутую модель - CatBoostClassifier. В пайплайне оставим FeatureSelection, но будем его использовать по флагу (далее это будет гиперпараметр). Есть гипотеза, что он не нужен здесь, так как catboost сам отлично умеет фильтровать признаки, заполнять nan и преобразовывать категориальные признаки. В конце добавится возможность калибровки, так как градиентные бустинги (классификаторы) плохо изначально откалиброваны. Но в catboost из коробки predict_proba выдает достаточно неплохие вероятности, поэтому будем сравнивать метрики на обученой модели с калибровкой и без нее (тоже своего рода гиперпараметр).

In [93]:
params = {'iterations': 100, 'depth': 6, 'learning_rate': 0.1, 'loss_function': 'Logloss', 'eval_metric': 'Logloss'}

model = CatBoostClassifier(**params)

# Пробные запуск

In [5]:
model_catboost = PipelineCatBoostClassifier()
model_catboost.fit(X_train, labels.iloc[:, 1])

0:	learn: 0.6380178	total: 662ms	remaining: 1m 5s
1:	learn: 0.5944063	total: 1.15s	remaining: 56.5s
2:	learn: 0.5633044	total: 1.41s	remaining: 45.5s
3:	learn: 0.5300563	total: 1.89s	remaining: 45.3s
4:	learn: 0.5091777	total: 2.3s	remaining: 43.7s
5:	learn: 0.4923494	total: 2.69s	remaining: 42.2s
6:	learn: 0.4735067	total: 3.08s	remaining: 40.9s
7:	learn: 0.4624878	total: 3.5s	remaining: 40.3s
8:	learn: 0.4507276	total: 3.95s	remaining: 39.9s
9:	learn: 0.4365399	total: 4.33s	remaining: 38.9s
10:	learn: 0.4270693	total: 4.74s	remaining: 38.3s
11:	learn: 0.4172910	total: 5.15s	remaining: 37.8s
12:	learn: 0.4102954	total: 5.55s	remaining: 37.2s
13:	learn: 0.4037776	total: 5.95s	remaining: 36.5s
14:	learn: 0.3978591	total: 6.35s	remaining: 36s
15:	learn: 0.3925900	total: 6.76s	remaining: 35.5s
16:	learn: 0.3888215	total: 7.13s	remaining: 34.8s
17:	learn: 0.3846917	total: 7.51s	remaining: 34.2s
18:	learn: 0.3813638	total: 7.92s	remaining: 33.8s
19:	learn: 0.3787115	total: 8.31s	remaining: 

In [6]:
y_pred_proba = model_catboost.predict_proba(X_train)
y_pred_proba

array([[0.0353318 , 0.9646682 ],
       [0.92803095, 0.07196905],
       [0.89878062, 0.10121938],
       ...,
       [0.03559781, 0.96440219],
       [0.03503887, 0.96496113],
       [0.03733237, 0.96266763]])

In [9]:
y_pred_proba[:, 1]

array([0.9646682 , 0.07196905, 0.10121938, ..., 0.96440219, 0.96496113,
       0.96266763])

In [10]:
log_loss(labels.iloc[:, 1], y_pred_proba[:, 1])

0.3275547203148589

In [11]:
y_pred = model_catboost.predict(X_train)
roc_auc_score(labels.iloc[:, 1], y_pred)

0.8570770678441765

In [15]:
model_catboost = PipelineCatBoostClassifier()
model_catboost.fit(X_train, labels.iloc[:, 3])

0:	learn: 0.6633473	total: 353ms	remaining: 34.9s
1:	learn: 0.6407517	total: 927ms	remaining: 45.4s
2:	learn: 0.6196449	total: 1.35s	remaining: 43.6s
3:	learn: 0.6015374	total: 1.83s	remaining: 43.9s
4:	learn: 0.5890602	total: 2.24s	remaining: 42.6s
5:	learn: 0.5819369	total: 2.51s	remaining: 39.4s
6:	learn: 0.5722996	total: 3.03s	remaining: 40.2s
7:	learn: 0.5643059	total: 3.45s	remaining: 39.6s
8:	learn: 0.5580851	total: 3.86s	remaining: 39.1s
9:	learn: 0.5518173	total: 4.28s	remaining: 38.5s
10:	learn: 0.5469030	total: 4.79s	remaining: 38.7s
11:	learn: 0.5434226	total: 5.23s	remaining: 38.4s
12:	learn: 0.5420931	total: 5.32s	remaining: 35.6s
13:	learn: 0.5386344	total: 5.71s	remaining: 35.1s
14:	learn: 0.5366481	total: 6.13s	remaining: 34.7s
15:	learn: 0.5350093	total: 6.33s	remaining: 33.2s
16:	learn: 0.5334380	total: 6.74s	remaining: 32.9s
17:	learn: 0.5316195	total: 6.98s	remaining: 31.8s
18:	learn: 0.5291901	total: 7.39s	remaining: 31.5s
19:	learn: 0.5273287	total: 7.87s	remaini

In [19]:
y_pred_proba = model_catboost.predict_proba(X_train)[:, 1]
log_loss(labels.iloc[:, 3], y_pred_proba)

0.48213934651200135

In [20]:
y_pred = model_catboost.predict(X_train)
roc_auc_score(labels.iloc[:, 3], y_pred)

0.5887324156695058

In [31]:
y_pred_proba = model_catboost.predict_proba(X_train)
log_loss(labels.iloc[:, 3], y_pred_proba)

0.48213934651200135

In [32]:
y_pred = model_catboost.predict(X_train)
roc_auc_score(labels.iloc[:, 3], y_pred)

0.5887324156695058

In [33]:
y_pred_proba = model_catboost.model.predict_proba(X_train)
log_loss(labels.iloc[:, 3], y_pred_proba)

0.4743838280623405

In [47]:
y = labels.iloc[:, 3]
X_train_, X_val, y_train, y_val = train_test_split(X_train, y, test_size=0.2, stratify=y)

In [40]:
X_train_.shape

(6400, 1379)

In [41]:
X_val.shape

(1600, 1379)

In [72]:
model_catboost = PipelineCatBoostClassifier(use_feature_selection=True)
model_catboost.fit(X_train_, y_train)

In [73]:
model_catboost.evaluate_metrics(X_val, y_val, calibrate=True)

{'logloss': 0.507940085324168, 'roc_auc': 0.5515844547418454}

In [74]:
model_catboost.evaluate_metrics(X_val, y_val, calibrate=False)

{'logloss': 0.5069634713497446, 'roc_auc': 0.5316820218293222}

# Подбор гиперпараметров для пайплайна с catboost

Буду подбирать гиперпараметры также по дискретной сетке. Из-за того, что каждый из 14 таргетов распределен по разному и как показывает предыдущий опыт с лог регрессией, следует рассматривать предсказания каждого таргета как отдельную независимую задачу. Так как есть случаи, где довольно хорошо модели справляются, а где наоборот ошибка велика. Поэтому буду делать поиск по сетке для каждого из 14 таргетов отдельно. Есть основания полагать, что параметры могут отличаться в зависимости от случая.

Подбор гиперпараметров по итогу выполнялся в файле train_model.py, так как у меня падал локальный сервер со временем. Видимо из-за того, что обучение шло около суток.

In [None]:
param_distributions = {
    'use_feature_selection': [True, False],  # Использовать ли выбор признаков
    'min_filled_ratio': [0.6],  # Минимальная заполненность для отбора признаков
    'model_params__iterations': [100],
    'model_params__depth': [6, 8],
    'model_params__learning_rate': [0.05, 0.1],
    'model_params__l2_leaf_reg': [3, 5],
    'model_params__border_count': [32, 64],
    'calibration_method': ['isotonic'],  # Способ калибровки
    'cv': [2],  # Количество фолдов для калибровки
}
best_params_for_all_labels = {i:{} for i in range(1, labels.shape[1])}
all_params = []
for i in range(2, labels.shape[1]): # начинаю со второго таргета, т. к. для первого отдельно уже посчитал
    y = labels.iloc[:, i].values
    
    pipeline = PipelineCatBoostClassifier()
    results = custom_grid_search_catboost(X_train, y, pipeline, param_distributions, cv_splits=3)

    # Лучшие параметры
    best_result = results[0]
    best_params_for_all_labels[i] = best_result
    all_params.append(results)
    
    print(f"Лучшие параметры для {i}-го labels:", best_result["params"])
    print(f"Лучший LogLoss для {i}-го labels:", best_result["logloss"])
    print(f"Лучший ROC AUC для {i}-го labels:", best_result["roc_auc"])


0:	learn: 0.6816458	total: 225ms	remaining: 22.3s
50:	learn: 0.4799209	total: 3.35s	remaining: 3.22s
99:	learn: 0.4551581	total: 6.21s	remaining: 0us
0:	learn: 0.6732148	total: 50.4ms	remaining: 4.99s
50:	learn: 0.4741008	total: 2.48s	remaining: 2.39s
99:	learn: 0.4425643	total: 4.88s	remaining: 0us
0:	learn: 0.6827439	total: 28.7ms	remaining: 2.84s
50:	learn: 0.4748318	total: 2.42s	remaining: 2.32s
99:	learn: 0.4392514	total: 5.3s	remaining: 0us
Metrics on 1 iteration on 1 fold: LogLoss: 0.5058697205357288, roc auc: 0.7302188262469118
0:	learn: 0.6767931	total: 76.2ms	remaining: 7.54s
50:	learn: 0.4987913	total: 5.36s	remaining: 5.15s
99:	learn: 0.4816077	total: 9.75s	remaining: 0us
0:	learn: 0.6812582	total: 63.1ms	remaining: 6.25s
50:	learn: 0.4827455	total: 3.35s	remaining: 3.22s
99:	learn: 0.4521627	total: 6.13s	remaining: 0us
0:	learn: 0.6811118	total: 44.7ms	remaining: 4.43s
50:	learn: 0.5035570	total: 2.2s	remaining: 2.12s
99:	learn: 0.4722883	total: 4.65s	remaining: 0us
Metric

In [9]:
y = labels.iloc[:, 1]
X_train_, X_test, y_train, y_test = train_test_split(X_train, y, test_size=0.2, stratify=y)
pipeline = PipelineCatBoostClassifier()
pipeline.fit(X_train_, y_train)

In [12]:
pipeline.evaluate_metrics(X_test, y_test, calibrate=True)

{'logloss': 0.37654804419464016, 'roc_auc': 0.8366550413417453}

# Заключительный запуск модели на лучших параметрах

In [61]:
current_directory = os.getcwd()
file_name = "best_params.json"  

file_path = os.path.join(current_directory, file_name)

try:
    with open(file_path, 'r', encoding='utf-8') as file:
        best_params = json.load(file)
except FileNotFoundError:
    print(f"Файл {file_name} не найден в директории {current_directory}.")
except json.JSONDecodeError:
    print(f"Ошибка декодирования JSON в файле {file_name}.")

In [51]:
log_ = 0
roc_ = 0
for key in best_params:
    log_ += best_params[key]['logloss']
    roc_ += best_params[key]['roc_auc']
print(f'Mean metrics LogLoss: {log_/len(best_params)}, roc auc: {roc_/len(best_params)}')

Mean metrics LogLoss: 0.2602715349467792, roc auc: 0.6697010860992411


In [62]:
cat_features_train = [col for col in X_train.columns if X_train[col].dtype == 'object']
cat_features_test = [col for col in X_test.columns if X_test[col].dtype == 'object']
int_feat = set(cat_features_train).symmetric_difference(set(cat_features_test))
X_train.drop(columns=list(int_feat), inplace=True)
X_test.drop(columns=list(int_feat), inplace=True)

In [63]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Columns: 1263 entries, id to c_1377
dtypes: float64(343), int64(9), object(911)
memory usage: 77.1+ MB


In [64]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Columns: 1263 entries, id to c_1377
dtypes: float64(343), int64(9), object(911)
memory usage: 19.3+ MB


In [43]:
X_train.shape

(8000, 1265)

In [65]:
y_test = pd.DataFrame()
for i in range(1, labels.shape[1]):
    y = labels.iloc[:, i]
    i_ = str(i)
    params = best_params[i_]['params']
    pipeline = PipelineCatBoostClassifier()
    use_calibrate = params.pop('calibrate')
    pipeline.set_params(**params)
    pipeline.fit(X_train, y)
    #X_test = X_test.apply(pd.to_numeric, errors='coerce')
    #X_test.fillna(-999, inplace=True)
    y_pred_proba = pipeline.predict_proba(X_test, calibrate=use_calibrate)[:, 1]  # Берем вероятности для класса 1
    y_test[f'label_{i}'] = y_pred_proba
y_test

Unnamed: 0,label_1,label_2,label_3,label_4,label_5,label_6,label_7,label_8,label_9,label_10,label_11,label_12,label_13,label_14
0,0.991135,0.616797,0.198929,0.004201,0.047530,0.000000,0.009222,0.053760,0.005125,0.922532,0.917941,0.001918,0.000000,0.068242
1,0.007407,0.032242,0.043002,0.005503,0.028721,0.000000,0.006207,0.055582,0.001121,0.915617,0.878146,0.002734,0.004537,0.029694
2,0.428916,0.611226,0.402529,0.003050,0.025355,0.000000,0.047050,0.213806,0.004908,0.687729,0.656357,0.004688,0.001168,0.270554
3,0.200971,0.139814,0.232029,0.003049,0.025595,0.000000,0.051157,0.260321,0.002627,0.908421,0.777787,0.019499,0.008297,0.201275
4,0.978204,0.634903,0.465307,0.000197,0.012125,0.000000,0.105810,0.189153,0.003080,0.645984,0.543574,0.000414,0.000000,0.005513
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,0.214937,0.083854,0.064340,0.000450,0.048901,0.432842,0.146792,0.136639,0.001631,0.937368,0.818549,0.000000,0.000000,0.011436
1996,0.784689,0.577925,0.312704,0.002103,0.006184,0.198887,0.082090,0.167039,0.002843,0.888711,0.786365,0.021955,0.008297,0.074660
1997,0.216313,0.285012,0.284650,0.001878,0.009927,0.000000,0.065595,0.211264,0.003960,0.908229,0.803068,0.000677,0.000000,0.432582
1998,0.216313,0.236765,0.188298,0.000295,0.016466,0.000000,0.022662,0.164497,0.003267,0.895656,0.782291,0.000000,0.000000,0.003137


# Сохранение результатов

In [66]:
results_dir = Path(PROJECT_DIR, 'results')
y_test.to_csv(results_dir / 'y_catboost.csv', index=False)

Отмечу то, что можно было бы сделать сетку подробнее и увеличить немного качество, но подбор тогда был бы очень долгий.