In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import optuna
from optuna.samplers import TPESampler

from sklearn.exceptions import ConvergenceWarning
import warnings



# Отключаем предупреждения о сходимости
warnings.filterwarnings("ignore", category=ConvergenceWarning)

In [None]:
df = pd.read_csv('data/clustered_df.csv', index_col=0)
df

Модели бустинга для каждого кластера, первый таргет

In [None]:
results = {}
for i in range(6):
    print(f"\n=== Сегмент {i + 1} ===")
    df_seg = df[df['cluster'] == i].copy()
    X = df_seg.drop(columns=['cluster', 'Target1_Yes', 'Target2_Yes'])
    y = df_seg['Target1_Yes']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 600),
            'max_depth': trial.suggest_int('max_depth', 3, 14),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.4),
            'subsample': trial.suggest_float('subsample', 0.45, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.45, 1.0),
            'use_label_encoder': False,
            'eval_metric': 'logloss',
            'verbosity': 0,
        }
        model = XGBClassifier(**params)
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        scores = cross_val_score(
            model, X_train, y_train,
            cv=cv, scoring='f1', n_jobs=-1
        )
        return scores.mean()

    study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=60, show_progress_bar=True)
    best = study.best_params
    print("Лучшие гиперпараметры XGB по F1:", best)

    xgb = XGBClassifier(**best, use_label_encoder=False, eval_metric='logloss', verbosity=0)
    xgb.fit(X_train, y_train)
    xgb_pred = xgb.predict(X_test)

    cb = CatBoostClassifier(
        iterations=best['n_estimators'],
        max_depth=best['max_depth'],
        learning_rate=best['learning_rate'],
        subsample=best['subsample'],
        colsample_bylevel=best['colsample_bytree'],
        auto_class_weights='Balanced',
        verbose=False
    )
    cb.fit(X_train, y_train)
    cb_pred = cb.predict(X_test)


    lgb = LGBMClassifier(
        n_estimators=best['n_estimators'],
        max_depth=best['max_depth'],
        learning_rate=best['learning_rate'],
        subsample=best['subsample'],
        colsample_bytree=best['colsample_bytree'],
        class_weight='balanced'
    )
    lgb.fit(X_train, y_train)
    lgb_pred = lgb.predict(X_test)

    for name, pred in [('XGBoost', xgb_pred), ('CatBoost', cb_pred), ('LightGBM', lgb_pred)]:
        print(f"\n{name}:")
        print(f"\tF1      = {f1_score(y_test, pred):.4f}")
        print(f"\tAccuracy= {accuracy_score(y_test, pred):.4f}")
        print(f"\tROC AUC = {roc_auc_score(y_test, pred):.4f}")
        results[f'{i}_{name}'] =  f'f1 - {f1_score(y_test, pred):.4f}, Accuracy - {accuracy_score(y_test, pred):.4f}, ROC_AUC - {roc_auc_score(y_test, pred):.4f}'


In [None]:
results

Модели бустинга для каждого кластера, второй таргет

In [None]:
results_2 = {}

for i in range(6):
    print(f"\n=== Сегмент {i + 1} ===")
    df_seg = df[df['cluster'] == i].copy()
    X = df_seg.drop(columns=['cluster', 'Target1_Yes', 'Target2_Yes'])
    y = df_seg['Target2_Yes']

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    def objective(trial):
        params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 600),
            'max_depth': trial.suggest_int('max_depth', 3, 14),
            'learning_rate': trial.suggest_loguniform('learning_rate', 1e-3, 0.4),
            'subsample': trial.suggest_float('subsample', 0.45, 1.0),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.45, 1.0),
            'use_label_encoder': False,
            'eval_metric': 'logloss',
            'verbosity': 0,
        }
        model = XGBClassifier(**params)
        cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
        scores = cross_val_score(
            model, X_train, y_train,
            cv=cv, scoring='f1', n_jobs=-1
        )
        return scores.mean()

    study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=42))
    study.optimize(objective, n_trials=60, show_progress_bar=True)
    best = study.best_params
    print("Лучшие гиперпараметры XGB по F1:", best)

    xgb = XGBClassifier(**best, use_label_encoder=False, eval_metric='logloss', verbosity=0)
    xgb.fit(X_train, y_train)
    xgb_pred = xgb.predict(X_test)

    cb = CatBoostClassifier(
        iterations=best['n_estimators'],
        max_depth=best['max_depth'],
        learning_rate=best['learning_rate'],
        subsample=best['subsample'],
        colsample_bylevel=best['colsample_bytree'],
        auto_class_weights='Balanced',
        verbose=False
    )
    cb.fit(X_train, y_train)
    cb_pred = cb.predict(X_test)


    lgb = LGBMClassifier(
        n_estimators=best['n_estimators'],
        max_depth=best['max_depth'],
        learning_rate=best['learning_rate'],
        subsample=best['subsample'],
        colsample_bytree=best['colsample_bytree'],
        class_weight='balanced'
    )
    lgb.fit(X_train, y_train)
    lgb_pred = lgb.predict(X_test)

    for name, pred in [('XGBoost', xgb_pred), ('CatBoost', cb_pred), ('LightGBM', lgb_pred)]:
        print(f"\n{name}:")
        print(f"\tF1      = {f1_score(y_test, pred):.4f}")
        print(f"\tAccuracy= {accuracy_score(y_test, pred):.4f}")
        print(f"\tROC AUC = {roc_auc_score(y_test, pred):.4f}")
        results_2[f'{i}_{name}'] =  f'f1 - {f1_score(y_test, pred):.4f}, Accuracy - {accuracy_score(y_test, pred):.4f}, ROC_AUC - {roc_auc_score(y_test, pred):.4f}'



In [37]:
results_2

{'0_XGBoost': 'f1 - 0.7550, Accuracy - 0.9630, ROC_AUC - 0.8169',
 '0_CatBoost': 'f1 - 0.6133, Accuracy - 0.8931, ROC_AUC - 0.9184',
 '0_LightGBM': 'f1 - 0.6232, Accuracy - 0.8990, ROC_AUC - 0.9156',
 '1_XGBoost': 'f1 - 0.7942, Accuracy - 0.9490, ROC_AUC - 0.8627',
 '1_CatBoost': 'f1 - 0.7166, Accuracy - 0.9038, ROC_AUC - 0.9112',
 '1_LightGBM': 'f1 - 0.7221, Accuracy - 0.9065, ROC_AUC - 0.9122',
 '2_XGBoost': 'f1 - 0.6795, Accuracy - 0.9774, ROC_AUC - 0.7833',
 '2_CatBoost': 'f1 - 0.4955, Accuracy - 0.9244, ROC_AUC - 0.9063',
 '2_LightGBM': 'f1 - 0.4986, Accuracy - 0.9269, ROC_AUC - 0.8987',
 '3_XGBoost': 'f1 - 0.8846, Accuracy - 0.9523, ROC_AUC - 0.9312',
 '3_CatBoost': 'f1 - 0.8575, Accuracy - 0.9337, ROC_AUC - 0.9499',
 '3_LightGBM': 'f1 - 0.8579, Accuracy - 0.9337, ROC_AUC - 0.9509',
 '4_XGBoost': 'f1 - 0.8057, Accuracy - 0.9816, ROC_AUC - 0.8514',
 '4_CatBoost': 'f1 - 0.7072, Accuracy - 0.9602, ROC_AUC - 0.9273',
 '4_LightGBM': 'f1 - 0.7422, Accuracy - 0.9672, ROC_AUC - 0.9235',


Рандомный лес и логит для каждого кластера, первый таргет 

In [None]:
results = []

for cluster_id in sorted(df['cluster'].unique()):
    print(f"\n=== Кластер {cluster_id} ===")
    df_seg = df[df['cluster'] == cluster_id]
    
    X = df_seg.drop(columns=['cluster', 'Target1_Yes', 'Target2_Yes'])
    y = df_seg['Target1_Yes']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    rf = RandomForestClassifier(random_state=42)
    param_grid_rf = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10],
        'max_features': ['sqrt'],
        'min_samples_split': [2]
    }
    gs_rf = GridSearchCV(rf, param_grid_rf, cv=cv, scoring='f1', n_jobs=-1)
    gs_rf.fit(X_train, y_train)
    best_rf = gs_rf.best_estimator_
    rf_pred = best_rf.predict(X_test)
    print("RF best params:", gs_rf.best_params_)
    print("RF F1      =", f1_score(y_test, rf_pred))
    print("RF Accuracy=", accuracy_score(y_test, rf_pred))
    print("RF ROC AUC =", roc_auc_score(y_test, rf_pred))
    

    lr = LogisticRegression(max_iter=1000, random_state=42)
    param_grid_lr = {
        'C': [0.1, 1],
        'penalty': ['l2'],
        'solver': ['liblinear']
    }
    gs_lr = GridSearchCV(lr, param_grid_lr, cv=cv, scoring='f1', n_jobs=-1)
    gs_lr.fit(X_train, y_train)
    best_lr = gs_lr.best_estimator_
    lr_pred = best_lr.predict(X_test)
    print("LR best params:", gs_lr.best_params_)
    print("LR F1      =", f1_score(y_test, lr_pred))
    print("LR Accuracy=", accuracy_score(y_test, lr_pred))
    print("LR ROC AUC =", roc_auc_score(y_test, lr_pred))
    
    results.append({
        'cluster': cluster_id,
        'rf_params': gs_rf.best_params_,
        'rf_f1': f1_score(y_test, rf_pred),
        'lr_params': gs_lr.best_params_,
        'lr_f1': f1_score(y_test, lr_pred)
    })

results_df = pd.DataFrame(results)
print("\n=== Сводная таблица результатов ===")
print(results_df)



=== Кластер 0 ===
RF best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 100}
RF F1      = 0.7237665550865958
RF Accuracy= 0.861323201695101
RF ROC AUC = 0.787251927377479
LR best params: {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}
LR F1      = 0.6123024830699775
LR Accuracy= 0.7992182077229386
LR ROC AUC = 0.7211572967763119

=== Кластер 1 ===
RF best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
RF F1      = 0.7802435192458759
RF Accuracy= 0.8612265145408321
RF ROC AUC = 0.8254996285168799
LR best params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
LR F1      = 0.6710422683118928
LR Accuracy= 0.8057605258262541
LR ROC AUC = 0.7506382585572594

=== Кластер 2 ===
RF best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
RF F1      = 0.7182206129574551
RF Accuracy= 0.9471565417727659
RF ROC AUC = 0.7914797514994889
LR best params: {'

In [40]:
results_df

Unnamed: 0,cluster,rf_params,rf_f1,lr_params,lr_f1
0,0,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.723767,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",0.612302
1,1,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.780244,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline...",0.671042
2,2,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.718221,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",0.31249
3,3,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.876555,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline...",0.834435
4,4,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.776898,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",0.274038
5,5,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.675252,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline...",0.351662


Рандомный лес и логит для каждого кластера, второй таргет 

In [42]:
results = []

for cluster_id in sorted(df['cluster'].unique()):
    print(f"\n=== Кластер {cluster_id} ===")
    df_seg = df[df['cluster'] == cluster_id]
    
    X = df_seg.drop(columns=['cluster', 'Target1_Yes', 'Target2_Yes'])
    y = df_seg['Target2_Yes']
    
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    # 1) Random Forest (сокращённая сетка)
    rf = RandomForestClassifier(random_state=42)
    param_grid_rf = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10],
        'max_features': ['sqrt'],
        'min_samples_split': [2]
    }
    gs_rf = GridSearchCV(rf, param_grid_rf, cv=cv, scoring='f1', n_jobs=-1)
    gs_rf.fit(X_train, y_train)
    best_rf = gs_rf.best_estimator_
    rf_pred = best_rf.predict(X_test)
    print("RF best params:", gs_rf.best_params_)
    print("RF F1      =", f1_score(y_test, rf_pred))
    print("RF Accuracy=", accuracy_score(y_test, rf_pred))
    print("RF ROC AUC =", roc_auc_score(y_test, rf_pred))
    
    # 2) Logistic Regression (сокращённая сетка)
    lr = LogisticRegression(max_iter=1000, random_state=42)
    param_grid_lr = {
        'C': [0.1, 1],
        'penalty': ['l2'],
        'solver': ['liblinear']
    }
    gs_lr = GridSearchCV(lr, param_grid_lr, cv=cv, scoring='f1', n_jobs=-1)
    gs_lr.fit(X_train, y_train)
    best_lr = gs_lr.best_estimator_
    lr_pred = best_lr.predict(X_test)
    print("LR best params:", gs_lr.best_params_)
    print("LR F1      =", f1_score(y_test, lr_pred))
    print("LR Accuracy=", accuracy_score(y_test, lr_pred))
    print("LR ROC AUC =", roc_auc_score(y_test, lr_pred))
    
    results.append({
        'cluster': cluster_id,
        'rf_params': gs_rf.best_params_,
        'rf_f1': f1_score(y_test, rf_pred),
        'lr_params': gs_lr.best_params_,
        'lr_f1': f1_score(y_test, lr_pred)
    })

results_df = pd.DataFrame(results)
print("\n=== Сводная таблица результатов ===")
print(results_df)



=== Кластер 0 ===
RF best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
RF F1      = 0.688824914943732
RF Accuracy= 0.9565630365688818
RF ROC AUC = 0.7680072945627119
LR best params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
LR F1      = 0.010462776659959759
LR Accuracy= 0.9101669528367369
LR ROC AUC = 0.502097979538485

=== Кластер 1 ===
RF best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
RF F1      = 0.7644828037138747
RF Accuracy= 0.9441619644075153
RF ROC AUC = 0.8347876271944257
LR best params: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
LR F1      = 0.009092975676290065
LR Accuracy= 0.8648539716004217
LR ROC AUC = 0.5002219623856353

=== Кластер 2 ===
RF best params: {'max_depth': None, 'max_features': 'sqrt', 'min_samples_split': 2, 'n_estimators': 200}
RF F1      = 0.4755186721991701
RF Accuracy= 0.9693464290044865
RF ROC AUC = 0.664502491824895
LR best param

In [47]:
results_df

Unnamed: 0,cluster,rf_params,rf_f1,lr_params,lr_f1
0,0,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.688825,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline...",0.010463
1,1,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.764483,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline...",0.009093
2,2,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.475519,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline...",0.001156
3,3,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.834754,"{'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}",0.018735
4,4,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.332717,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline...",0.0
5,5,"{'max_depth': None, 'max_features': 'sqrt', 'm...",0.605217,"{'C': 0.1, 'penalty': 'l2', 'solver': 'libline...",0.0
