# 🤖 04_Hyperparameter_Tuning — Оптимізація гіперпараметрів
# Project: Прогнозування Відтоку Клієнтів
# Team Lead: Vitalii Subbotin
# Date: 2025-10-15

In [1]:
# 1. Імпорт бібліотек і налаштування середовища
import os
import warnings
import time
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")


from pathlib import Path
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score


# Models
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


In [2]:
# 2. Налаштування шляхів і констант
BASE_DIR = Path("..")
DATA_DIR = BASE_DIR / "data"
MODELS_DIR = BASE_DIR / "models"
MODELS_DIR.mkdir(parents=True, exist_ok=True)


DATA_PATH = DATA_DIR / "df1_mean.csv"
RANDOM_STATE = 42

print("DATA_PATH ->", DATA_PATH)


DATA_PATH -> ..\data\df1_mean.csv


In [4]:
# 3. Завантаження даних
df = pd.read_csv(DATA_PATH)
print("Розмір датасету:", df.shape)


if 'id' in df.columns:
    df = df.drop(columns=['id'])
if 'churn' not in df.columns and 'Churn' in df.columns:
    df.rename(columns={'Churn': 'churn'}, inplace=True)


X = df.drop(columns=['churn'])
y = df['churn']


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)


# Масштабування
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

Розмір датасету: (71893, 12)


In [5]:
# 4. GridSearchCV для LightGBM
param_grid_lgb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [3, 5, 7, -1],
    'num_leaves': [15, 31, 63],
    'subsample': [0.8, 1.0]
}


grid_lgb = GridSearchCV(
    estimator=LGBMClassifier(random_state=RANDOM_STATE, force_row_wise=True, verbose=-1),
    param_grid=param_grid_lgb,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2
)


grid_lgb.fit(X_train_scaled, y_train)
print("\n✅ Найкращі параметри LightGBM:", grid_lgb.best_params_)
print("ROC-AUC:", grid_lgb.best_score_)

Fitting 3 folds for each of 216 candidates, totalling 648 fits


  File "C:\Users\User\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
               ^^^^^^^^^^^^^^^
  File "C:\Users\User\anaconda3\Lib\subprocess.py", line 548, in run
    with Popen(*popenargs, **kwargs) as process:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\User\anaconda3\Lib\subprocess.py", line 1026, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\User\anaconda3\Lib\subprocess.py", line 1538, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,
                       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^



✅ Найкращі параметри LightGBM: {'learning_rate': 0.05, 'max_depth': -1, 'n_estimators': 300, 'num_leaves': 63, 'subsample': 0.8}
ROC-AUC: 0.9819942635691806


In [6]:
# 5. GridSearchCV для XGBoost
param_grid_xgb = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1],
    'subsample': [0.8, 1.0]
}


grid_xgb = GridSearchCV(
    estimator=XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=RANDOM_STATE),
    param_grid=param_grid_xgb,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2
)


grid_xgb.fit(X_train_scaled, y_train)
print("\n✅ Найкращі параметри XGBoost:", grid_xgb.best_params_)
print("ROC-AUC:", grid_xgb.best_score_)

Fitting 3 folds for each of 24 candidates, totalling 72 fits

✅ Найкращі параметри XGBoost: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 200, 'subsample': 1.0}
ROC-AUC: 0.9813205786271485


In [7]:
# 6. GridSearchCV для RandomForest
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, None],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}


grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1),
    param_grid=param_grid_rf,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
    verbose=2
)


grid_rf.fit(X_train_scaled, y_train)
print("\n✅ Найкращі параметри RandomForest:", grid_rf.best_params_)
print("ROC-AUC:", grid_rf.best_score_)

Fitting 3 folds for each of 36 candidates, totalling 108 fits

✅ Найкращі параметри RandomForest: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 300}
ROC-AUC: 0.9807987269880069


In [8]:
# 7. Порівняння результатів
tuning_results = pd.DataFrame([
    {"model": "LightGBM", "best_params": grid_lgb.best_params_, "best_score": grid_lgb.best_score_},
    {"model": "XGBoost", "best_params": grid_xgb.best_params_, "best_score": grid_xgb.best_score_},
    {"model": "RandomForest", "best_params": grid_rf.best_params_, "best_score": grid_rf.best_score_}
])


print("\n📊 Результати оптимізації:")
display(tuning_results)


tuning_results.to_csv(MODELS_DIR / "tuning_summary.csv", index=False)
print("✅ Збережено tuning_summary.csv у папці models")


📊 Результати оптимізації:


Unnamed: 0,model,best_params,best_score
0,LightGBM,"{'learning_rate': 0.05, 'max_depth': -1, 'n_es...",0.981994
1,XGBoost,"{'learning_rate': 0.1, 'max_depth': 7, 'n_esti...",0.981321
2,RandomForest,"{'max_depth': None, 'min_samples_leaf': 2, 'mi...",0.980799


✅ Збережено tuning_summary.csv у папці models


In [9]:
# 8. Висновки
print("\n📌 Висновок:")
print(" - Проведено GridSearchCV для трьох моделей (LightGBM, XGBoost, RandomForest)")
print(" - Найкраща модель за ROC-AUC:", tuning_results.sort_values(by='best_score', ascending=False).iloc[0]['model'])
print(" - Результати збережено для подальшого використання")


📌 Висновок:
 - Проведено GridSearchCV для трьох моделей (LightGBM, XGBoost, RandomForest)
 - Найкраща модель за ROC-AUC: LightGBM
 - Результати збережено для подальшого використання
