# HW06: Деревья решений и ансамбли

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import joblib
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.inspection import permutation_importance
import warnings
# Notebook-wide setup: fixed seed, quiet output, output directory for figures.
RANDOM_STATE = 42

# Suppress every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Seed NumPy's global random generator.
np.random.seed(RANDOM_STATE)

# Figures are saved under artifacts/figures; create the tree if it is missing.
figures_dir = Path('artifacts') / 'figures'
figures_dir.mkdir(parents=True, exist_ok=True)
print('OK')

## Загрузка данных

In [None]:
# Read the homework dataset (path is relative to the working directory),
# print its shape and show the first rows as the cell output.
df = pd.read_csv('S06-hw-dataset-01.csv')
dataset_shape = df.shape
print(f'Размер: {dataset_shape}')
df.head()

In [None]:
# Print a concise summary of the DataFrame (columns, dtypes, non-null counts).
df.info()

In [None]:
# Total number of missing cells across the whole frame.
total_missing = df.isnull().sum().sum()
print('Пропуски:', total_missing)

# Relative frequency of each target class.
class_shares = df['target'].value_counts(normalize=True)
print('\nРаспределение target:')
print(class_shares)

In [None]:
# Bar chart of absolute class counts of the target; saved to disk, then shown.
class_counts = df['target'].value_counts()
fig, ax = plt.subplots(figsize=(8, 5))
class_counts.plot(kind='bar', ax=ax)
ax.set_title('Распределение target')
fig.savefig('artifacts/figures/target_distribution.png', dpi=100)
plt.show()

## Train/Test Split

In [None]:
# Every column except the identifier and the label is used as a feature.
non_feature_columns = ('id', 'target')
feature_cols = [name for name in df.columns if name not in non_feature_columns]
X = df[feature_cols]
y = df['target']

# Hold out 20% of the rows; stratify on the label so class proportions match.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=RANDOM_STATE,
)
print(f'Train: {X_train.shape}, Test: {X_test.shape}')

## Baseline

In [None]:
# Trivial baseline: always predict the most frequent class of the training set.
dummy_clf = DummyClassifier(strategy='most_frequent', random_state=RANDOM_STATE)
dummy_clf.fit(X_train, y_train)

# Score the baseline on the held-out test part.
y_pred_dummy = dummy_clf.predict(X_test)
dummy_acc, dummy_f1 = (
    score_fn(y_test, y_pred_dummy) for score_fn in (accuracy_score, f1_score)
)
print(f'Dummy - Acc: {dummy_acc:.4f}, F1: {dummy_f1:.4f}')

In [None]:
# Linear baseline: standardize features, then fit a logistic regression.
lr_pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('lr', LogisticRegression(random_state=RANDOM_STATE, max_iter=1000)),
    ]
)
lr_pipeline.fit(X_train, y_train)

# Hard labels feed accuracy/F1; second-column probabilities feed ROC-AUC.
y_pred_lr = lr_pipeline.predict(X_test)
y_proba_lr = lr_pipeline.predict_proba(X_test)[:, 1]

lr_acc = accuracy_score(y_test, y_pred_lr)
lr_f1 = f1_score(y_test, y_pred_lr)
lr_auc = roc_auc_score(y_test, y_proba_lr)
print(f'LogReg - Acc: {lr_acc:.4f}, F1: {lr_f1:.4f}, AUC: {lr_auc:.4f}')

## Decision Tree

In [None]:
# Hyper-parameter grid for a single decision tree.
dt_param_grid = {
    'max_depth': [3, 5, 7, 10, None],
    'min_samples_leaf': [1, 5, 10, 20],
    'min_samples_split': [2, 10, 20],
}
dt_clf = DecisionTreeClassifier(random_state=RANDOM_STATE)

# Exhaustive 5-fold cross-validated search on the training part, ranked by ROC-AUC.
dt_grid = GridSearchCV(
    estimator=dt_clf,
    param_grid=dt_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
dt_grid.fit(X_train, y_train)
print(f'Best params: {dt_grid.best_params_}')
print(f'Best CV AUC: {dt_grid.best_score_:.4f}')

# Evaluate the best estimator found by the search on the test part.
best_dt = dt_grid.best_estimator_
y_pred_dt = best_dt.predict(X_test)
y_proba_dt = best_dt.predict_proba(X_test)[:, 1]

dt_acc = accuracy_score(y_test, y_pred_dt)
dt_f1 = f1_score(y_test, y_pred_dt)
dt_auc = roc_auc_score(y_test, y_proba_dt)
print(f'DT Test - Acc: {dt_acc:.4f}, F1: {dt_f1:.4f}, AUC: {dt_auc:.4f}')

## Random Forest

In [None]:
# Hyper-parameter grid for the random forest.
rf_param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 15, None],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['sqrt', 'log2'],
}
rf_clf = RandomForestClassifier(random_state=RANDOM_STATE)

# Exhaustive 5-fold cross-validated search on the training part, ranked by ROC-AUC.
rf_grid = GridSearchCV(
    estimator=rf_clf,
    param_grid=rf_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
rf_grid.fit(X_train, y_train)
print(f'Best params: {rf_grid.best_params_}')
print(f'Best CV AUC: {rf_grid.best_score_:.4f}')

# Evaluate the best estimator found by the search on the test part.
best_rf = rf_grid.best_estimator_
y_pred_rf = best_rf.predict(X_test)
y_proba_rf = best_rf.predict_proba(X_test)[:, 1]

rf_acc = accuracy_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)
rf_auc = roc_auc_score(y_test, y_proba_rf)
print(f'RF Test - Acc: {rf_acc:.4f}, F1: {rf_f1:.4f}, AUC: {rf_auc:.4f}')

## Gradient Boosting

In [None]:
# Hyper-parameter grid for gradient boosting.
gb_param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_leaf': [1, 5, 10],
}
gb_clf = GradientBoostingClassifier(random_state=RANDOM_STATE)

# Exhaustive 5-fold cross-validated search on the training part, ranked by ROC-AUC.
gb_grid = GridSearchCV(
    estimator=gb_clf,
    param_grid=gb_param_grid,
    scoring='roc_auc',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
gb_grid.fit(X_train, y_train)
print(f'Best params: {gb_grid.best_params_}')
print(f'Best CV AUC: {gb_grid.best_score_:.4f}')

# Evaluate the best estimator found by the search on the test part.
best_gb = gb_grid.best_estimator_
y_pred_gb = best_gb.predict(X_test)
y_proba_gb = best_gb.predict_proba(X_test)[:, 1]

gb_acc = accuracy_score(y_test, y_pred_gb)
gb_f1 = f1_score(y_test, y_pred_gb)
gb_auc = roc_auc_score(y_test, y_proba_gb)
print(f'GB Test - Acc: {gb_acc:.4f}, F1: {gb_f1:.4f}, AUC: {gb_auc:.4f}')

## Stacking

In [None]:
# Stack the three tuned tree-based models under a logistic-regression meta-learner.
meta_learner = LogisticRegression(random_state=RANDOM_STATE)
estimators = [
    ('dt', best_dt),
    ('rf', best_rf),
    ('gb', best_gb),
]
stacking_clf = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    cv=5,
)
stacking_clf.fit(X_train, y_train)

# Evaluate the stacked ensemble on the test part.
y_pred_stack = stacking_clf.predict(X_test)
y_proba_stack = stacking_clf.predict_proba(X_test)[:, 1]

stack_acc = accuracy_score(y_test, y_pred_stack)
stack_f1 = f1_score(y_test, y_pred_stack)
stack_auc = roc_auc_score(y_test, y_proba_stack)
print(f'Stacking Test - Acc: {stack_acc:.4f}, F1: {stack_f1:.4f}, AUC: {stack_auc:.4f}')

## Сравнение моделей

In [None]:
# Collect test-set metrics of every model into one table and pick the winner.

# Score the dummy baseline the same way as the other models instead of
# hard-coding its ROC-AUC: a constant-score classifier sits at chance level
# (0.5), whereas a literal 0 would read as "worse than random".
dummy_auc = roc_auc_score(y_test, dummy_clf.predict_proba(X_test)[:, 1])

results = {
    'Model': ['Dummy', 'LogisticRegression', 'DecisionTree', 'RandomForest', 'GradientBoosting', 'Stacking'],
    'Accuracy': [dummy_acc, lr_acc, dt_acc, rf_acc, gb_acc, stack_acc],
    'F1-score': [dummy_f1, lr_f1, dt_f1, rf_f1, gb_f1, stack_f1],
    'ROC-AUC': [dummy_auc, lr_auc, dt_auc, rf_auc, gb_auc, stack_auc],
}

# Best model first: rows are ordered by descending test ROC-AUC.
results_df = pd.DataFrame(results).sort_values('ROC-AUC', ascending=False)
print(results_df.to_string(index=False))

# NOTE(review): the winner is chosen on the test set, so its reported test
# metrics are optimistically biased; selecting on CV scores would avoid this.
best_model_name = results_df.iloc[0]['Model']
print(f'\nЛучшая модель: {best_model_name}')

In [None]:
# One horizontal bar chart per metric, each sorted so the best model is on top.
metric_names = ['Accuracy', 'F1-score', 'ROC-AUC']
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(15, 4))
for panel, metric_name in zip(axes, metric_names):
    ranked = results_df.sort_values(metric_name, ascending=True)
    panel.barh(ranked['Model'], ranked[metric_name])
    panel.set_xlabel(metric_name)
    panel.set_title(metric_name)
    panel.grid(axis='x', alpha=0.3)
fig.tight_layout()
fig.savefig('artifacts/figures/models_comparison.png', dpi=100)
plt.show()

## ROC-кривые

In [None]:
# Test-set ROC curves of all probabilistic models on a single set of axes.
roc_inputs = [
    ('LogReg', y_proba_lr, lr_auc),
    ('DT', y_proba_dt, dt_auc),
    ('RF', y_proba_rf, rf_auc),
    ('GB', y_proba_gb, gb_auc),
    ('Stack', y_proba_stack, stack_auc),
]

fig, ax = plt.subplots(figsize=(10, 8))
for label, scores, auc_value in roc_inputs:
    fpr, tpr, _ = roc_curve(y_test, scores)
    ax.plot(fpr, tpr, linewidth=2, label=f'{label} (AUC={auc_value:.3f})')

# Diagonal reference line: performance of a random classifier.
ax.plot([0, 1], [0, 1], 'k--', linewidth=1, label='Random')
ax.set_xlabel('FPR')
ax.set_ylabel('TPR')
ax.set_title('ROC Curves')
ax.legend()
ax.grid(alpha=0.3)
fig.savefig('artifacts/figures/roc_curves.png', dpi=100)
plt.show()

## Confusion Matrix

In [None]:
# Confusion matrix and per-class report for the model ranked best by ROC-AUC.

# Map EVERY model that appears in the results table to its test predictions.
# Previously only four models were listed and anything else silently fell
# back to GradientBoosting, so the plot could be titled with one model's name
# while showing another model's predictions.
model_preds = {
    'Dummy': y_pred_dummy,
    'LogisticRegression': y_pred_lr,
    'DecisionTree': y_pred_dt,
    'RandomForest': y_pred_rf,
    'GradientBoosting': y_pred_gb,
    'Stacking': y_pred_stack,
}
# Direct indexing: an unknown name should fail loudly rather than be mislabeled.
best_pred = model_preds[best_model_name]

cm = confusion_matrix(y_test, best_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
fig, ax = plt.subplots(figsize=(8, 6))
disp.plot(ax=ax, cmap='Blues', values_format='d')
plt.title(f'Confusion Matrix: {best_model_name}')
plt.savefig('artifacts/figures/confusion_matrix.png', dpi=100)
plt.show()
print(classification_report(y_test, best_pred))

## Permutation Importance

In [None]:
# Permutation importance (drop in test ROC-AUC) for the best-ranked model.

# Map EVERY model that appears in the results table to its fitted estimator.
# Previously LogisticRegression/Dummy silently fell back to GradientBoosting,
# so `best_model` (also persisted later in the notebook) could disagree with
# `best_model_name`.
best_models = {
    'Dummy': dummy_clf,
    'LogisticRegression': lr_pipeline,
    'DecisionTree': best_dt,
    'RandomForest': best_rf,
    'GradientBoosting': best_gb,
    'Stacking': stacking_clf,
}
# Direct indexing: an unknown name should fail loudly rather than be mislabeled.
best_model = best_models[best_model_name]

perm_imp = permutation_importance(
    best_model,
    X_test,
    y_test,
    n_repeats=10,
    random_state=RANDOM_STATE,
    scoring='roc_auc',
)

# Feature indices ordered from most to least important.
sorted_idx = perm_imp.importances_mean.argsort()[::-1]
top_n = min(15, len(sorted_idx))
top_idx = sorted_idx[:top_n]

# barh draws from the bottom up, so reverse to put the top feature first.
plot_order = top_idx[::-1]
plt.figure(figsize=(10, 8))
plt.barh(
    range(top_n),
    perm_imp.importances_mean[plot_order],
    xerr=perm_imp.importances_std[plot_order],
)
plt.yticks(range(top_n), [feature_cols[i] for i in plot_order])
plt.xlabel('Permutation Importance')
plt.title(f'Top-{top_n} Features ({best_model_name})')
plt.grid(axis='x', alpha=0.3)
plt.savefig('artifacts/figures/feature_importance.png', dpi=100)
plt.show()

# Ranked text listing of the same top features.
for rank, idx in enumerate(top_idx, start=1):
    print(f'{rank}. {feature_cols[idx]}: {perm_imp.importances_mean[idx]:.4f}')

## Сохранение артефактов

In [None]:
def _save_json(payload, path):
    """Write *payload* to *path* as indented UTF-8 JSON, keeping non-ASCII text."""
    with open(path, 'w', encoding='utf-8') as fh:
        json.dump(payload, fh, indent=2, ensure_ascii=False)


# 1) Test metrics of every model, one record per model.
_save_json(results_df.to_dict('records'), 'artifacts/metrics_test.json')

# 2) Best hyper-parameters and CV score for each grid search.
fitted_searches = {
    'DecisionTree': dt_grid,
    'RandomForest': rf_grid,
    'GradientBoosting': gb_grid,
}
search_summaries = {
    model_name: {
        'best_params': search.best_params_,
        'best_cv_score': float(search.best_score_),
    }
    for model_name, search in fitted_searches.items()
}
_save_json(search_summaries, 'artifacts/search_summaries.json')

# 3) The selected estimator itself.
joblib.dump(best_model, 'artifacts/best_model.joblib')

# 4) Metadata describing the selected model; models without a grid search
#    get an empty best_params dict.
best_row = results_df.set_index('Model').loc[best_model_name]
best_model_meta = {
    'model_name': best_model_name,
    'best_params': search_summaries.get(best_model_name, {}).get('best_params', {}),
    'test_metrics': {
        'accuracy': float(best_row['Accuracy']),
        'f1_score': float(best_row['F1-score']),
        'roc_auc': float(best_row['ROC-AUC']),
    },
    'dataset': 'S06-hw-dataset-01.csv',
    'random_state': RANDOM_STATE,
}
_save_json(best_model_meta, 'artifacts/best_model_meta.json')
print('Все артефакты сохранены!')