# Ensemble Methods Pipeline - Методы ансамблирования

Пайплайны для:
- Voting & Averaging
- Stacking
- Blending
- Weighted ensembles
- Out-of-fold predictions

In [None]:
!pip install pandas numpy scikit-learn catboost xgboost lightgbm mlxtend -q

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, roc_auc_score, mean_squared_error
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
import xgboost as xgb
import lightgbm as lgb
import catboost as cb
from mlxtend.classifier import StackingClassifier
import warnings
warnings.filterwarnings('ignore')

print("✓ Библиотеки загружены!")

## 1. Загрузка данных

In [None]:
# === YOUR DATA ===
# Both tables are read from CSV files in the current working directory.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

TARGET_COL = 'target'
ID_COL = 'id'

# Data preparation: every training column except the target and the id
# is treated as a feature; the same columns are selected from the test table.
non_feature_cols = (TARGET_COL, ID_COL)
feature_cols = [col for col in train_df.columns if col not in non_feature_cols]
X, y = train_df[feature_cols], train_df[TARGET_COL]
X_test = test_df[feature_cols]

# Train/validation split: 20% hold-out, stratified by the target, fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_train, X_val, y_train, y_val = split_parts

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")

## 2. Базовые модели

In [None]:
# Base (level-1) models: 100 trees/iterations each, seeded for reproducibility.
models = dict(
    RandomForest=RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1),
    XGBoost=xgb.XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1),
    LightGBM=lgb.LGBMClassifier(n_estimators=100, learning_rate=0.1, random_state=42, n_jobs=-1),
    CatBoost=cb.CatBoostClassifier(iterations=100, learning_rate=0.1, random_state=42, verbose=0),
)

# Per-model outputs, keyed by model name:
#   base_predictions_val  - probabilities on the validation split
#   base_predictions_test - probabilities on the test table
#   base_scores           - validation ROC AUC
base_predictions_val, base_predictions_test, base_scores = {}, {}, {}

for name, model in models.items():
    print(f"\nОбучение {name}...")
    model.fit(X_train, y_train)

    # Keep only the second predict_proba column (probability of class index 1).
    base_predictions_val[name] = model.predict_proba(X_val)[:, 1]
    base_predictions_test[name] = model.predict_proba(X_test)[:, 1]

    # Evaluate on the hold-out split.
    base_scores[name] = roc_auc_score(y_val, base_predictions_val[name])
    print(f"{name} Validation AUC: {base_scores[name]:.6f}")

print("\n✓ Базовые модели обучены!")

## 3. Simple Averaging

In [None]:
# Simple averaging: element-wise unweighted mean of the base-model probabilities.
val_stack = np.asarray(list(base_predictions_val.values()))
test_stack = np.asarray(list(base_predictions_test.values()))

# Rows are models, so averaging over axis 0 yields one value per sample.
avg_val_pred = val_stack.mean(axis=0)
avg_test_pred = test_stack.mean(axis=0)

avg_score = roc_auc_score(y_val, avg_val_pred)
print(f"Simple Averaging Validation AUC: {avg_score:.6f}")

## 4. Weighted Averaging

In [None]:
# Performance-based weighted averaging: each model's weight is its validation
# AUC divided by the sum of all validation AUCs, so the weights sum to 1.
total_score = sum(base_scores.values())
weights = {}
for name, score in base_scores.items():
    weights[name] = score / total_score

print("\nВеса моделей:")
for name in weights:
    print(f"{name}: {weights[name]:.4f}")

# Apply the weights to the validation and test probabilities.
weighted_val_pred = sum(weights[name] * base_predictions_val[name] for name in models)
weighted_test_pred = sum(weights[name] * base_predictions_test[name] for name in models)

weighted_score = roc_auc_score(y_val, weighted_val_pred)
print(f"\nWeighted Averaging Validation AUC: {weighted_score:.6f}")

## 5. Voting Classifier

In [None]:
# Voting ensemble with soft voting (probabilities are averaged).
# Only RandomForest, XGBoost and LightGBM take part; CatBoost is not included.
voting_members = [
    (alias, models[key])
    for alias, key in (('rf', 'RandomForest'), ('xgb', 'XGBoost'), ('lgb', 'LightGBM'))
]
voting_clf = VotingClassifier(
    estimators=voting_members,
    voting='soft',  # use 'hard' to vote on predicted class labels instead
)
voting_clf.fit(X_train, y_train)

# Second predict_proba column for the validation split and the test table.
voting_val_pred = voting_clf.predict_proba(X_val)[:, 1]
voting_test_pred = voting_clf.predict_proba(X_test)[:, 1]

voting_score = roc_auc_score(y_val, voting_val_pred)
print(f"Voting Classifier Validation AUC: {voting_score:.6f}")

## 6. Stacking - Out-of-Fold predictions

In [None]:
def get_oof_predictions(models_dict, X_train, y_train, X_test, n_folds=5):
    """
    Build out-of-fold (OOF) predictions for stacking.

    For every model in ``models_dict`` a fresh, unfitted clone is trained on
    each stratified fold. The clone keeps the hyper-parameters of the model
    that was passed in, so any estimator in the dict is honoured (previously
    the estimator object was ignored and rebuilt from a hard-coded list of
    names, with unknown names silently becoming a RandomForest).

    Args:
        models_dict: mapping ``name -> estimator``. Each estimator must be
            cloneable with ``sklearn.base.clone`` and expose ``fit`` and
            ``predict_proba``.
        X_train: training features; must support ``.iloc`` row indexing.
        y_train: training target; must support ``.iloc`` row indexing.
        X_test: test features passed to ``predict_proba`` of every fold model.
        n_folds: number of stratified folds.

    Returns:
        Tuple ``(oof_train, oof_test)``:
        ``oof_train`` has shape ``(len(X_train), len(models_dict))`` and holds,
        for each row, the prediction of the fold model that did not see it;
        ``oof_test`` has shape ``(len(X_test), len(models_dict))`` and holds
        the test predictions averaged over the fold models.
    """
    # Local import keeps the function self-contained; scikit-learn is already
    # a dependency of this notebook.
    from sklearn.base import clone

    kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    # Materialise the splits once: every model is evaluated on the same folds
    # and the splitter is not re-run per model.
    folds = list(kf.split(X_train, y_train))

    # One column per model.
    oof_train = np.zeros((len(X_train), len(models_dict)))
    oof_test = np.zeros((len(X_test), len(models_dict)))

    for model_idx, (name, model) in enumerate(models_dict.items()):
        print(f"\nProcessing {name}...")
        # Test predictions of each fold model, averaged after the fold loop.
        oof_test_skf = np.zeros((len(X_test), n_folds))

        for fold_idx, (train_idx, val_idx) in enumerate(folds):
            X_tr = X_train.iloc[train_idx]
            y_tr = y_train.iloc[train_idx]
            X_holdout = X_train.iloc[val_idx]

            # Unfitted copy with identical hyper-parameters; the caller's
            # (possibly already fitted) estimator is left untouched.
            model_fold = clone(model)
            model_fold.fit(X_tr, y_tr)

            # OOF predictions for the rows held out in this fold.
            oof_train[val_idx, model_idx] = model_fold.predict_proba(X_holdout)[:, 1]

            # Test predictions of this fold's model.
            oof_test_skf[:, fold_idx] = model_fold.predict_proba(X_test)[:, 1]

        # Average the per-fold test predictions.
        oof_test[:, model_idx] = oof_test_skf.mean(axis=1)

        # OOF quality of this model over the whole training set.
        oof_score = roc_auc_score(y_train, oof_train[:, model_idx])
        print(f"{name} OOF AUC: {oof_score:.6f}")

    return oof_train, oof_test

# Generate the level-1 feature matrices (one column per base model).
print("\nГенерация OOF предсказаний...")
oof_train, oof_test = get_oof_predictions(models, X_train, y_train, X_test, n_folds=5)

print("\n✓ OOF predictions готовы!")
for split_label, oof_matrix in (("train", oof_train), ("test", oof_test)):
    print(f"OOF {split_label} shape: {oof_matrix.shape}")

In [None]:
# Level-2 meta-model trained on the out-of-fold predictions of the base models.
from sklearn.model_selection import cross_val_predict

meta_model = LogisticRegression(random_state=42, max_iter=1000)

# Score the meta-model with cross-validated predictions. Predicting on the very
# rows it was fitted on (as before) gives an in-sample AUC, which is optimistic
# and not comparable with the hold-out scores of the other methods.
# cross_val_predict fits clones, so `meta_model` itself stays unfitted here.
meta_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
stacking_train_pred = cross_val_predict(
    meta_model, oof_train, y_train, cv=meta_cv, method='predict_proba'
)[:, 1]
stacking_score = roc_auc_score(y_train, stacking_train_pred)

# Final fit on all OOF rows, then predict the test set from the averaged
# fold predictions of the base models.
meta_model.fit(oof_train, y_train)
stacking_test_pred = meta_model.predict_proba(oof_test)[:, 1]

print(f"\nStacking Validation AUC: {stacking_score:.6f}")

# Meta-model coefficients, one per base model (column order of oof_train).
print("\nВеса в мета-модели:")
for name, coef in zip(models.keys(), meta_model.coef_[0]):
    print(f"{name}: {coef:.4f}")

## 7. Blending (стекинг на hold-out выборке)

In [None]:
# Blending: the meta-model is trained on the hold-out (validation) set,
# reusing the base-model predictions already computed for X_val.
from sklearn.model_selection import cross_val_predict

# Meta-features: one column per base model.
blend_train = np.column_stack(list(base_predictions_val.values()))
blend_test = np.column_stack(list(base_predictions_test.values()))

blend_meta = LogisticRegression(random_state=42, max_iter=1000)

# Score with cross-validated predictions on the validation set. Fitting on
# (blend_train, y_val) and scoring on the same rows (as before) reports an
# in-sample AUC that is not comparable with the other methods.
# cross_val_predict fits clones, so `blend_meta` itself stays unfitted here.
blend_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
blend_val_pred = cross_val_predict(
    blend_meta, blend_train, y_val, cv=blend_cv, method='predict_proba'
)[:, 1]
blend_score = roc_auc_score(y_val, blend_val_pred)

# Final fit on the whole validation set, then predict the test set.
blend_meta.fit(blend_train, y_val)
blend_test_pred = blend_meta.predict_proba(blend_test)[:, 1]

print(f"Blending Validation AUC: {blend_score:.6f}")

## 8. Сравнение всех методов

In [None]:
import matplotlib.pyplot as plt

# Collect (method, AUC) rows: base models first, then every ensemble variant.
result_rows = list(base_scores.items())
result_rows += [
    ('Simple Averaging', avg_score),
    ('Weighted Averaging', weighted_score),
    ('Voting', voting_score),
    ('Stacking (OOF)', stacking_score),
    ('Blending', blend_score),
]
ensemble_results = pd.DataFrame(result_rows, columns=['Method', 'Validation AUC'])

# Best method first.
ensemble_results = ensemble_results.sort_values('Validation AUC', ascending=False)

separator = "=" * 60
print("\n" + separator)
print("СРАВНЕНИЕ ВСЕХ МЕТОДОВ АНСАМБЛИРОВАНИЯ")
print(separator)
print(ensemble_results.to_string(index=False))
print(separator)

# Visualisation: horizontal bar chart of the scores.
plt.figure(figsize=(12, 6))
plt.barh(ensemble_results['Method'], ensemble_results['Validation AUC'])
plt.xlabel('Validation AUC')
plt.title('Сравнение методов ансамблирования')
plt.tight_layout()
plt.show()

## 9. Оптимизация весов

In [None]:
from scipy.optimize import minimize

def ensemble_score_function(weights, predictions, y_true):
    """Objective for weight optimisation.

    Args:
        weights: one weight per model, in the same order as ``predictions``.
        predictions: sequence of per-model prediction arrays.
        y_true: true labels for the predicted rows.

    Returns:
        Negative ROC AUC of the weighted sum of predictions (negated because
        the optimiser minimises).
    """
    weighted_pred = sum(w * p for w, p in zip(weights, predictions))
    return -roc_auc_score(y_true, weighted_pred)

# Start from equal weights.
initial_weights = [1.0 / len(models)] * len(models)

# Every weight is kept within [0, 1].
bounds = [(0, 1)] * len(models)

# AUC depends only on the ranking of the predictions, so it is piecewise
# constant in the weights. A gradient-based solver such as SLSQP sees a zero
# finite-difference gradient and stops at the initial point. Nelder-Mead is
# derivative-free and actually explores the weight space.
# NOTE: `bounds` with Nelder-Mead requires SciPy >= 1.7.
# The sum-to-one equality constraint is not passed to the solver (Nelder-Mead
# does not accept constraints); AUC is invariant to positive rescaling of the
# weights, so they are normalised after the search instead.
predictions_list = list(base_predictions_val.values())
result = minimize(
    ensemble_score_function,
    initial_weights,
    args=(predictions_list, y_val),
    method='Nelder-Mead',
    bounds=bounds,
)

# Normalise to non-negative weights that sum to 1; fall back to the equal
# weights if the search collapsed to all zeros.
raw_weights = np.clip(result.x, 0, None)
weight_sum = raw_weights.sum()
optimal_weights = raw_weights / weight_sum if weight_sum > 0 else np.asarray(initial_weights)

print("\nОптимальные веса:")
for name, weight in zip(models.keys(), optimal_weights):
    print(f"{name}: {weight:.4f}")

# Apply the optimised weights to the validation and test predictions.
optimal_val_pred = sum(w * p for w, p in zip(optimal_weights, predictions_list))
optimal_test_pred = sum(w * p for w, p in zip(optimal_weights, base_predictions_test.values()))

optimal_score = roc_auc_score(y_val, optimal_val_pred)
print(f"\nOptimized Weights Validation AUC: {optimal_score:.6f}")

## 10. Submission

In [None]:
# Pick the best method: ensemble_results is sorted by score, best first.
best_method = ensemble_results.iloc[0]['Method']
print(f"\nЛучший метод: {best_method}")

# Test-set predictions for every method listed in ensemble_results, keyed by
# the same names. Previously the best method was only printed while the
# submission always used the stacking predictions.
test_predictions_by_method = {
    **base_predictions_test,
    'Simple Averaging': avg_test_pred,
    'Weighted Averaging': weighted_test_pred,
    'Voting': voting_test_pred,
    'Stacking (OOF)': stacking_test_pred,
    'Blending': blend_test_pred,
}

# Use the predictions of the best method.
# (To force a specific method, assign its array here instead,
#  e.g. stacking_test_pred or blend_test_pred.)
final_predictions = test_predictions_by_method[best_method]

submission = pd.DataFrame({
    ID_COL: test_df[ID_COL],
    'prediction': final_predictions
})

submission.to_csv('ensemble_submission.csv', index=False)
print("\n✓ Submission сохранен!")
print(submission.head())