In [22]:
import sys
import os
import pathlib

# Root Í≤ΩÎ°ú Ïû°Í∏∞
sys.path.append(os.path.abspath("../../"))

import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
import json
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import StandardScaler
from IPython.display import display, Markdown

# [MPS Acceleration] Í∞ÄÏÜç ÏßÄÏõê ÌôïÏù∏
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"üöÄ Using Device: {device}")

# [Import Model] 'Advanced' uses the proven MLP_enhance architecture for Ensemble
from models.model_definitions import MLP_enhance
print("‚úÖ MLP_enhance Model Imported.")


üöÄ Using Device: mps


# [MLP_advanced] Final Optimized Ensemble Model
- **Goal**: Validate the maximum performance using the best hyperparameters and Ensemble strategy.
- **Strategy**: 
    1. **Data**: Same clean pipeline as LG/DL1 (No anchors, Base Rate ~0.86 aligned).
    2. **Model**: 5-Model Ensemble of the optimized `MLP_enhance`.
    3. **Hyperparams**: Confirmed Best Params (lr=0.0006, hidden=1024, selu)


In [23]:
# 2. Îç∞Ïù¥ÌÑ∞ Î°úÎìú Î∞è Ï†ÑÏ≤òÎ¶¨ (Standardized)
print('Loading Data...')
base_path = "/Users/gimdabin/SKN23-2nd-3Team/data/processed/"

# [Correct] No anchors merge, match LG pipeline
features = pd.read_parquet(base_path + "features_ml_clean.parquet")
labels = pd.read_parquet(base_path + "labels.parquet")

features['user_id'] = features['user_id'].astype(str)
labels['user_id'] = labels['user_id'].astype(str)

data = features.merge(labels, on=['user_id', 'anchor_time'], how='inner')
data['target'] = (data['label'] == 'm2').astype(int)

feature_cols = [c for c in features.columns if c not in ['user_id', 'anchor_time']]
X = data[feature_cols].copy().fillna(0)
y = data['target'].values


Loading Data...


In [24]:
# 3. Îç∞Ïù¥ÌÑ∞ ÎÇòÎàÑÍ∏∞ (Time-based Split)
assert "split" in data.columns, "Missing 'split' column!"

split = data["split"].astype(str).str.lower().values
train_mask = split == "train"
val_mask   = np.isin(split, ["val", "valid", "validation"])
test_mask  = split == "test"

X_train, y_train = X.loc[train_mask].values, y[train_mask]
X_val, y_val     = X.loc[val_mask].values, y[val_mask]
X_test, y_test   = X.loc[test_mask].values, y[test_mask]

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")
print(f"Train Base Rate: {y_train.mean():.4f}")
print(f"Val   Base Rate: {y_val.mean():.4f}")
print(f"Test  Base Rate: {y_test.mean():.4f}")


Train: 574092, Val: 137615, Test: 101833
Train Base Rate: 0.8041
Val   Base Rate: 0.8417
Test  Base Rate: 0.8671


In [None]:
# 4. Scaling & DataLoader
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader, TensorDataset
import torch

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

train_loader = DataLoader(TensorDataset(torch.FloatTensor(X_train_scaled), torch.FloatTensor(y_train)), batch_size=256, shuffle=True)
val_loader = DataLoader(TensorDataset(torch.FloatTensor(X_val_scaled), torch.FloatTensor(y_val)), batch_size=256)
test_loader = DataLoader(TensorDataset(torch.FloatTensor(X_test_scaled), torch.FloatTensor(y_test)), batch_size=256)

print("‚úÖ SMOTE Removed. Using raw training data.")


‚úÖ SMOTE Removed. Using raw training data.


In [26]:
# 5. Best Hyperparameters (Optimized for Speed)
# [Optimization] Epochs reduced to 10 for faster Ensemble training
best_params = {
    'lr': 0.0006218704727769079,
    'weight_decay': 0.0005829384542994739,
    'hidden_dim': 1024,
    'dropout_rate': 0.2628094190643375,
    'epochs': 10,   # Reduced from 15->10
    'activation': 'selu',
    'optimizer': 'AdamW'
}
print(f"‚úÖ Applied Fast Params: {best_params}")


‚úÖ Applied Fast Params: {'lr': 0.0006218704727769079, 'weight_decay': 0.0005829384542994739, 'hidden_dim': 1024, 'dropout_rate': 0.2628094190643375, 'epochs': 10, 'activation': 'selu', 'optimizer': 'AdamW'}


In [27]:
# 6. Ensemble Training (5 Models)
ensemble_models = []
criterion = nn.BCEWithLogitsLoss()

LR = best_params['lr']
WD = best_params['weight_decay']
HIDDEN = best_params['hidden_dim']
DROP = best_params['dropout_rate']
EPOCHS = best_params['epochs']
ACT = best_params['activation']
OPT = best_params['optimizer']

print(f"üöÄ Starting Ensemble Training (5 Models)...")

for i in range(5):
    print(f"\n[Model {i+1}/5] Training...")
    model = MLP_enhance(X.shape[1], hidden_dim=HIDDEN, dropout_rate=DROP, activation=ACT).to(device)
    
    if OPT == 'Adam': optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WD)
    elif OPT == 'AdamW': optimizer = optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
    elif OPT == 'SGD': optimizer = optim.SGD(model.parameters(), lr=LR, momentum=0.9, weight_decay=WD)
    elif OPT == 'RMSprop': optimizer = optim.RMSprop(model.parameters(), lr=LR, weight_decay=WD)
    
    model.train()
    for epoch in range(EPOCHS):
        epoch_loss = 0
        for inputs, targets in train_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            optimizer.zero_grad()
            outputs = model(inputs).squeeze()
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            
    ensemble_models.append(model)

print("\n‚úÖ Ensemble Training Complete!")


üöÄ Starting Ensemble Training (5 Models)...

[Model 1/5] Training...

[Model 2/5] Training...

[Model 3/5] Training...

[Model 4/5] Training...

[Model 5/5] Training...

‚úÖ Ensemble Training Complete!


In [28]:
# 7. Ensemble Evaluation
from app.utils.metrics import evaluate_churn_metrics

print("Evaluating Ensemble (Averaging Predictions)...")
all_targets = []
all_probs = []

with torch.no_grad():
    for inputs, targets in test_loader:
        inputs = inputs.to(device)
        metrics_sum = torch.zeros(inputs.size(0)).to(device)
        for model in ensemble_models:
            model.eval()
            outputs = model(inputs).squeeze()
            probs = torch.sigmoid(outputs)
            metrics_sum += probs
        
        avg_probs = metrics_sum / len(ensemble_models)
        
        all_targets.extend(targets.numpy())
        all_probs.extend(avg_probs.cpu().numpy())

# evaluate_churn_metrics without model_name
metrics_result = evaluate_churn_metrics(
    y_true=np.array(all_targets),
    y_prob=np.array(all_probs)
)

if 'ranking' in metrics_result:
    ranking_list = metrics_result.pop('ranking')
    top_k_df = pd.DataFrame(ranking_list)
else:
    top_k_df = pd.DataFrame()

metrics = metrics_result

display(Markdown("### üìä Ensemble Performance (N=5)"))
display(pd.DataFrame(list(metrics.items()), columns=['KPI', 'Value']))
display(Markdown("### üìà Top K% Ranking"))
display(top_k_df)


Evaluating Ensemble (Averaging Predictions)...


### üìä Ensemble Performance (N=5)

Unnamed: 0,KPI,Value
0,ROC-AUC,0.712246
1,PR-AUC (Average Precision),0.932985
2,Brier Score,0.106932
3,ÏÉÅÏúÑ 5% Ï†ïÎ∞ÄÎèÑ (Precision),0.967786
4,ÏÉÅÏúÑ 5% Ïû¨ÌòÑÏú® (Recall),0.055797
5,ÏÉÅÏúÑ 5% Î¶¨ÌîÑÌä∏ (Lift),1.116086


### üìà Top K% Ranking

Unnamed: 0,Top_K,Precision,Recall,Lift
0,5%,0.967786,0.055797,1.116086
1,10%,0.962683,0.111017,1.1102
2,15%,0.958623,0.165817,1.105518
3,20%,0.954237,0.220086,1.100461
4,25%,0.950114,0.273924,1.095705
5,30%,0.947003,0.327626,1.092118


In [29]:
# 8. Artifact Saving (LG Style Standardized)
import json
import pathlib

# [Correct Path] dlmlp_advanced
MODEL_ID = "dlmlp_advanced"
EVAL_DIR = pathlib.Path(f"../../models/eval/{MODEL_ID}")
EVAL_DIR.mkdir(parents=True, exist_ok=True)

# 1. Model Card
model_card = {
    "model_id": MODEL_ID,
    "display_name": "MLP Advanced (N=5)",
    "category": "DL",
    "split": "test"
}
with open(EVAL_DIR / "model_card.json", "w") as f:
    json.dump(model_card, f, indent=2, ensure_ascii=False)

# 2. PR Metrics
pr_metrics = {
    "model_id": MODEL_ID,
    "split": "test",
    "pr_auc": float(metrics.get("PR-AUC (Average Precision)", 0.0))
}
with open(EVAL_DIR / "pr_metrics.json", "w") as f:
    json.dump(pr_metrics, f, indent=2, ensure_ascii=False)

# 3. Top K Metrics (LG Style)
current_base_rate = float(np.mean(all_targets)) if 'all_targets' in locals() else 0.0

metrics_by_k = []
if not top_k_df.empty:
    for _, row in top_k_df.iterrows():
        k_str = str(row.get('Top_K', '0')).replace('%', '')
        try:
            k_val = int(k_str)
        except:
            k_val = 0
        metrics_by_k.append({
            "k_pct": k_val,
            "precision_at_k": float(row.get('Precision', 0.0)),
            "recall_at_k": float(row.get('Recall', 0.0)),
            "lift_at_k": float(row.get('Lift', 0.0))
        })

topk_output = {
    "model_id": MODEL_ID,
    "split": "test",
    "base_rate": current_base_rate,
    "metrics_by_k": metrics_by_k
}

with open(EVAL_DIR / "topk_metrics.json", "w") as f:
    json.dump(topk_output, f, indent=2, ensure_ascii=False)

print(f"‚úÖ All artifacts saved to {EVAL_DIR}")


‚úÖ All artifacts saved to ../../models/eval/dlmlp_advanced
