# Complete ML Pipeline - GLM vs XGBoost

This notebook breaks down the complete machine learning pipeline into executable cells.
Each major step is separated for easier execution and debugging.

**Pipeline Phases:**
1. Setup & Imports
2. Exploratory Data Analysis (EDA)
3. Data Preprocessing & Feature Engineering
4. GLM Model Development
5. XGBoost Model Development
6. Model Comparison & Selection
7. Model Interpretability (LIME)
8. Final Documentation

## 1. Setup & Imports

In [None]:
# Import libraries
import os
import pickle
import warnings
from datetime import datetime
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# ML imports
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, 
    roc_auc_score, classification_report, confusion_matrix
)
from sklearn.utils.class_weight import compute_class_weight

# XGBoost
import xgboost as xgb

# Imbalance handling
from imblearn.over_sampling import SMOTE

# Interpretability
# LIME is optional: LIME_AVAILABLE records whether the import succeeded so the
# interpretability cells further down can skip themselves instead of failing.
try:
    import lime
    import lime.lime_tabular
    LIME_AVAILABLE = True
except ImportError:
    LIME_AVAILABLE = False
    print("LIME not available. Install with: pip install lime")

# NOTE(review): this silences every warning category for the rest of the
# session, so problems reported as warnings by later cells will not be shown.
warnings.filterwarnings('ignore')

# Display settings
# Show all DataFrame columns when a frame is rendered.
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8-darkgrid')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

print("‚úì All libraries imported successfully")

: 

In [None]:
# Setup output directory: every run writes into a folder named after today's
# date (../outputs/YYYY-MM-DD), relative to the notebook's working directory.
today = datetime.now().strftime('%Y-%m-%d')
output_dir = Path(f'../outputs/{today}')
output_dir.mkdir(parents=True, exist_ok=True)

# Create subdirectories
for subdir in ['eda', 'preprocessing', 'models', 'results', 'plots']:
    (output_dir / subdir).mkdir(exist_ok=True)

# Create symlink to latest run.
# This is best-effort: symlinks may be unavailable (e.g. missing privileges),
# and a failure here must not stop the pipeline. Unlike a bare `pass`, the
# reason is reported so a stale or missing `latest` link is not a mystery.
latest_link = Path('../outputs/latest')
try:
    # Only remove things that are safe to remove: an old link (including a
    # dangling one) or a stray file. A real directory is never deleted.
    if latest_link.is_symlink() or latest_link.is_file():
        latest_link.unlink()
    if latest_link.exists():
        print(f"  Note: {latest_link} exists and is not a symlink; leaving it untouched")
    else:
        # The target is relative to ../outputs, where the link itself lives.
        latest_link.symlink_to(today, target_is_directory=True)
except (OSError, NotImplementedError) as exc:
    print(f"  Note: could not update {latest_link} ({exc})")

print(f"‚úì Output directory created: {output_dir}")

## 2. Exploratory Data Analysis (EDA)

In [None]:
# Load data
# The workbook path is relative to the notebook's working directory.
# `df_original` is the raw frame; the preprocessing section works on a copy of it.
print("Loading data...")
df_original = pd.read_excel('../data/Data.xlsx')

print(f"‚úì Data loaded successfully")
print(f"  Shape: {df_original.shape}")
print(f"\nFirst few rows:")
# Last expression of the cell, so the notebook renders the preview as a table.
df_original.head()

In [None]:
# Basic information
print("=" * 80)
print("DATASET SUMMARY")
print("=" * 80)
print(f"Total records: {len(df_original):,}")
# Feature columns are identified purely by name: every column starting with 'V'.
print(f"Features: {len([col for col in df_original.columns if col.startswith('V')])} features")
print(f"Target: Y (binary classification)")
# Total count of null cells over the whole frame (all columns, all rows).
print(f"Missing values: {df_original.isnull().sum().sum()}")

print(f"\nData types:")
# Last expression of the cell: rendered as the per-column dtype listing.
df_original.dtypes

In [None]:
# Target variable analysis
print("TARGET VARIABLE ANALYSIS")
print("=" * 50)

# Absolute and relative class frequencies of Y, both ordered by class label.
target_column = df_original['Y']
target_counts = target_column.value_counts().sort_index()
target_props = target_column.value_counts(normalize=True).sort_index()

print(f"Class distribution:")
print(target_counts)
print(f"\nClass proportions:")
print(target_props)

# Visualize target distribution: counts on the left, proportions on the right.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

panel_specs = [
    (target_counts, 'Target Variable Distribution (Counts)', 'Count'),
    (target_props, 'Target Variable Distribution (Proportions)', 'Proportion'),
]
for panel_ax, (panel_series, panel_title, panel_ylabel) in zip(axes, panel_specs):
    panel_series.plot(kind='bar', ax=panel_ax, color=['skyblue', 'coral'])
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Class')
    panel_ax.set_ylabel(panel_ylabel)
    # Keep the class labels horizontal.
    panel_ax.set_xticklabels(panel_ax.get_xticklabels(), rotation=0)

plt.tight_layout()
plt.savefig(output_dir / 'plots' / 'target_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Feature analysis
print("FEATURE ANALYSIS")
print("=" * 50)

# Only columns whose name starts with 'V' are considered features.
v_columns = [name for name in df_original.columns if name.startswith('V')]

# A feature is categorical when its dtype is one of the text-like dtypes;
# every other V column is counted as numeric.
categorical_features = [
    name for name in v_columns
    if df_original[name].dtype in ['object', 'str', str]
]
numeric_features = [name for name in v_columns if name not in categorical_features]

for col in categorical_features:
    print(f"{col} (categorical): {df_original[col].nunique()} unique values")

print(f"\nFeature Summary:")
print(f"  Numeric features: {len(numeric_features)}")
print(f"  Categorical features: {len(categorical_features)}")

In [None]:
# Correlation analysis
# Correlation of every numeric feature with the target Y, ordered by absolute
# strength. Skipped entirely when there are no numeric features.
if numeric_features:
    print("CORRELATION ANALYSIS")
    print("=" * 50)

    # Remove the target's self-correlation by label rather than slicing off the
    # first sorted entry: if a feature ties with Y at |corr| == 1, positional
    # slicing could drop that feature and keep Y in the ranking.
    correlations = (
        df_original[numeric_features + ['Y']]
        .corr()['Y']
        .drop('Y')
        .sort_values(key=abs, ascending=False)
    )

    print("Top 10 correlations with target:")
    print(correlations.head(10))

    # Visualize top correlations (negative in coral, non-negative in sky blue)
    fig, ax = plt.subplots(figsize=(10, 6))
    top_corr = correlations.head(15)
    colors = ['coral' if x < 0 else 'skyblue' for x in top_corr.values]
    top_corr.plot(kind='barh', color=colors, ax=ax)
    ax.set_title('Top 15 Feature Correlations with Target')
    ax.set_xlabel('Correlation Coefficient')
    plt.tight_layout()
    plt.savefig(output_dir / 'plots' / 'feature_correlations.png', dpi=300, bbox_inches='tight')
    plt.show()

In [None]:
# Save EDA summary
# Plain-text recap of the EDA cells, written under <output_dir>/eda.
eda_summary_path = output_dir / 'eda' / 'eda_summary.txt'
with open(eda_summary_path, 'w') as f:
    summary_lines = [
        "=" * 80 + "\n",
        "EDA Summary\n",
        "=" * 80 + "\n",
        f"Dataset shape: {df_original.shape}\n",
        f"Missing values: {df_original.isnull().sum().sum()}\n",
        f"Target distribution:\n{target_counts.to_string()}\n",
        f"Numeric features: {len(numeric_features)}\n",
        f"Categorical features: {len(categorical_features)}\n",
    ]
    # `correlations` only exists when the correlation cell found numeric features.
    if numeric_features:
        summary_lines.append(f"\nTop correlations:\n{correlations.head(10).to_string()}\n")
    f.writelines(summary_lines)

print(f"‚úì EDA summary saved to: {eda_summary_path}")
print("‚úì Phase 1 (EDA) completed successfully!")

## 3. Data Preprocessing & Feature Engineering

In [None]:
# Copy data for processing
print("=" * 80)
print("DATA PREPROCESSING & FEATURE ENGINEERING")
print("=" * 80)

# Work on a copy so `df_original` stays as loaded; the following cells modify `df`.
df = df_original.copy()
print(f"‚úì Working with copy of original data: {df.shape}")

In [None]:
# Categorical encoding
print("\nCategorical Feature Encoding:")
# Same text-dtype test as the EDA feature-analysis cell, so both cells agree on
# which V columns are categorical.
categorical_features = [
    col for col in df.columns
    if col.startswith('V') and df[col].dtype in ['object', 'str', str]
]

# One fitted encoder per column. Re-fitting a single shared LabelEncoder would
# leave only the last column's mapping behind, which makes the persisted
# encoder useless for decoding or for encoding new data.
# `label_encoder` is therefore a dict: column name -> fitted LabelEncoder.
label_encoder = {}
for col in categorical_features:
    print(f"  Encoding {col}...")
    column_encoder = LabelEncoder()
    df[col] = column_encoder.fit_transform(df[col])
    label_encoder[col] = column_encoder

# Plural alias for readers who prefer the more descriptive name.
label_encoders = label_encoder

print(f"‚úì Encoded {len(categorical_features)} categorical features")

In [None]:
# Feature Engineering
print("\nFeature Engineering:")
# Base features are the V* columns of the raw data. The list is derived from
# df_original rather than df because several engineered columns below also
# start with 'V' (e.g. 'V13_squared'); reading them back from df on a re-run of
# this cell would fold them into the sum/mean/std aggregates.
feature_cols = [
    col for col in df_original.columns
    if col.startswith('V') and col in df.columns
]

# Interaction features
if 'V13' in feature_cols and 'V3' in feature_cols:
    df['V13_V3_interaction'] = df['V13'] * df['V3']
    print("  ‚úì Created V13_V3_interaction")

if 'V13' in feature_cols and 'V7' in feature_cols:
    df['V13_V7_interaction'] = df['V13'] * df['V7']
    print("  ‚úì Created V13_V7_interaction")

# Polynomial features
if 'V13' in feature_cols:
    df['V13_squared'] = df['V13'] ** 2
    print("  ‚úì Created V13_squared")

if 'V3' in feature_cols:
    df['V3_squared'] = df['V3'] ** 2
    print("  ‚úì Created V3_squared")

# Ratio features
if 'V2' in feature_cols and 'V18' in feature_cols:
    # The +1 avoids division by zero when V18 == 0.
    # NOTE(review): assumes V18 never equals -1 - confirm against the data.
    df['V2_V18_ratio'] = df['V2'] / (df['V18'] + 1)
    print("  ‚úì Created V2_V18_ratio")

# Aggregate features: row-wise statistics over the base V* columns only.
df['feature_sum'] = df[feature_cols].sum(axis=1)
df['feature_mean'] = df[feature_cols].mean(axis=1)
df['feature_std'] = df[feature_cols].std(axis=1)
print("  ‚úì Created aggregate features: sum, mean, std")

# `df_engineered` is another name for the same frame (not a copy).
df_engineered = df
print(f"\n‚úì Feature engineering completed")
print(f"  Total features: {len([col for col in df.columns if col not in ['ID', 'Y']])}")

In [None]:
# Prepare features and target
# Everything except the identifier and the label is a model input; columns
# missing from the frame are skipped rather than raising.
non_feature_columns = ['ID', 'Y']
X = df.drop(columns=non_feature_columns, errors='ignore')
y = df['Y']

print(f"Features: {X.shape[1]} columns")
print(f"Target distribution: {y.value_counts().sort_index().to_dict()}")
print(f"\nFeature columns: {list(X.columns)}")

In [None]:
# Create data splits
print("\nCreating Data Splits:")
print("  Strategy: 65% train / 15% validation / 20% test\n")

# Both stages share one seed and are stratified on the target.
split_seed = 42

# Stage 1: hold out 20% of all rows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=0.20,
    stratify=y,
    random_state=split_seed,
)

# Stage 2: 0.1875 of the remaining 80% is 15% of the total -> validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=0.1875,
    stratify=y_temp,
    random_state=split_seed,
)

print(f"  Train: {X_train.shape[0]:,} samples ({X_train.shape[0]/len(df)*100:.1f}%)")
print(f"  Validation: {X_val.shape[0]:,} samples ({X_val.shape[0]/len(df)*100:.1f}%)")
print(f"  Test: {X_test.shape[0]:,} samples ({X_test.shape[0]/len(df)*100:.1f}%)")

print(f"\n‚úì Data splits created successfully")

In [None]:
# Feature Scaling
print("\nFeature Scaling:")
# The scaler is fitted on the training split only and then applied unchanged
# to validation and test.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Convert back to DataFrames, keeping each split's original row index.
# Without `index=` the frames would be renumbered 0..n-1 while y_train / y_val /
# y_test keep the shuffled index from the split, so any label-aligned pandas
# operation between X and y (boolean masks, joins, concat) would mismatch rows.
# NOTE: this cell overwrites X_train / X_val / X_test; re-run from the split
# cell rather than re-running this cell alone, or the data is scaled twice.
X_train = pd.DataFrame(X_train_scaled, columns=X_train.columns, index=X_train.index)
X_val = pd.DataFrame(X_val_scaled, columns=X_val.columns, index=X_val.index)
X_test = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

print(f"  ‚úì Features scaled: {X_train.shape[1]} columns")
print(f"  Scaling method: StandardScaler (mean=0, std=1)")

In [None]:
# Handle class imbalance with SMOTE
print("\nHandling Class Imbalance with SMOTE:")
print(f"  Original class distribution: {y_train.value_counts().sort_index().to_dict()}")

# Only the training split is resampled; validation and test are left as they are.
smote = SMOTE(random_state=42)
resampled_features, resampled_target = smote.fit_resample(X_train, y_train)

# Store SMOTE data with the training column names and the target's name.
X_train_smote = pd.DataFrame(resampled_features, columns=X_train.columns)
y_train_smote = pd.Series(resampled_target, name=y_train.name)

print(f"  Resampled class distribution: {y_train_smote.value_counts().sort_index().to_dict()}")
print(f"  Resampled size: {len(X_train_smote):,} samples")
print(f"\n‚úì SMOTE resampling completed")

In [None]:
# Calculate class weights
# 'balanced' weights computed from the original (non-resampled) training labels.
class_labels = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=class_labels, y=y_train)

# Key each weight by the label it was computed for instead of assuming the
# labels are exactly 0 and 1; for 0/1 labels the result is the same mapping.
# `.tolist()` turns the labels into plain Python scalars for clean dict keys.
class_weight_dict = dict(zip(class_labels.tolist(), class_weights))

print(f"Calculated class weights: {class_weight_dict}")
print("  (For models that support class_weight parameter)")

In [None]:
# Save processed data
preprocessing_dir = output_dir / 'preprocessing'

# Save datasets: file name -> pandas object, written with DataFrame/Series.to_pickle.
dataset_files = {
    'X_train.pkl': X_train,
    'X_val.pkl': X_val,
    'X_test.pkl': X_test,
    'y_train.pkl': y_train,
    'y_val.pkl': y_val,
    'y_test.pkl': y_test,
    'X_train_smote.pkl': X_train_smote,
    'y_train_smote.pkl': y_train_smote,
}
for dataset_file, dataset in dataset_files.items():
    dataset.to_pickle(preprocessing_dir / dataset_file)

# Save preprocessors: file name -> fitted object, written with the pickle module.
preprocessor_files = {
    'scaler.pkl': scaler,
    'label_encoder.pkl': label_encoder,
    'class_weights.pkl': class_weight_dict,
}
for preprocessor_file, preprocessor in preprocessor_files.items():
    with open(preprocessing_dir / preprocessor_file, 'wb') as f:
        pickle.dump(preprocessor, f)

print(f"‚úì All processed data saved to: {preprocessing_dir}")
print("‚úì Phase 2 (Preprocessing) completed successfully!")

## 4. GLM Model Development

In [None]:
print("=" * 80)
print("GLM MODEL DEVELOPMENT")
print("=" * 80)

# Use SMOTE data for GLM training
# X_train_use / y_train_use are the names the tuning cell fits on; the XGBoost
# section rebinds these same names to the non-resampled training split.
X_train_use = X_train_smote
y_train_use = y_train_smote

print(f"Training data: {X_train_use.shape[0]:,} samples with {X_train_use.shape[1]} features")
print(f"Class distribution: {y_train_use.value_counts().sort_index().to_dict()}")

In [None]:
# GLM hyperparameter tuning
print("\nGLM Hyperparameter Tuning with GridSearchCV:")

# Parameter grid.
# Expressed as a list of sub-grids so that only valid solver/penalty
# combinations are searched:
#   - liblinear supports l1 and l2, but not elasticnet;
#   - saga supports l1, l2 and elasticnet, and elasticnet needs an l1_ratio.
# A single cross-product grid would schedule fits that fail, and with warnings
# globally ignored those failures go unnoticed.
c_values = [0.01, 0.1, 1.0, 10.0, 100.0]
class_weight_options = [None, 'balanced']
param_grid = [
    {
        'solver': ['liblinear', 'saga'],
        'penalty': ['l1', 'l2'],
        'C': c_values,
        'class_weight': class_weight_options,
    },
    {
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        # Even L1/L2 mix; add more ratios here to widen the search.
        'l1_ratio': [0.5],
        'C': c_values,
        'class_weight': class_weight_options,
    },
]

print(f"Parameter grid: {param_grid}")

# Create GLM model
glm = LogisticRegression(random_state=42, max_iter=1000)

# Grid search (5-fold CV, ROC AUC as the selection metric, all cores)
print("\nRunning GridSearchCV (this may take a while)...")
grid_search = GridSearchCV(
    glm, param_grid, cv=5, scoring='roc_auc',
    n_jobs=-1, verbose=1
)

# NOTE(review): X_train_use is the SMOTE-resampled set, so the CV folds contain
# synthetic rows; the CV AUC printed below is likely optimistic compared with
# the validation/test AUC - confirm before relying on it.
grid_search.fit(X_train_use, y_train_use)

# GridSearchCV refits the best configuration on the full training data.
glm_model = grid_search.best_estimator_

print(f"\n‚úì Best GLM parameters: {grid_search.best_params_}")
print(f"‚úì Best GLM CV score (AUC): {grid_search.best_score_:.4f}")

In [None]:
# Evaluate GLM on all splits
def evaluate_model(model, model_name, X_train, y_train, X_val, y_val, X_test, y_test):
    """Score a fitted classifier on the train, validation and test splits.

    For each split the model's hard predictions are used for accuracy,
    precision, recall and F1, and the predicted probability of the second
    class (column 1 of ``predict_proba``) is used for ROC AUC. The metrics are
    printed per split and collected in one flat dict.

    Args:
        model: fitted estimator exposing ``predict`` and ``predict_proba``.
        model_name: label stored under the ``'model_name'`` key.
        X_train, y_train: training features and labels.
        X_val, y_val: validation features and labels.
        X_test, y_test: test features and labels.

    Returns:
        dict with ``'model_name'`` plus ``'<split>_<metric>'`` entries, where
        split is train/validation/test and metric is
        accuracy/precision/recall/f1/auc.
    """
    split_data = {
        'train': (X_train, y_train),
        'validation': (X_val, y_val),
        'test': (X_test, y_test),
    }
    # Metrics computed from hard class predictions, in reporting order.
    label_scorers = {
        'accuracy': accuracy_score,
        'precision': precision_score,
        'recall': recall_score,
        'f1': f1_score,
    }

    results = {'model_name': model_name}
    for split_name, (X_data, y_data) in split_data.items():
        y_pred = model.predict(X_data)
        y_pred_proba = model.predict_proba(X_data)[:, 1]

        for metric_name, scorer in label_scorers.items():
            results[f'{split_name}_{metric_name}'] = scorer(y_data, y_pred)
        # AUC is the only metric that needs scores instead of labels.
        results[f'{split_name}_auc'] = roc_auc_score(y_data, y_pred_proba)

        print(f"\n{split_name.capitalize()} Results:")
        print(f"  Accuracy:  {results[f'{split_name}_accuracy']:.4f}")
        print(f"  Precision: {results[f'{split_name}_precision']:.4f}")
        print(f"  Recall:    {results[f'{split_name}_recall']:.4f}")
        print(f"  F1-Score:  {results[f'{split_name}_f1']:.4f}")
        print(f"  AUC:       {results[f'{split_name}_auc']:.4f}")

    return results

# The "train" metrics are computed on X_train / y_train (the non-resampled
# training split), even though the GLM was fitted on the SMOTE-resampled data.
print("GLM Model Evaluation:")
glm_results = evaluate_model(glm_model, 'GLM', X_train, y_train, X_val, y_val, X_test, y_test)

In [None]:
# Save GLM model
# The tuned estimator is pickled under <output_dir>/models.
models_dir = output_dir / 'models'
with open(models_dir / 'glm_model.pkl', 'wb') as f:
    pickle.dump(glm_model, f)

print(f"‚úì GLM model saved to: {models_dir / 'glm_model.pkl'}")
print("‚úì Phase 3 (GLM) completed successfully!")

## 5. XGBoost Model Development

In [None]:
print("=" * 80)
print("XGBOOST MODEL DEVELOPMENT")
print("=" * 80)

# Use original training data with class weights for XGBoost
# This rebinds X_train_use / y_train_use (previously the SMOTE data used for
# the GLM) to the non-resampled training split.
X_train_use = X_train
y_train_use = y_train

print(f"Training data: {X_train_use.shape[0]:,} samples with {X_train_use.shape[1]} features")
print(f"Class distribution: {y_train_use.value_counts().sort_index().to_dict()}")

In [None]:
# Calculate scale_pos_weight for XGBoost
# Ratio of rows labelled 0 to rows labelled 1 in the training labels.
negative_count = len(y_train_use[y_train_use == 0])
positive_count = len(y_train_use[y_train_use == 1])
scale_pos_weight = negative_count / positive_count
print(f"\nCalculated scale_pos_weight: {scale_pos_weight:.2f}")
print("  (Ratio of negative to positive samples for imbalance handling)")

In [None]:
# XGBoost hyperparameter tuning
print("\nXGBoost Hyperparameter Tuning with GridSearchCV:")

# Parameter grid (simplified for faster execution).
# scale_pos_weight is searched both off (1) and at the negative/positive ratio
# computed in the previous cell.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 6],
    'learning_rate': [0.1, 0.2],
    'scale_pos_weight': [1, scale_pos_weight]
}

print(f"Parameter grid: {param_grid}")

# Create XGBoost model.
# `use_label_encoder` is intentionally not passed: it is deprecated and no
# longer a recognised option in current XGBoost releases, where passing it only
# produces an "unused parameter" warning on every fit.
xgb_model = xgb.XGBClassifier(
    random_state=42,
    eval_metric='logloss'
)

# Grid search (3-fold CV, ROC AUC as the selection metric, all cores).
# Note that `param_grid` and `grid_search` replace the GLM objects of the same name.
print("\nRunning GridSearchCV (this may take a while)...")
grid_search = GridSearchCV(
    xgb_model, param_grid, cv=3, scoring='roc_auc',
    n_jobs=-1, verbose=1
)

grid_search.fit(X_train_use, y_train_use)

# From here on `xgb_model` is the refitted best estimator, not the template above.
xgb_model = grid_search.best_estimator_

print(f"\n‚úì Best XGBoost parameters: {grid_search.best_params_}")
print(f"‚úì Best XGBoost CV score (AUC): {grid_search.best_score_:.4f}")

In [None]:
# Evaluate XGBoost on all splits
# Same splits and metrics as the GLM evaluation, so the two result dicts are
# directly comparable.
print("XGBoost Model Evaluation:")
xgb_results = evaluate_model(xgb_model, 'XGBoost', X_train, y_train, X_val, y_val, X_test, y_test)

In [None]:
# Feature importance
print("\nXGBoost Feature Importance:")
# One row per input column, most important first.
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance.head(15))

# Visualize feature importance.
# The bar count follows the data: with fewer than 15 features a fixed
# range(15) would not match the number of values and the plot call would fail.
top_n = min(15, len(feature_importance))
top_features = feature_importance.head(top_n)

fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(range(top_n), top_features['importance'], color='skyblue')
ax.set_yticks(range(top_n))
ax.set_yticklabels(top_features['feature'])
ax.set_xlabel('Importance')
ax.set_title(f'XGBoost - Top {top_n} Feature Importances')
# Most important feature at the top.
ax.invert_yaxis()
plt.tight_layout()
plt.savefig(output_dir / 'plots' / 'xgb_feature_importance.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Save XGBoost model
# The tuned estimator is pickled next to the GLM under <output_dir>/models.
models_dir = output_dir / 'models'
with open(models_dir / 'xgboost_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)

print(f"‚úì XGBoost model saved to: {models_dir / 'xgboost_model.pkl'}")
print("‚úì Phase 4 (XGBoost) completed successfully!")

## 6. Model Comparison & Selection

In [None]:
# Section banner for the model comparison output.
print("=" * 80)
print("MODEL COMPARISON & SELECTION")
print("=" * 80)

In [None]:
# Create comparison table
results = {'glm': glm_results, 'xgboost': xgb_results}
metrics = ['test_accuracy', 'test_precision', 'test_recall', 'test_f1', 'test_auc']

# One row per model: the display name plus each available test metric, shown
# under its upper-cased short name (e.g. 'test_f1' -> 'F1') as a 4-decimal string.
comparison_data = [
    {
        'Model': res['model_name'],
        **{
            metric.replace('test_', '').upper(): f"{res[metric]:.4f}"
            for metric in metrics
            if metric in res
        },
    }
    for res in results.values()
]

comparison_df = pd.DataFrame(comparison_data)
print("\nModel Performance Comparison (Test Set):")
print("=" * 50)
print(comparison_df.to_string(index=False))

In [None]:
# Determine best model
# The winner is chosen on validation AUC and then *reported* with its test AUC.
# Choosing on test AUC and quoting that same number would bias the reported
# performance upwards, because the test set would have been used for selection.
# If a result dict has no validation AUC, its test AUC is used as a fallback.
selection_scores = {
    candidate: candidate_res.get('validation_auc', candidate_res.get('test_auc', 0))
    for candidate, candidate_res in results.items()
}
# max() returns the first entry on ties, i.e. the model listed first in `results`.
best_model_name = max(selection_scores, key=selection_scores.get)
best_val_auc = selection_scores[best_model_name]
# `best_auc` stays the test AUC of the selected model, as later cells label it.
best_auc = results[best_model_name].get('test_auc', 0)

print(f"\n{'='*50}")
print(f"BEST MODEL: {best_model_name.upper()}")
print(f"Validation AUC (selection metric): {best_val_auc:.4f}")
print(f"Test AUC: {best_auc:.4f}")
print(f"{'='*50}")

In [None]:
# Visualize model comparison
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
bar_ax, line_ax = axes

# Comparison bar chart: test-set metrics, read back from the comparison table
# (stored there as strings, hence the float conversion).
metrics_short = ['ACCURACY', 'PRECISION', 'RECALL', 'F1', 'AUC']
glm_row = comparison_df[comparison_df['Model'] == 'GLM'].iloc[0]
xgb_row = comparison_df[comparison_df['Model'] == 'XGBoost'].iloc[0]
glm_scores = [float(glm_row[m]) for m in metrics_short]
xgb_scores = [float(xgb_row[m]) for m in metrics_short]

x = np.arange(len(metrics_short))
width = 0.35

# Two bars per metric, centred on the tick.
bar_series = [
    (-width/2, glm_scores, 'GLM', 'skyblue'),
    (width/2, xgb_scores, 'XGBoost', 'coral'),
]
for bar_offset, bar_values, bar_label, bar_color in bar_series:
    bar_ax.bar(x + bar_offset, bar_values, width, label=bar_label, color=bar_color)
bar_ax.set_xlabel('Metrics')
bar_ax.set_ylabel('Score')
bar_ax.set_title('Model Comparison - Test Set Performance')
bar_ax.set_xticks(x)
bar_ax.set_xticklabels(metrics_short, rotation=45)
bar_ax.legend()
bar_ax.set_ylim([0, 1.1])
bar_ax.grid(axis='y', alpha=0.3)

# Performance across splits (AUC)
splits = ['Train', 'Validation', 'Test']
auc_keys = ['train_auc', 'validation_auc', 'test_auc']
glm_aucs = [glm_results[key] for key in auc_keys]
xgb_aucs = [xgb_results[key] for key in auc_keys]

line_series = [
    (glm_aucs, 'o', 'GLM', 'skyblue'),
    (xgb_aucs, 's', 'XGBoost', 'coral'),
]
for line_values, line_marker, line_label, line_color in line_series:
    line_ax.plot(splits, line_values, marker=line_marker, label=line_label,
                 linewidth=2, markersize=8, color=line_color)
line_ax.set_xlabel('Data Split')
line_ax.set_ylabel('AUC Score')
line_ax.set_title('AUC Performance Across Data Splits')
line_ax.legend()
line_ax.set_ylim([0, 1.1])
line_ax.grid(alpha=0.3)

plt.tight_layout()
plt.savefig(output_dir / 'plots' / 'model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()

In [None]:
# Save comparison results
# The formatted test-set table goes to CSV (without the index column).
results_dir = output_dir / 'results'
comparison_df.to_csv(results_dir / 'model_comparison.csv', index=False)

# Save detailed results
# `results` holds the full per-split metric dicts of both models.
with open(results_dir / 'detailed_results.pkl', 'wb') as f:
    pickle.dump(results, f)

print(f"‚úì Comparison results saved to: {results_dir}")
print("‚úì Phase 5 (Model Comparison) completed successfully!")

## 7. Model Interpretability (LIME)

In [None]:
print("=" * 80)
print("MODEL INTERPRETABILITY ANALYSIS (LIME)")
print("=" * 80)

# LIME_AVAILABLE is set by the import cell; every LIME cell below is guarded by it.
if not LIME_AVAILABLE:
    print("\n‚ö† LIME not available. Skipping interpretability analysis.")
    print("Install with: pip install lime")
else:
    print("\n‚úì LIME is available")

In [None]:
if LIME_AVAILABLE:
    # Create LIME explainer
    # The scaled training split provides the reference data. A fixed
    # random_state makes the explainer's sampling repeatable between runs,
    # in line with the seed used for the splits and the models.
    print("\nCreating LIME explainer...")
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X_train.values,
        feature_names=X_train.columns.tolist(),
        class_names=['Class 0', 'Class 1'],
        mode='classification',
        random_state=42
    )
    print("‚úì LIME explainer created")

In [None]:
if LIME_AVAILABLE:
    # Generate explanations for sample instances
    print("\nGenerating LIME explanations for sample test instances...")
    # Seeded generator so the same test rows are explained on every run.
    # The indices are positions in X_test (used with .iloc), not index labels.
    sample_rng = np.random.default_rng(42)
    sample_indices = sample_rng.choice(len(X_test), size=min(10, len(X_test)), replace=False)

    # model name -> list of {'instance_id', 'prediction', 'explanation'} dicts
    explanations = {}

    for model_name, model in [('GLM', glm_model), ('XGBoost', xgb_model)]:
        print(f"\nGenerating explanations for {model_name}...")
        explanations[model_name] = []

        for idx in sample_indices:
            instance = X_test.iloc[idx].values

            # Generate explanation (top 10 features for this instance)
            exp = explainer.explain_instance(
                instance,
                model.predict_proba,
                num_features=10
            )

            explanations[model_name].append({
                # Plain int so the saved record does not carry a NumPy scalar.
                'instance_id': int(idx),
                # Explicit (1, n_features) array: a single-row 2-D input.
                'prediction': model.predict_proba(instance.reshape(1, -1))[0],
                'explanation': exp.as_list()
            })

        print(f"  ‚úì Generated {len(sample_indices)} explanations for {model_name}")

    # Save explanations
    results_dir = output_dir / 'results'
    with open(results_dir / 'lime_explanations.pkl', 'wb') as f:
        pickle.dump(explanations, f)

    print(f"\n‚úì LIME explanations saved to: {results_dir / 'lime_explanations.pkl'}")
    print("‚úì Phase 6 (Interpretability) completed successfully!")

In [None]:
if LIME_AVAILABLE:
    # Display sample explanation
    # "Instance 0" is the first of the sampled test rows, not test row 0.
    print("\nSample LIME Explanation (XGBoost, Instance 0):")
    print("=" * 50)
    sample_exp = explanations['XGBoost'][0]
    print(f"Prediction probabilities: {sample_exp['prediction']}")
    print(f"\nTop contributing features:")
    # Each entry is a (feature description, signed contribution) pair.
    for feat, contrib in sample_exp['explanation'][:10]:
        print(f"  {feat}: {contrib:+.4f}")

## 8. Final Documentation

In [None]:
# Section banner for the final documentation output.
print("=" * 80)
print("FINAL DOCUMENTATION")
print("=" * 80)

In [None]:
# Generate final report
report_path = output_dir / 'final_report.txt'


def _fmt_metric(value):
    """Return `value` with 4 decimals, or its plain text if it is not numeric.

    Applying ':.4f' directly to the 'N/A' placeholder of a missing metric
    raises ValueError, so the placeholder is passed through unchanged instead.
    """
    try:
        return f"{value:.4f}"
    except (TypeError, ValueError):
        return str(value)


# (label as written to the report, key in the per-model results dict)
report_metrics = [
    ("  Test AUC:      ", 'test_auc'),
    ("  Test Accuracy: ", 'test_accuracy'),
    ("  Test F1-Score: ", 'test_f1'),
    ("  Test Recall:   ", 'test_recall'),
    ("  Test Precision:", 'test_precision'),
]

with open(report_path, 'w') as f:
    f.write("=" * 80 + "\n")
    f.write("GLM vs XGBoost Pipeline - Final Report\n")
    f.write("=" * 80 + "\n")
    f.write(f"Generated: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")

    # Dataset summary
    f.write("DATASET SUMMARY:\n")
    f.write(f"- Original shape: {df_original.shape}\n")
    f.write(f"- Engineered shape: {df_engineered.shape}\n")
    f.write(f"- Training samples: {len(X_train)}\n")
    f.write(f"- Validation samples: {len(X_val)}\n")
    f.write(f"- Test samples: {len(X_test)}\n\n")

    # Model results: one block per model, missing metrics shown as N/A
    f.write("MODEL PERFORMANCE:\n")
    for model_name, res in results.items():
        f.write(f"\n{res['model_name']}:\n")
        for metric_label, metric_key in report_metrics:
            f.write(f"{metric_label}{_fmt_metric(res.get(metric_key, 'N/A'))}\n")

    f.write(f"\n{'='*50}\n")
    f.write(f"BEST MODEL: {best_model_name.upper()} (Test AUC: {best_auc:.4f})\n")
    f.write(f"{'='*50}\n")

    f.write(f"\nOutput directory: {output_dir}\n")
    f.write("Pipeline completed successfully!\n")

print(f"‚úì Final report saved to: {report_path}")

In [None]:
# Display final summary
# Console recap of the run; every value is read from variables set by earlier cells.
print("\n" + "=" * 80)
print("PIPELINE SUMMARY")
print("=" * 80)
print(f"\nüìä Dataset:")
print(f"   - Original shape: {df_original.shape}")
# Column count of the scaled training frame, i.e. raw plus engineered features.
print(f"   - Engineered features: {X_train.shape[1]}")
print(f"   - Train/Val/Test: {len(X_train)}/{len(X_val)}/{len(X_test)} samples")

print(f"\nü§ñ Models Trained:")
print(f"   - GLM (Logistic Regression)")
print(f"   - XGBoost")

# best_model_name / best_auc come from the model selection cell.
print(f"\nüèÜ Best Model: {best_model_name.upper()}")
print(f"   - Test AUC: {best_auc:.4f}")

print(f"\nüìÅ Output Directory: {output_dir}")
print(f"   - Models saved in: {output_dir / 'models'}")
print(f"   - Results saved in: {output_dir / 'results'}")
print(f"   - Plots saved in: {output_dir / 'plots'}")

print("\n" + "=" * 80)
print("‚úÖ COMPLETE PIPELINE FINISHED SUCCESSFULLY!")
print("=" * 80)