## Setup

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.compose import ColumnTransformer

# Put the parent of the current working directory on the import path,
# skipping the append when it is already listed.
root = Path.cwd().parent
root_entry = str(root)
if root_entry not in sys.path:
    sys.path.append(root_entry)

In [None]:
from src.kaggle_utils import download_competition

# Competition identifier handed to download_competition; the bare name on
# the last line shows the returned value as the cell output.
competition = "playground-series-s5e11"
data_dir = download_competition(competition)
data_dir

In [None]:
# Load data (both files are read from the downloaded competition directory)
data, test = (
    pd.read_csv(data_dir / csv_name) for csv_name in ("train.csv", "test.csv")
)

# Train/validation split: hold out 20% of the rows, seeded for repeatability
train, valid = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

print(f"Train shape: {train.shape}")
print(f"Valid shape: {valid.shape}")
print(f"Test shape: {test.shape}")

## Feature Engineering Pipeline

In [None]:
# Define feature groups
cats = ['gender', 'marital_status', 'education_level', 'employment_status', 'loan_purpose', 'grade_subgrade']
conts = ['annual_income', 'debt_to_income_ratio', 'credit_score', 'loan_amount', 'interest_rate']
dep = 'loan_paid_back'

def create_features(df):
    """Return a feature-engineered copy of ``df``; the input is not modified.

    Steps:
      1. Replace every column listed in ``cats`` with integer category codes.
      2. Add ``debt`` (debt-to-income ratio x annual income) and
         ``loan_income_ratio`` (loan amount / annual income), both computed
         from the raw, untransformed amounts.
      3. Replace ``loan_amount`` and ``annual_income`` with their ``log1p``
         values.

    Note: the codes in step 1 are derived from the categories of the frame
    being transformed. For codes that mean the same thing across several
    frames, pass columns that already share one categorical dtype.

    Args:
        df: DataFrame containing the ``cats`` columns plus ``annual_income``,
            ``loan_amount`` and ``debt_to_income_ratio``.

    Returns:
        A new DataFrame with the original columns (encoded / transformed as
        described) followed by ``debt`` and ``loan_income_ratio``.
    """
    df = df.copy()

    # Encode categorical variables
    for cat in cats:
        df[cat] = pd.Categorical(df[cat]).codes

    # Create new features BEFORE the log transform: multiplying or dividing
    # log-scaled amounts would no longer yield a debt amount or an income ratio.
    df['debt'] = df['debt_to_income_ratio'] * df['annual_income']
    df['loan_income_ratio'] = df['loan_amount'] / df['annual_income']

    # Log transform skewed features
    df['loan_amount'] = np.log1p(df['loan_amount'])
    df['annual_income'] = np.log1p(df['annual_income'])

    return df

In [None]:
# Create preprocessing pipeline
preprocessor = FunctionTransformer(create_features)

# Build one categorical dtype per column from every level present in the
# labelled data and the test set. create_features integer-encodes each frame
# from that frame's own categories, so without a shared dtype a level missing
# from one split would shift the codes of the other levels in that split only.
shared_cat_dtypes = {
    cat: pd.CategoricalDtype(
        pd.Categorical(pd.concat([data[cat], test[cat]], ignore_index=True)).categories
    )
    for cat in cats
}

# Prepare data (astype returns new frames; train/valid/test are left untouched)
train_proc = preprocessor.transform(train.astype(shared_cat_dtypes))
valid_proc = preprocessor.transform(valid.astype(shared_cat_dtypes))
test_proc = preprocessor.transform(test.astype(shared_cat_dtypes))

# Split features and target
train_X = train_proc.drop(columns=[dep, 'id'])
train_y = train_proc[dep]

valid_X = valid_proc.drop(columns=[dep, 'id'])
valid_y = valid_proc[dep]

# The test frame is only stripped of 'id'; the ids are kept for the submission
test_X = test_proc.drop(columns=['id'])
test_ids = test_proc['id']

print(f"Features: {list(train_X.columns)}")
print(f"Train X shape: {train_X.shape}")

## Model Training & Comparison

In [None]:
def evaluate_model(model, name):
    """Fit ``model`` on the training split and report its validation ROC AUC.

    Uses the module-level ``train_X``/``train_y`` for fitting and
    ``valid_X``/``valid_y`` for scoring, prints one formatted result line,
    and returns ``(model, roc_score)``.
    """
    model.fit(train_X, train_y)

    # Second predict_proba column is used as the score passed to ROC AUC.
    class_probabilities = model.predict_proba(valid_X)
    positive_scores = class_probabilities[:, 1]

    roc_score = roc_auc_score(valid_y, positive_scores)
    print(f"{name:30s} ROC AUC: {roc_score:.5f}")
    return model, roc_score

### Baseline: Random Forest

In [None]:
# Random forest hyperparameters, gathered in one mapping and unpacked into
# the constructor.
rf_params = {
    "n_estimators": 100,
    "min_samples_leaf": 5,
    "max_features": 8,
    "max_depth": 10,
    "random_state": 42,
    "n_jobs": -1,
}
rf = RandomForestClassifier(**rf_params)

rf_model, rf_score = evaluate_model(rf, "Random Forest")

### Gradient Boosting

In [None]:
# Gradient boosting hyperparameters, kept together and unpacked into the
# constructor.
gb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42,
)
gb = GradientBoostingClassifier(**gb_params)

gb_model, gb_score = evaluate_model(gb, "Gradient Boosting")

### XGBoost

In [None]:
# XGBoost is an optional dependency: only the import is guarded, so that an
# ImportError raised while building or fitting the model is not misreported
# as a missing installation.
try:
    from xgboost import XGBClassifier
except ImportError:
    print("XGBoost not installed. Run: pip install xgboost")
else:
    xgb = XGBClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        n_jobs=-1,
        eval_metric='logloss'
    )

    xgb_model, xgb_score = evaluate_model(xgb, "XGBoost")

### LightGBM

In [None]:
# LightGBM is an optional dependency: only the import is guarded, so that an
# ImportError raised while building or fitting the model is not misreported
# as a missing installation.
try:
    from lightgbm import LGBMClassifier
except ImportError:
    print("LightGBM not installed. Run: pip install lightgbm")
else:
    lgb = LGBMClassifier(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )

    lgb_model, lgb_score = evaluate_model(lgb, "LightGBM")

### Logistic Regression (Baseline)

In [None]:
# Scale features for logistic regression: the scaler is fitted on the
# training features only and then applied to both splits.
scaler = StandardScaler().fit(train_X)
train_X_scaled = scaler.transform(train_X)
valid_X_scaled = scaler.transform(valid_X)

lr = LogisticRegression(
    max_iter=1000,
    random_state=42,
    n_jobs=-1,
)
lr.fit(train_X_scaled, train_y)

# Score the validation split with the second predict_proba column.
valid_preds = lr.predict_proba(valid_X_scaled)[:, 1]
lr_score = roc_auc_score(valid_y, valid_preds)
print(f"{'Logistic Regression':30s} ROC AUC: {lr_score:.5f}")

## Feature Importance (Best Model)

In [None]:
def show_feature_importance(model, top_n=10):
    """Plot the ``top_n`` largest feature importances of ``model``.

    Feature names are taken from the module-level ``train_X`` columns. A
    model without a ``feature_importances_`` attribute only produces a
    printed message. Nothing is returned.
    """
    if not hasattr(model, 'feature_importances_'):
        print("Model does not have feature_importances_ attribute")
        return

    importances = pd.DataFrame({
        'feature': train_X.columns,
        'importance': model.feature_importances_
    })

    # Keep the top_n largest values, then re-sort ascending for the
    # horizontal bar chart.
    largest = importances.sort_values('importance', ascending=False).head(top_n)
    ordered = largest.sort_values('importance', ascending=True)

    ordered.plot(
        x='feature',
        y='importance',
        kind='barh',
        figsize=(10, 6),
        title=f'Top {top_n} Feature Importances',
    )

# Plot the ten largest importances of the fitted random forest.
show_feature_importance(rf_model, top_n=10)

## Submission

In [None]:
def create_submission(model, filename, scale=False, output_dir="playground-series-s5e11_3"):
    """Generate test-set predictions and write them to a submission CSV.

    Reads the module-level ``test_X`` and ``test_ids`` (and ``scaler`` when
    ``scale`` is true).

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        filename: Name of the CSV file created inside ``output_dir``.
        scale: If true, transform the test features with the module-level
            ``scaler`` before predicting; use this for models that were
            trained on scaled features.
        output_dir: Directory that receives the file. It is created,
            including missing parent directories, when it does not exist.
            Defaults to the directory this notebook has always written to.

    Returns:
        DataFrame with the columns ``id`` and ``loan_paid_back`` that was
        written to disk.
    """
    test_X_input = scaler.transform(test_X) if scale else test_X

    # Second predict_proba column is written as the predicted probability.
    preds = model.predict_proba(test_X_input)[:, 1]
    submission = pd.DataFrame({
        'id': test_ids,
        'loan_paid_back': preds
    })

    output_dir = Path(output_dir)
    # parents=True lets callers pass a nested directory that does not exist yet.
    output_dir.mkdir(parents=True, exist_ok=True)

    filepath = output_dir / filename
    submission.to_csv(filepath, index=False)
    print(f"Submission saved to {filepath}")
    return submission

In [None]:
# Submit best model (update model name as needed); the trailing head() call
# previews the first rows of the written submission as the cell output.
submission = create_submission(rf_model, filename='submission_rf.csv')
submission.head()

## Model Experimentation

Use the cells below to experiment with hyperparameters, ensembles, etc.