In [None]:
# --- Data Loading and Preprocessing Utilities ---
import pandas as pd
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import os


# Load the competition CSVs from /kaggle/input/competition.
# train_df supplies both the feature columns and the target column
# ('Unnamed: 78') that preprocess() reads.
train_df = pd.read_csv('/kaggle/input/competition/fda_trainingset.csv')
# test_df: preprocess() selects from it the same feature columns it keeps from train_df.
test_df = pd.read_csv('/kaggle/input/competition/fda_testset.csv')
# sample_submission is copied by train_and_submit(), which overwrites its 'Y' column.
sample_submission = pd.read_csv('/kaggle/input/competition/Submission_File_FDA.csv')

# Columns that preprocess() removes from train_df (together with the target
# column) before imputation/scaling. The reason these particular columns are
# excluded is not recorded in this file.
# NOTE(review): 'nomalized_income_std_dev' looks like a misspelling of
# "normalized"; presumably it matches the CSV header as-is -- verify against
# the data before renaming.
cols_to_drop = [
    'max_monthly_income', 'min_monthly_income', 'nomalized_income_std_dev',
    'loan_default_probability', 'last_month_income', 'last_month_savings',
    'predicted_next_month_income', 'predicted_next_month_savings',
    'avg_quarterly_expenditure', 'financial_wellbeing_index'
]

def preprocess(imputer, scaler):
    """Build imputed (and optionally scaled) train/test feature frames.

    Reads the module-level ``train_df``, ``test_df`` and ``cols_to_drop``.
    The target column ``'Unnamed: 78'`` and every column in ``cols_to_drop``
    are removed from the training features; the test features are the same
    columns selected from ``test_df``.

    Args:
        imputer: Object exposing ``fit_transform`` / ``transform``. It is
            fitted on the training features (and is mutated by that fit).
        scaler: Object exposing ``fit_transform`` / ``transform``, or
            ``None`` to skip scaling. Fitted on the imputed training
            features when given.

    Returns:
        Tuple ``(X, y, X_test)``: the transformed training features, the
        target column taken from ``train_df``, and the transformed test
        features. ``X`` and ``X_test`` are new DataFrames carrying the
        feature column names.
    """
    target_col = 'Unnamed: 78'
    raw_features = train_df.drop(columns=[target_col] + cols_to_drop)
    y = train_df[target_col]
    feature_names = raw_features.columns
    raw_test = test_df[feature_names]

    # The transformers return plain arrays, so re-wrap them with the
    # original column names after each step.
    X = pd.DataFrame(imputer.fit_transform(raw_features), columns=feature_names)
    X_test = pd.DataFrame(imputer.transform(raw_test), columns=feature_names)

    if scaler is None:
        return X, y, X_test

    X = pd.DataFrame(scaler.fit_transform(X), columns=feature_names)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=feature_names)
    return X, y, X_test

def train_and_submit(model, X, y, X_test, submission_name):
    """Fit ``model`` on a hold-out split, print metrics, and write a submission CSV.

    The data is split 80/20 with ``random_state=42``; the model is fitted on
    the 80% portion only, evaluated on the remaining 20%, and that same
    fitted model scores ``X_test`` for the submission file.

    Scores come from ``predict_proba`` (positive-class column) when the model
    has it, otherwise from ``decision_function``.

    Args:
        model: Unfitted classifier exposing ``fit`` and either
            ``predict_proba`` or ``decision_function``.
        X: Training features.
        y: Training labels.
        X_test: Test features to score for the submission.
        submission_name: Label used in the printed report and as the output
            file stem; the file is written to ``submission_name + '.csv'``.

    Returns:
        Tuple ``(precision, recall, auc)`` measured on the validation split.
    """
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)

    use_proba = hasattr(model, 'predict_proba')

    def _score(features):
        # Continuous score for the positive class from whichever API exists.
        if use_proba:
            return model.predict_proba(features)[:, 1]
        return model.decision_function(features)

    # Probabilities are cut at 0.5, but decision_function values are signed
    # distances whose class boundary is 0 -- cutting those at 0.5 would
    # mislabel every sample scoring in (0, 0.5].
    threshold = 0.5 if use_proba else 0.0

    y_proba = _score(X_val)
    y_pred = (y_proba > threshold).astype(int)
    precision = precision_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    auc = roc_auc_score(y_val, y_proba)  # rank-based, so it uses the raw scores
    print(f'{submission_name}: Precision={precision:.4f}, Recall={recall:.4f}, AUC-ROC={auc:.4f}')

    # Copy so the shared sample_submission template is never modified.
    submission = sample_submission.copy()
    submission['Y'] = _score(X_test)
    submission_path = submission_name + '.csv'
    submission.to_csv(submission_path, index=False)
    print(f'Saved: {submission_path}')
    return precision, recall, auc

# --- Experiment grid: one submission file per (imputer, scaler, model) combination ---

# Factories return fresh, unfitted objects so no fitted state is shared
# between experiments.
IMPUTER_FACTORIES = {
    'mean': lambda: SimpleImputer(strategy='mean'),
    'median': lambda: SimpleImputer(strategy='median'),
    'mode': lambda: SimpleImputer(strategy='most_frequent'),
    'knn': lambda: KNNImputer(n_neighbors=5),
}

SCALER_FACTORIES = {
    'std': StandardScaler,
    'minmax': MinMaxScaler,
    None: None,  # no scaling
}


def _make_lightgbm():
    return lgb.LGBMClassifier(n_estimators=200, max_depth=7, learning_rate=0.05, random_state=42)


def _make_xgboost():
    return xgb.XGBClassifier(n_estimators=200, max_depth=7, learning_rate=0.05, random_state=42, use_label_encoder=False, eval_metric='logloss')


def _make_catboost():
    return cb.CatBoostClassifier(iterations=200, max_depth=7, learning_rate=0.05, random_seed=42, verbose=0)


def _make_random_forest():
    return RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42)


def _make_adaboost():
    return AdaBoostClassifier(n_estimators=200, learning_rate=0.7, random_state=42)


# (submission name, imputer key, scaler key, model factory), run in this order.
EXPERIMENTS = [
    ('submission_lightgbm_mean_std', 'mean', 'std', _make_lightgbm),                # 1
    ('submission_xgboost_mean_std', 'mean', 'std', _make_xgboost),                  # 2
    ('submission_catboost_mean_std', 'mean', 'std', _make_catboost),                # 3
    ('submission_random_forest_mean_std', 'mean', 'std', _make_random_forest),      # 4
    ('submission_adaboost_mean_std', 'mean', 'std', _make_adaboost),                # 5
    ('submission_lightgbm_median_minmax', 'median', 'minmax', _make_lightgbm),      # 6
    ('submission_xgboost_knnimp_std', 'knn', 'std', _make_xgboost),                 # 7
    ('submission_catboost_mode_noscale', 'mode', None, _make_catboost),             # 8
    ('submission_random_forest_knnimp_minmax', 'knn', 'minmax', _make_random_forest),  # 9
    ('submission_adaboost_median_std', 'median', 'std', _make_adaboost),            # 10
]

experiment_rows = []
previous_config = None
for submission_name, imputer_key, scaler_key, make_model in EXPERIMENTS:
    config = (imputer_key, scaler_key)
    # Consecutive experiments with the same imputer/scaler reuse the frames
    # already computed instead of re-running identical preprocessing. Only
    # the most recent result is kept, so memory use matches a plain
    # one-call-per-experiment script.
    if config != previous_config:
        scaler_factory = SCALER_FACTORIES[scaler_key]
        X, y, X_test = preprocess(
            IMPUTER_FACTORIES[imputer_key](),
            scaler_factory() if scaler_factory is not None else None,
        )
        previous_config = config

    precision, recall, auc = train_and_submit(make_model(), X, y, X_test, submission_name)
    experiment_rows.append({
        'submission': submission_name,
        'precision': precision,
        'recall': recall,
        'auc': auc,
    })

# Keep the validation metrics in one table so the runs can be compared.
experiment_results = pd.DataFrame(
    experiment_rows, columns=['submission', 'precision', 'recall', 'auc']
)
print(experiment_results.sort_values('auc', ascending=False).to_string(index=False))