In [9]:
# --- Data Loading and Preprocessing Utilities for Google Colab ---
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, roc_auc_score
import xgboost as xgb



# File paths (Update paths based on your Drive folder structure)
train_path = '/content/fda_trainingset.csv'
test_path = '/content/fda_testset.csv'
sample_submission_path = '/content/sample_submission_FDA_file.csv'

# Load datasets
train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)
sample_submission = pd.read_csv(sample_submission_path)

# Drop unnecessary columns
cols_to_drop = [
    'max_monthly_income', 'min_monthly_income', 'nomalized_income_std_dev',
    'loan_default_probability', 'last_month_income', 'last_month_savings',
    'predicted_next_month_income', 'predicted_next_month_savings',
    'avg_quarterly_expenditure', 'financial_wellbeing_index'
]

def preprocess(imputer=None, scaler=None):
    if imputer is None:
        imputer = KNNImputer(n_neighbors=5)

    # Auto-detect target column
    target_col = 'Unnamed: 78' if 'Unnamed: 78' in train_df.columns else 'Y'

    X = train_df.drop(columns=[target_col] + cols_to_drop)
    y = train_df[target_col]
    X_test = test_df[X.columns]  # Ensure matching features

    # Apply imputer
    X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
    X_test = pd.DataFrame(imputer.transform(X_test), columns=X.columns)

    # Scale if needed
    if scaler:
        X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
        X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns)

    return X, y, X_test


def train_and_submit(model, X, y, X_test, submission_name):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_proba = model.predict_proba(X_val)[:, 1]
    precision = precision_score(y_val, (y_proba > 0.5).astype(int))
    recall = recall_score(y_val, (y_proba > 0.5).astype(int))
    auc = roc_auc_score(y_val, y_proba)
    print(f'{submission_name}: Precision={precision:.4f}, Recall={recall:.4f}, AUC-ROC={auc:.4f}')
    submission = sample_submission.copy()
    submission['Y'] = model.predict_proba(X_test)[:, 1]
    submission = pd.DataFrame({'ID': test_df['ID'], 'Y': test_proba})
    submission.to_csv(f'/content/{filename}.csv', index=False)
    print(f'Saved: {submission_path}')
    return precision, recall, auc

# Run XGBoost (mean imputation, StandardScaler)
X, y, X_test = preprocess(SimpleImputer(strategy='mean'), StandardScaler())
train_and_submit(
    xgb.XGBClassifier(
        n_estimators=200,
        max_depth=7,
        learning_rate=0.05,
        random_state=42,
        use_label_encoder=False,
        eval_metric='logloss'
    ),
    X, y, X_test,
    'submission_xgboost_mean_std'
)

Parameters: { "use_label_encoder" } are not used.



submission_xgboost_mean_std: Precision=0.7500, Recall=0.0309, AUC-ROC=0.9556


NameError: name 'test_proba' is not defined