In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    roc_auc_score, average_precision_score,
    precision_recall_curve, f1_score, classification_report,
    confusion_matrix
)
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from imblearn.ensemble import BalancedBaggingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.base import clone

In [40]:
def preprocess_data(app_data, prev_app):
    """Build the model feature table from the application and previous-application frames.

    Steps, in order:
      1. Robustly cap numeric columns of `app_data` (median +/- 3.5 scaled MAD).
      2. Cap the financial columns of `prev_app`, derive per-row features and
         aggregate them to one row per ``SK_ID_CURR``.
      3. Left-join the aggregates onto the applications.
      4. Add ratio / age / combined-score features and cap the new ratios.
      5. Drop raw helper columns, the identifier and the label, then fill NaN with 0.

    Parameters
    ----------
    app_data : pandas.DataFrame
        Application table. Must contain the columns referenced below
        (``SK_ID_CURR``, ``AMT_*``, ``DAYS_*``, ``CNT_*``, ``EXT_SOURCE_*``).
        A ``TARGET`` column is optional.
    prev_app : pandas.DataFrame
        Previous-application table keyed by ``SK_ID_CURR``.

    Returns
    -------
    pandas.DataFrame
        One row per row of `app_data`, in the same order, with a fresh
        RangeIndex (a consequence of the merge). Neither inputs are modified.
        ``SK_ID_CURR`` and ``TARGET`` are not part of the result: the label must
        be taken from `app_data` by the caller.
    """
    id_col = 'SK_ID_CURR'
    target_col = 'TARGET'

    app_data_clean = app_data.copy()
    prev_clean = prev_app.copy()

    def cap_outliers(df, columns, threshold=3.5):
        """Clip int64/float64 columns of `df` to median +/- threshold * scaled MAD."""
        for col in columns:
            if df[col].dtype not in ['int64', 'float64']:
                continue
            median = df[col].median()
            # Use pandas' NaN-skipping median for the deviations as well:
            # np.median returns NaN as soon as one value is missing, which
            # silently disabled capping for every column with gaps.
            # 1.4826 is the usual factor that scales MAD to a std-dev estimate.
            mad = 1.4826 * (df[col] - median).abs().median()
            # MAD == 0 means more than half of the values equal the median
            # (flags, counts, mostly-zero columns). Clipping to [median, median]
            # would flatten the column to a constant, so leave it untouched.
            if not np.isfinite(mad) or mad == 0:
                continue
            df[col] = df[col].clip(lower=median - threshold * mad,
                                   upper=median + threshold * mad)
        return df

    numeric_cols = app_data_clean.select_dtypes(include=['int64', 'float64']).columns
    # The join key and the label are not measurements; never clip them.
    cappable_cols = [col for col in numeric_cols if col not in (id_col, target_col)]
    app_data_clean = cap_outliers(app_data_clean, cappable_cols)

    # --- previous_application processing ---
    prev_clean['DAYS_DECISION'] = prev_clean['DAYS_DECISION'].abs()

    financial_cols = ['AMT_APPLICATION', 'AMT_CREDIT', 'AMT_ANNUITY', 'AMT_GOODS_PRICE']
    prev_clean = cap_outliers(prev_clean, financial_cols)

    # Requested / granted amount; 0 when nothing was granted (or credit is missing).
    prev_clean['APP_CREDIT_RATIO'] = np.where(
        prev_clean['AMT_CREDIT'] > 0,
        prev_clean['AMT_APPLICATION'] / prev_clean['AMT_CREDIT'],
        0
    )

    # --- aggregation of prev_app to one row per applicant ---
    prev_clean['APPROVED'] = (prev_clean['NAME_CONTRACT_STATUS'] == 'Approved').astype(int)

    # Named aggregation ties every output name to its source column and
    # function, instead of relying on the positional order of a flattened
    # MultiIndex.
    prev_agg = prev_clean.groupby(id_col).agg(
        PREV_CREDIT_MEAN=('AMT_CREDIT', 'mean'),
        PREV_CREDIT_MAX=('AMT_CREDIT', 'max'),
        PREV_CREDIT_SUM=('AMT_CREDIT', 'sum'),
        PREV_APP_COUNT=('NAME_CONTRACT_STATUS', 'count'),
        PREV_APPROVED_RATE=('APPROVED', 'mean'),
        MEAN_APP_CREDIT_RATIO=('APP_CREDIT_RATIO', 'mean'),
        EARLIEST_DECISION=('DAYS_DECISION', 'min'),
        LATEST_DECISION=('DAYS_DECISION', 'max'),
    ).reset_index()

    # --- join with app_data (applicants without history get NaN, filled below) ---
    df = app_data_clean.merge(prev_agg, on=id_col, how='left')

    # --- feature engineering ---
    def safe_divide(a, b):
        """Element-wise a / b with 0 wherever the denominator is exactly 0."""
        return pd.Series(np.where(b != 0, a / b, 0), index=a.index)

    # NOTE: despite its name this column is credit / income; the name is kept
    # because downstream code may reference it.
    df['INCOME_CREDIT_RATIO'] = safe_divide(df['AMT_CREDIT'], df['AMT_INCOME_TOTAL'])
    df['ANNUITY_INCOME_RATIO'] = safe_divide(df['AMT_ANNUITY'], df['AMT_INCOME_TOTAL'])
    df['WORK_EXPERIENCE_RATIO'] = safe_divide(-df['DAYS_EMPLOYED'] / 365, -df['DAYS_BIRTH'] / 365)
    df['AGE'] = -df['DAYS_BIRTH'] / 365
    df['DAYS_BETWEEN_APPS'] = df['LATEST_DECISION'] - df['EARLIEST_DECISION']
    df['DEPENDENTS_RATIO'] = safe_divide(df['CNT_CHILDREN'], df['CNT_FAM_MEMBERS'])

    # Fixed-weight blend of the three external scores; a missing score counts as 0.
    df['EXT_SOURCE_COMBINED'] = (
        df['EXT_SOURCE_1'].fillna(0) * 0.3 +
        df['EXT_SOURCE_2'].fillna(0) * 0.5 +
        df['EXT_SOURCE_3'].fillna(0) * 0.2
    )

    new_features = ['INCOME_CREDIT_RATIO', 'ANNUITY_INCOME_RATIO', 'WORK_EXPERIENCE_RATIO', 'DEPENDENTS_RATIO']
    df = cap_outliers(df, new_features)

    cols_to_drop = [
        'DAYS_BIRTH', 'DAYS_EMPLOYED', 'FLAG_MOBIL', 'FLAG_EMP_PHONE',
        'FLAG_WORK_PHONE', 'FLAG_CONT_MOBILE', 'FLAG_PHONE', 'FLAG_EMAIL',
        'EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3', id_col,
        # The label must never be part of the feature matrix.
        target_col,
    ]
    df = df.drop(columns=[col for col in cols_to_drop if col in df.columns])

    return df.fillna(0)

In [41]:
def add_log_features(X, cols):
    """Return a copy of `X` with a ``<col>_LOG`` = log1p(col) column per listed column.

    Parameters
    ----------
    X : pandas.DataFrame
        Input features. It is NOT modified; earlier versions wrote into the
        caller's frame, which made cells non-idempotent and raised
        SettingWithCopyWarning when `X` was a slice of another frame.
    cols : iterable of str
        Source columns. Names that are not present in `X` are skipped.

    Returns
    -------
    pandas.DataFrame
        New frame with the extra ``*_LOG`` columns appended (an already
        existing ``*_LOG`` column is overwritten in place).
    """
    log_features = {f'{col}_LOG': np.log1p(X[col]) for col in cols if col in X.columns}
    return X.assign(**log_features)

In [42]:
def compute_best_threshold(y_true, y_proba):
    """Pick the probability cut-off with the highest F1 on the given labels.

    Returns a ``(threshold, f1)`` pair taken at the arg-max of the F1 values
    computed along the precision-recall curve. The 1e-6 in the denominator
    guards against 0 / 0 when precision and recall are both zero.
    """
    precision, recall, cutoffs = precision_recall_curve(y_true, y_proba)

    numerator = 2 * (precision * recall)
    denominator = precision + recall + 1e-6
    f1_curve = numerator / denominator

    winner = np.argmax(f1_curve)
    return cutoffs[winner], f1_curve[winner]


def evaluate_model(y_true, y_proba, threshold):
    """Score predicted probabilities and print a short report.

    Probabilities at or above `threshold` are labelled 1, the rest 0.
    Prints the headline metrics, the classification report and the confusion
    matrix, and returns ``(roc_auc, pr_auc, f1)``.
    """
    predicted_labels = (y_proba >= threshold).astype(int)

    scores = (
        roc_auc_score(y_true, y_proba),
        average_precision_score(y_true, y_proba),
        f1_score(y_true, predicted_labels),
    )
    auc_roc, auc_pr, f1_value = scores

    print(f"Threshold: {threshold:.2f} | ROC-AUC: {auc_roc:.4f} | PR-AUC: {auc_pr:.4f} | F1: {f1_value:.4f}")
    print(classification_report(y_true, predicted_labels))
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, predicted_labels))
    return scores


def get_preprocessor(X, num_cols, cat_cols):
    """Create the (unfitted) column-wise preprocessing step.

    `num_cols` are standardised; `cat_cols` are ordinal-encoded, with
    categories unseen at fit time mapped to -1. `X` is accepted for call-site
    compatibility but is not read here.
    """
    numeric_step = ('num', StandardScaler(), num_cols)
    categorical_step = (
        'cat',
        OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
        cat_cols,
    )
    return ColumnTransformer(transformers=[numeric_step, categorical_step])

In [43]:
def _as_category_strings(series):
    """Return `series` as strings with missing values labelled "missing"."""
    # Fill BEFORE casting: astype(str) turns NaN into the literal "nan", after
    # which a fillna has nothing left to fill.
    return series.astype(object).where(series.notna(), "missing").astype(str)


def _one_hot_like_train(X_train, X_test):
    """One-hot encode both frames using only the categories seen in `X_train`.

    Calling get_dummies(drop_first=True) independently on each frame can drop
    a different baseline level per frame, so a test row may end up with the
    wrong indicator set. Pinning the categories to the training ones gives
    both frames the same columns; categories unseen in training become
    all-zero rows.
    """
    X_train, X_test = X_train.copy(), X_test.copy()
    for col in X_train.select_dtypes(include=['object', 'category']).columns:
        train_categories = pd.Categorical(X_train[col]).categories
        X_train[col] = pd.Categorical(X_train[col], categories=train_categories)
        X_test[col] = pd.Categorical(X_test[col], categories=train_categories)

    X_train = pd.get_dummies(X_train, drop_first=True)
    X_test = pd.get_dummies(X_test, drop_first=True)
    return X_train.align(X_test, join='left', axis=1, fill_value=0)


def cross_val_pipeline(X, y, model, num_cols, cat_cols=None, use_smote=False, use_onehot=False,
                       preprocess_manually=False, verbose=True):
    """Run stratified 5-fold cross-validation of `model` and print per-fold and mean metrics.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; rows are selected positionally per fold.
    y : pandas.Series
        Binary labels, positionally aligned with `X`.
    model : estimator
        Unfitted estimator with ``fit`` / ``predict_proba``; cloned for every fold.
    num_cols, cat_cols : list of str
        Numeric and categorical column names. `cat_cols` may be None.
    use_smote : bool
        Oversample the training part of each fold with SMOTE.
    use_onehot : bool
        One-hot encode categorical columns (fitted on the training part).
    preprocess_manually : bool
        Scale/encode with `get_preprocessor` fitted on the training part, for
        models that are not wrapped in their own pipeline.
    verbose : bool
        Print the fold header.

    Returns
    -------
    dict
        Mean ``roc_auc``, ``pr_auc`` and ``f1`` over the folds.
    """
    cat_cols = list(cat_cols) if cat_cols else []
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    roc_aucs, pr_aucs, f1s = [], [], []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        if verbose:
            print(f"\nFold {fold}")

        X_train, X_test = X.iloc[train_idx].copy(), X.iloc[test_idx].copy()
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        for col in cat_cols:
            X_train[col] = _as_category_strings(X_train[col])
            X_test[col] = _as_category_strings(X_test[col])

        if preprocess_manually:
            log_cols = [f'{c}_LOG' for c in ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY'] if
                        f'{c}_LOG' in X_train.columns]
            # dict.fromkeys de-duplicates while keeping a stable column order;
            # a set would reorder features depending on the hash seed.
            num_cols_full = list(dict.fromkeys(list(num_cols) + log_cols))

            preprocessor = get_preprocessor(X_train, num_cols_full, cat_cols)
            X_train = preprocessor.fit_transform(X_train)
            X_test = preprocessor.transform(X_test)

        if use_onehot:
            X_train, X_test = _one_hot_like_train(X_train, X_test)

        if use_smote:
            # Resample the training part only; the held-out part keeps its
            # natural class balance.
            sm = SMOTE(random_state=42)
            X_train, y_train = sm.fit_resample(X_train, y_train)

        clf = clone(model)
        clf.fit(X_train, y_train)
        y_proba = clf.predict_proba(X_test)[:, 1]

        # NOTE(review): the threshold is tuned on the same fold it is scored
        # on, so the reported F1 is an optimistic upper bound. ROC-AUC and
        # PR-AUC do not depend on the threshold and are unaffected.
        threshold, best_f1 = compute_best_threshold(y_test, y_proba)
        roc_auc, pr_auc, f1 = evaluate_model(y_test, y_proba, threshold)

        roc_aucs.append(roc_auc)
        pr_aucs.append(pr_auc)
        f1s.append(f1)

    mean_scores = {
        'roc_auc': float(np.mean(roc_aucs)),
        'pr_auc': float(np.mean(pr_aucs)),
        'f1': float(np.mean(f1s)),
    }

    print("\n--- Средние метрики ---")
    print(f"ROC-AUC: {mean_scores['roc_auc']:.4f}")
    print(f"PR-AUC: {mean_scores['pr_auc']:.4f}")
    print(f"F1-score: {mean_scores['f1']:.4f}")

    return mean_scores

In [44]:
# Load the two raw tables from the working directory: current applications
# first, then the history of previous applications.
app_data, prev_app = [
    pd.read_csv(csv_name)
    for csv_name in ('application_data.csv', 'previous_application.csv')
]

In [45]:
# Keep only rows with a valid binary label. Resetting the index gives the
# labels the same 0..n-1 index that preprocess_data's merge produces for the
# features, so X and y agree by label as well as by position.
app_data = app_data[app_data['TARGET'].isin([0, 1])].reset_index(drop=True)
y = app_data['TARGET']

# The label is removed before feature building so it can never end up in X.
X = preprocess_data(app_data.drop(columns=['TARGET']), prev_app)

assert 'TARGET' not in X.columns, "label leaked into the feature matrix"
assert X.index.equals(y.index), "features and labels are not row-aligned"

In [None]:
# Hold out 20% as the final test set, stratified on the label.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Add the log features to BOTH splits. When only the training split had them,
# the later column alignment filled the test split's *_LOG columns with zeros
# for the tree models. Copies are passed so the frames returned by the split
# are never written to in place.
log_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY']
X_train_full = add_log_features(X_train_full.copy(), log_cols)
X_test = add_log_features(X_test.copy(), log_cols)

# Column roles are derived from the training split only.
num_cols = X_train_full.select_dtypes(include=[np.number]).columns.tolist()
cat_cols = X_train_full.select_dtypes(include=['object', 'category']).columns.tolist()

# Class-imbalance weight for XGBoost, computed from the training labels only
# so that nothing about the held-out labels enters model configuration.
pos_weight = (y_train_full == 0).sum() / (y_train_full == 1).sum()

# Candidate models, keyed by the name used to pick their preprocessing below.
models = {
    # Logistic regression with its own scaling/encoding pipeline; the column
    # lists were derived from X_train_full, so that frame is passed along.
    'LogReg': Pipeline([
        ('prep', get_preprocessor(X_train_full, num_cols, cat_cols)),
        ('clf', LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear'))
    ]),
    # Bagged logistic regressions; expects an already scaled/encoded matrix.
    'Bagging+LogReg': BalancedBaggingClassifier(
        estimator=LogisticRegression(max_iter=1000, class_weight='balanced', solver='liblinear'),
        n_estimators=10, n_jobs=-1, random_state=42
    ),
    'XGBoost': XGBClassifier(
        n_estimators=300, learning_rate=0.05, max_depth=6,
        scale_pos_weight=pos_weight,
        use_label_encoder=False, eval_metric='auc', n_jobs=-1, random_state=42
    ),
    # NOTE(review): unlike the other three models, CatBoost is configured
    # without any class weighting - confirm this is intentional.
    'CatBoost': CatBoostClassifier(
        iterations=300, learning_rate=0.05, depth=6,
        eval_metric='AUC', verbose=0, random_seed=42
    ),
}

for name, model in models.items():
    print(f"\n========== {name} ==========")
    cross_val_pipeline(
        X_train_full.copy(), y_train_full, model,
        num_cols=num_cols,
        cat_cols=cat_cols,
        use_smote=False,
        use_onehot=(name in ['XGBoost', 'CatBoost']),
        preprocess_manually=(name == 'Bagging+LogReg'),
        verbose=True
    )

    print(f"\nОценка на тестовом наборе для {name}:")

    # Shared preparation for every model: log features on BOTH splits and
    # categoricals as strings. Previously the tree-model branch skipped the
    # log features for the test split, so the column alignment filled them
    # with zeros at prediction time.
    log_cols = ['AMT_INCOME_TOTAL', 'AMT_CREDIT', 'AMT_ANNUITY']
    X_train_test = add_log_features(X_train_full.copy(), log_cols)
    X_test_test = add_log_features(X_test.copy(), log_cols)

    for col in cat_cols:
        if col in X_train_test.columns:
            # Fill before casting: astype(str) would turn NaN into "nan" and
            # leave nothing for fillna to replace.
            X_train_test[col] = (
                X_train_test[col].astype(object)
                .where(X_train_test[col].notna(), "missing").astype(str)
            )
            X_test_test[col] = (
                X_test_test[col].astype(object)
                .where(X_test_test[col].notna(), "missing").astype(str)
            )

    if name == 'LogReg':
        # The pipeline scales/encodes on its own; only verify its inputs exist.
        required_cols = num_cols + [f'{c}_LOG' for c in log_cols] + cat_cols
        missing_cols = set(required_cols) - set(X_train_test.columns)
        if missing_cols:
            print(f"Предупреждение: отсутствуют столбцы {missing_cols}")

    elif name == 'Bagging+LogReg':
        log_cols_final = [f'{c}_LOG' for c in log_cols if f'{c}_LOG' in X_train_test.columns]
        # dict.fromkeys de-duplicates while keeping a stable column order; a
        # set would reorder features depending on the hash seed.
        num_cols_full = [
            col for col in dict.fromkeys(num_cols + log_cols_final)
            if col in X_train_test.columns
        ]
        valid_cat_cols = [col for col in cat_cols if col in X_train_test.columns]

        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), num_cols_full),
                ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), valid_cat_cols)
            ],
            remainder='drop'
        )

        X_train_test = preprocessor.fit_transform(X_train_test)
        X_test_test = preprocessor.transform(X_test_test)

    elif name in ['XGBoost', 'CatBoost']:
        X_train_test = pd.get_dummies(X_train_test, columns=cat_cols)
        X_test_test = pd.get_dummies(X_test_test, columns=cat_cols)

        # Keep exactly the training columns: indicators seen only in the test
        # split are dropped, indicators missing from it are filled with 0.
        X_train_test, X_test_test = X_train_test.align(X_test_test, join='left', axis=1, fill_value=0)

        # Explicit float matrix: with boolean dummy columns next to float
        # columns, a plain .values would produce an object-dtype array.
        X_train_test = X_train_test.to_numpy(dtype=float)
        X_test_test = X_test_test.to_numpy(dtype=float)

    final_model = clone(model)
    final_model.fit(X_train_test, y_train_full)

    y_proba_test = final_model.predict_proba(X_test_test)[:, 1]
    # NOTE(review): the threshold is tuned on the test labels themselves, so
    # the F1 / classification report below are optimistic. ROC-AUC and PR-AUC
    # do not depend on the threshold and are unaffected.
    thresh_test, _ = compute_best_threshold(y_test, y_proba_test)
    evaluate_model(y_test, y_proba_test, thresh_test)