In [16]:
from pathlib import Path
import pandas as pd
import numpy as np

PROJECT_ROOT = Path.cwd().parent
PROCESSED_PATH = PROJECT_ROOT / 'data/processed/application_features_baseline.csv'
RANDOM_SEED = 42

In [17]:
df = pd.read_csv(PROCESSED_PATH)

X = df.drop(columns='TARGET')
y = df['TARGET']

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    X, y,
    test_size=0.2,
    random_state=RANDOM_SEED,
    stratify=y
)

Stratification preseves default rate distribution among validation and train splits.

In [19]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(
    max_iter=10000,
    class_weight='balanced',
    n_jobs=-1
)

model.fit(X_train, y_train)

In [20]:
from sklearn.metrics import roc_auc_score, recall_score, precision_score

pred_proba = model.predict_proba(X_val)[:, 1]
pred = (pred_proba >= 0.5).astype(int)

roc_auc = roc_auc_score(y_val, pred_proba)
recall = recall_score(y_val, pred)
precision = precision_score(y_val, pred)

In [21]:
thresholds = np.linspace(0.1, 0.9, 9)
results = []

for t in thresholds:
    t_pred = (pred_proba >= t).astype(int)
    results.append({
        'threshold': t,
        'recall': recall_score(y_val, t_pred),
        'precision': precision_score(y_val, t_pred)
    })

pd.DataFrame(results)

Unnamed: 0,threshold,recall,precision
0,0.1,0.997382,0.081565
1,0.2,0.974421,0.088717
2,0.3,0.914804,0.103335
3,0.4,0.819134,0.125614
4,0.5,0.674522,0.155327
5,0.6,0.500705,0.195748
6,0.7,0.308157,0.253521
7,0.8,0.130312,0.329598
8,0.9,0.016918,0.340081


Higher threshold -> lower default risk, but less approvals

Lower threshold -> higher default risk, but more approvals

In [22]:
coef = pd.Series(
    model.coef_[0],
    index=X.columns
).sort_values()

coef

ext_source_3_missing          -4.140959
EXT_SOURCE_3                  -2.859135
ext_source_1_missing          -2.226680
EXT_SOURCE_2                  -1.993246
EXT_SOURCE_1                  -1.661313
days_employed_missing         -0.525166
flag_gender                   -0.362180
flag_car                      -0.301896
log_income                    -0.135253
credit_income_ratio           -0.058275
employed_years                -0.029697
CNT_FAM_MEMBERS               -0.013689
id_published_years            -0.012791
registration_years            -0.002946
age_years                      0.001190
REGION_POPULATION_RELATIVE     0.019278
has_many_children              0.132854
log_credit                     0.210070
REGION_RATING_CLIENT_W_CITY    0.211171
annuity_income_ratio           1.559288
dtype: float64

In [23]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

scaled_model = Pipeline([
    ('scaler', StandardScaler()),
    ('lr', LogisticRegression(
        class_weight='balanced',
        max_iter=10000
    ))
])

scaled_model.fit(X_train, y_train)

In [24]:
coef_scaled = pd.Series(
    scaled_model.named_steps['lr'].coef_[0],
    index=X.columns
).sort_values()

coef_scaled

EXT_SOURCE_3                  -1.802912
ext_source_3_missing          -1.660547
EXT_SOURCE_1                  -1.265731
ext_source_1_missing          -1.111442
EXT_SOURCE_2                  -0.405366
days_employed_missing         -0.200466
employed_years                -0.191859
flag_gender                   -0.172638
credit_income_ratio           -0.157542
flag_car                      -0.143600
log_income                    -0.067420
id_published_years            -0.052832
registration_years            -0.028412
CNT_FAM_MEMBERS               -0.012572
REGION_POPULATION_RELATIVE    -0.001406
age_years                      0.014080
has_many_children              0.015211
REGION_RATING_CLIENT_W_CITY    0.105236
annuity_income_ratio           0.148226
log_credit                     0.150373
dtype: float64

In [None]:
weak_features = [
    'age_years',
    'REGION_POPULATION_RELATIVE',
    'CNT_FAM_MEMBERS',
    'has_many_children'
]

X_train_reduced = X_train.drop(columns=weak_features)
X_val_reduced = X_val.drop(columns=weak_features)

scaled_model.fit(X_train_reduced, y_train)
roc_uac_reduced = roc_auc_score(
    y_val,
    scaled_model.predict_proba(X_val_reduced)[:, 1]
)

print(f'ROC-AUC: {roc_auc}, ROC-AUC reduced: {roc_uac_reduced}')

ROC-AUC: 0.7363202540176794, ROC-AUC reduced: 0.7361711363320527


ROC-AUC is slightly lower with dropped weak features; that means they have an impact on the model and shouldn't be dropped. Remained for stability and generalization of the model.