In [8]:
from pathlib import Path
import pandas as pd
import numpy as np


PROJECT_ROOT = Path.cwd().parent
PROCESSED_PATH = PROJECT_ROOT / 'data/processed/application_features_baseline.csv'
RANDOM_SEED = 42

INTERIM_PATH = PROJECT_ROOT / 'data/interim'
INTERIM_BUREAU = INTERIM_PATH / 'bureau_agg.csv'

### Preprocessing data before fiting Logistic regression

---

In [5]:
df = pd.read_csv(PROCESSED_PATH)


In [9]:
nan_df = pd.DataFrame({
    'Count': df.isna().sum(),
    'pct': df.isna().mean() * 100
})

nan_df = nan_df[nan_df['Count'] > 0].sort_values('pct', ascending=False).reset_index()
nan_df

Unnamed: 0,index,Count,pct
0,annuity_mean,227502,73.981744
1,prepaid_ratio_avg,215280,70.007252
2,loan_duration_max,215280,70.007252
3,paid_in_time_avg,215280,70.007252
4,bad_dpd_times_max,215280,70.007252
...,...,...,...
100,AMT_GOODS_PRICE,278,0.090403
101,annuity_income_ratio,12,0.003902
102,AMT_ANNUITY,12,0.003902
103,CNT_FAM_MEMBERS,2,0.000650


Fill missing values for clients without credit history

In [12]:
bureau_df = pd.read_csv(INTERIM_BUREAU)
bureau_related_columns = bureau_df.drop(columns='SK_ID_CURR').columns
pos_bureau_related_columns = bureau_df[bureau_related_columns].columns[bureau_df[bureau_related_columns].min() >= 0]

df['has_credit_history'] = df['first_credit_time'].notna().astype(int)

df[bureau_related_columns] = df[bureau_related_columns].fillna(-1)

# try to impute not positive columns differently further

TypeError: '>=' not supported between instances of 'str' and 'int'

Fill missing values for cliens without enquiries to Credit Bureau

In [None]:
df['has_credit_enquiries'] = bureau_df['AMT_REQ_CREDIT_BUREAU_YEAR'].notna().astype(int)

enquiries_columns = ['AMT_REQ_CREDIT_BUREAU_HOUR', 'AMT_REQ_CREDIT_BUREAU_DAY', 'AMT_REQ_CREDIT_BUREAU_WEEK', 'AMT_REQ_CREDIT_BUREAU_MON', 'AMT_REQ_CREDIT_BUREAU_QRT', 'AMT_REQ_CREDIT_BUREAU_YEAR']
df[enquiries_columns] = df[enquiries_columns].fillna(-1)

In [4]:
df['credit_annuity_ratio'] = df['AMT_ANNUITY'] / df['AMT_CREDIT']
df['credit_over_goods_price'] = df['AMT_CREDIT'] - df['AMT_GOODS_PRICE']
df['days_without_work'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']
df['credit_per_family_member'] = df['AMT_CREDIT'] / df['CNT_FAM_MEMBERS']
df['annuity_per_family_member'] = df['AMT_ANNUITY'] / df['CNT_FAM_MEMBERS']
df['income_per_family_member'] = df['AMT_INCOME_TOTAL'] / df['CNT_FAM_MEMBERS']

In [None]:
X = df.drop(columns='TARGET')
y = df['TARGET']

### Splitting data and fitting the model

---

In [6]:
from sklearn.model_selection import train_test_split

X_train, x_, y_train, y_ = train_test_split(
    X, y,
    test_size=0.3,
    random_state=RANDOM_SEED,
    stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    x_, y_,
    test_size=0.3,
    random_state=RANDOM_SEED,
    stratify=y_
)

Stratification preseves default rate distribution among validation/train/test splits.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.compose import ColumnTransformer

lr_pipeline = Pipeline([
    ('preprocess', ColumnTransformer([
        
    ]))
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(
        class_weight='balanced',
        max_iter=1000,
        n_jobs=-1
    ))
])



In [8]:
from sklearn.metrics import roc_auc_score, recall_score, precision_score

def perform_calc(model, columns):

    model.fit(X_train[columns], y_train)
    
    val_pred_proba = lr_pipeline.predict_proba(X_val[columns])[:, 1]
    train_pred_proba = lr_pipeline.predict_proba(X_train[columns])[:, 1]

    roc_auc_val = roc_auc_score(y_val, val_pred_proba)
    roc_auc_train = roc_auc_score(y_train, train_pred_proba)

    print(f'Train ROC-AUC: {roc_auc_train}, validation ROC-AUC: {roc_auc_val}')

In [9]:
# lr_pipeline.fit(X_train, y_train)

# selected_mask = lr_pipeline.named_steps['selector'].get_support()
# selected_features = X_train.columns[selected_mask]

# selected_features

In [10]:
perform_calc(lr_pipeline, X.columns)

Train ROC-AUC: 0.7518034274898198, validation ROC-AUC: 0.7538704788100641


In [9]:
thresholds = np.linspace(0.1, 0.9, 9)
results = []
val_pred_proba = lr_pipeline.predict_proba(X_val)[:, 1]

for t in thresholds:
    t_pred = (val_pred_proba >= t).astype(int)
    results.append({
        'threshold': t,
        'recall': recall_score(y_val, t_pred),
        'precision': precision_score(y_val, t_pred)
    })

pd.DataFrame(results)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- AMT_CREDIT
- AMT_INCOME_TOTAL


Higher threshold -> lower default risk, but less approvals

Lower threshold -> higher default risk, but more approvals