In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
# Folder that holds the input CSV files (a relative path, resolved against the
# kernel's current working directory).
path = '../home-credit-default-risk/'
# Load application_train.csv into the raw DataFrame that the later cells pass
# to prepare_data().
df = pd.read_csv(path + 'application_train.csv')

In [None]:
def prepare_data(df: pd.DataFrame, drop: bool = False, missing_threshold: float = 0.5):
    """Turn the raw table into a scaled feature matrix and a label vector.

    Steps, in order: split off the ``TARGET`` label, optionally discard sparse
    feature columns, one-hot encode with ``pd.get_dummies`` (``dummy_na=True``
    adds a NaN-indicator column per encoded feature), fill the remaining NaNs
    with the column median, and standardise every column with
    ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; must contain a ``TARGET`` column. It is not modified.
    drop : bool, default False
        When True, feature columns whose fraction of missing values is
        strictly greater than ``missing_threshold`` are removed before
        encoding. The label column is never a candidate for removal.
    missing_threshold : float, default 0.5
        Missing-value fraction in ``[0, 1]`` above which a feature column is
        dropped. Only has an effect when ``drop`` is True.

    Returns
    -------
    x_scaled
        Result of ``StandardScaler.fit_transform`` applied to the imputed,
        one-hot encoded features.
    y : pandas.Series
        Independent copy of the ``TARGET`` column, row-aligned with ``x_scaled``.

    Raises
    ------
    ValueError
        If ``missing_threshold`` is not within ``[0, 1]``.
    KeyError
        If ``df`` has no ``TARGET`` column.

    Notes
    -----
    The imputer and the scaler are fitted on every row of ``df``. If the
    returned arrays are split into train and test sets afterwards, the test
    rows have already contributed to the medians, means and variances, so a
    score computed on such a split can be optimistic. Fit these transformers
    on the training rows only when an unbiased estimate matters.
    """
    if not 0 <= missing_threshold <= 1:
        raise ValueError(
            f"missing_threshold must be between 0 and 1, got {missing_threshold!r}"
        )

    # Copy just the label so the returned Series never aliases the caller's
    # frame; a full df.copy() is unnecessary because every step below already
    # produces a new object.
    y = df['TARGET'].copy()
    X = df.drop(columns='TARGET')

    if drop:
        # Fraction of missing values per feature column (label excluded, so a
        # sparse TARGET can never be dropped by accident).
        missing = X.isna().mean()
        X = X.drop(columns=missing[missing > missing_threshold].index)

    X = pd.get_dummies(X, dummy_na=True)

    # NaNs remain in the numeric columns, so impute before scaling.
    imputer = SimpleImputer(strategy='median')
    x_imputed = imputer.fit_transform(X)

    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(x_imputed)

    return x_scaled, y

In [None]:
def train_and_eval(X, y, test_size=0.2, random_state=42, max_iter=100):
    """Fit a logistic regression on a stratified split and score the hold-out part.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Labels aligned with the rows of ``X``; also used to stratify the split.
        ``roc_auc_score`` is called with a single score column, so binary
        labels are assumed.
    test_size : float or int, default 0.2
        Forwarded to ``train_test_split``; size of the hold-out set.
    random_state : int or None, default 42
        Forwarded to ``train_test_split`` so the split is reproducible.
    max_iter : int, default 100
        Forwarded to ``LogisticRegression``; raise it if the solver reports
        that it did not converge.

    Returns
    -------
    float
        ROC AUC of the predicted probabilities on the hold-out set.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y,
    )

    model = LogisticRegression(max_iter=max_iter)
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the class listed second
    # in model.classes_.
    preds = model.predict_proba(X_test)[:, 1]
    score = roc_auc_score(y_test, preds)

    return score

Without dropping any columns

In [8]:
# Run 1: keep every column (drop=False), then train the model and print the
# ROC AUC measured on the hold-out split made inside train_and_eval().
X1, y1 = prepare_data(df, drop=False)
score_1 = train_and_eval(X1, y1)
print(f"ROC AUC: {score_1}")

ROC AUC: 0.7484211440535123


With columns that are more than 50% null dropped

In [9]:
# Run 2: same pipeline, but drop=True makes prepare_data() remove the columns
# with more than half of their values missing before encoding.
X2, y2 = prepare_data(df, drop=True)
score_2 = train_and_eval(X2, y2)
print(f"ROC AUC: {score_2}")

ROC AUC: 0.7450818398142119


We see that the baseline model (logistic regression), even without any feature engineering or feature selection, reaches a ROC AUC of roughly 0.75: 0.748 with all columns and 0.745 after dropping the columns that are more than 50% null.