In [8]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer

In [2]:
# Relative directory that holds the Home Credit Default Risk CSV files.
path = '../home-credit-default-risk/'
# Read the training application table; later cells use `df` as the raw input.
df = pd.read_csv(path + 'application_train.csv')

In [3]:
def prepare_data(df, drop=False, missing_threshold=0.5, target_col='TARGET'):
    """Build a standardized feature matrix and a target vector from ``df``.

    Processing order:
      1. Split ``target_col`` off as the label.
      2. If ``drop`` is true, remove every feature column whose fraction of
         missing values is strictly greater than ``missing_threshold``.
      3. One-hot encode with ``pd.get_dummies(..., dummy_na=True)``, which
         also adds a NaN-indicator column for each encoded column.
      4. Replace the remaining NaNs with the column median.
      5. Standardize all columns with ``StandardScaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table containing ``target_col``. It is not modified.
    drop : bool, default False
        Whether to remove sparsely populated feature columns (step 2).
    missing_threshold : float, default 0.5
        Missing-value fraction above which a feature column is removed.
        Only used when ``drop`` is true; must lie in ``[0, 1]``.
    target_col : str, default 'TARGET'
        Name of the label column.

    Returns
    -------
    x_scaled : numpy.ndarray
        Imputed and standardized feature matrix (column names are lost).
    y : pandas.Series
        The label column, with the index of ``df``.

    Raises
    ------
    KeyError
        If ``target_col`` is not a column of ``df``.
    ValueError
        If ``drop`` is true and ``missing_threshold`` is outside ``[0, 1]``.

    Notes
    -----
    The imputer and the scaler are fitted on every row of ``df``. If the
    result is split into train and test sets afterwards, the medians, means
    and variances have seen the test rows, so held-out scores can be
    slightly optimistic.
    """
    if target_col not in df.columns:
        raise KeyError(f"target column {target_col!r} not found in df")

    # Separate the label first so the sparsity filter below can never remove it.
    # `drop` returns a new frame and the label is copied, so `df` stays untouched
    # without duplicating the whole table.
    y = df[target_col].copy()
    X = df.drop(columns=[target_col])

    if drop:
        if not 0 <= missing_threshold <= 1:
            raise ValueError(
                f"missing_threshold must be between 0 and 1, got {missing_threshold}"
            )
        missing = X.isna().mean()
        X = X.drop(columns=missing[missing > missing_threshold].index)

    X = pd.get_dummies(X, dummy_na=True)

    # Numeric columns can still contain NaNs after encoding, so impute them.
    imputer = SimpleImputer(strategy='median')
    x_imputed = imputer.fit_transform(X)

    scaler = StandardScaler()
    x_scaled = scaler.fit_transform(x_imputed)

    return x_scaled, y

In [10]:
def train_and_eval(X, y, model_type='logistic', test_size=0.2, random_state=42):
    """Fit one classifier on a stratified hold-out split and return its ROC AUC.

    Parameters
    ----------
    X : array-like
        Feature matrix, one row per sample.
    y : array-like
        Binary labels; also used to stratify the split.
    model_type : {'logistic', 'random_forest', 'gradient_boosting'}
        Which estimator to train. Defaults to ``'logistic'``.
    test_size : float, default 0.2
        Share of the samples held out for evaluation.
    random_state : int, default 42
        Seed for the split and for the two tree ensembles. The logistic
        regression is built without a seed.

    Returns
    -------
    float
        ROC AUC of the positive-class probabilities on the held-out rows.

    Raises
    ------
    ValueError
        If ``model_type`` is not one of the supported names. This is checked
        before any data is split or any model is fitted.
    """
    # Lazy factories: only the requested estimator is ever constructed.
    builders = {
        'logistic': lambda: LogisticRegression(max_iter=100),
        'random_forest': lambda: RandomForestClassifier(
            n_estimators=100,
            max_depth=10,
            random_state=random_state,
            n_jobs=-1,
        ),
        'gradient_boosting': lambda: GradientBoostingClassifier(
            n_estimators=100,
            max_depth=5,
            learning_rate=0.1,
            random_state=random_state,
        ),
    }

    # Fail fast on a bad name instead of paying for the split first.
    # The isinstance guard keeps unhashable inputs on the ValueError path.
    if not isinstance(model_type, str) or model_type not in builders:
        raise ValueError(f"Unknown model type: {model_type}")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    model = builders[model_type]()
    model.fit(X_train, y_train)

    # Column 1 of predict_proba is the probability of the positive class.
    preds = model.predict_proba(X_test)[:, 1]
    return roc_auc_score(y_test, preds)

Logistic regression

Without dropping any columns

In [13]:
# Baseline: keep every column, then train the default model of
# train_and_eval (model_type='logistic') and print its hold-out ROC AUC.
X1, y1 = prepare_data(df, drop=False)
score_1 = train_and_eval(X1, y1)
print(f"ROC AUC: {score_1:.4f}")

ROC AUC: 0.7484


After dropping columns that are more than 50% null

In [14]:
# Same logistic-regression run, but with drop=True so that columns with
# more than 50% missing values are removed before encoding.
X2, y2 = prepare_data(df, drop=True)
score_2 = train_and_eval(X2, y2)
print(f"ROC AUC: {score_2:.4f}")

ROC AUC: 0.7451


Ensemble methods

In [15]:
# Random forest on the full (no columns dropped) feature matrix X1.
score_3 = train_and_eval(X1, y1, 'random_forest')
print(f"ROC AUC: {score_3:.4f}")

ROC AUC: 0.7364


In [16]:
# Gradient boosting on the full (no columns dropped) feature matrix X1.
score_4 = train_and_eval(X1, y1, 'gradient_boosting')
print(f"ROC AUC: {score_4:.4f}")

ROC AUC: 0.7570


We see that the baseline model, logistic regression, reaches a ROC AUC of about 0.748 even without any feature engineering or feature selection.

Random forest achieved a worse score of about 0.736.

Gradient boosting achieved the best score, about 0.757.

In the Kaggle competition, the best score is 0.806.