# A predictive Loan model

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
%matplotlib inline

In [2]:
loan = pd.read_csv('data/loan_data.csv')

In [3]:
loan['monthly_loan_payment_amount'] = (loan['loan_amnt'] * (loan['loan_int_rate'] / 100)) / 12
loan.monthly_loan_payment_amount = loan.monthly_loan_payment_amount.apply(lambda x: float(f"{x:.2f}"))

loan['monthly_loan_percent_income'] = ((loan['monthly_loan_payment_amount'] * 100) / (loan['person_income'] / 12)) / 100
loan.monthly_loan_percent_income = loan.monthly_loan_percent_income.apply(lambda x: float(f"{x:.2f}"))

In [4]:
from sklearn.model_selection import train_test_split

y = loan.loan_status
X = loan.drop(columns=['loan_status'], axis=1)

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
print(X.columns)

Index(['person_age', 'person_gender', 'person_education', 'person_income',
       'person_emp_exp', 'person_home_ownership', 'loan_amnt', 'loan_intent',
       'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length',
       'credit_score', 'previous_loan_defaults_on_file',
       'monthly_loan_payment_amount', 'monthly_loan_percent_income'],
      dtype='object')


In [27]:
ordinal_cols = ['person_education', 'person_home_ownership']
categorical_cols = [col for col in X.columns if X[col].dtype == 'object' and col not in ordinal_cols]
numerical_cols = [col for col in X.columns if X[col].dtype != 'object']

In [28]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

transformer = ColumnTransformer(transformers=[
    ('ordinal', OrdinalEncoder(), ordinal_cols),
    ('onehot', OneHotEncoder(handle_unknown="ignore"), categorical_cols),
    ('numerical', 'passthrough', numerical_cols)
])

model = XGBClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=4, early_stopping_rounds=5, verbose=False, random_state=42)

pipeline = Pipeline(steps=[
    ('preprocess', transformer),
    ('classifier', model)
])

transformer.fit(X_train)
X_valid_transformed = transformer.transform(X_valid)

pipeline.fit(X_train, y_train, classifier__eval_set=[(X_valid_transformed, y_valid)])

[0]	validation_0-logloss:0.50040
[1]	validation_0-logloss:0.47457
[2]	validation_0-logloss:0.45216
[3]	validation_0-logloss:0.43182
[4]	validation_0-logloss:0.41399
[5]	validation_0-logloss:0.39791
[6]	validation_0-logloss:0.38331
[7]	validation_0-logloss:0.37030
[8]	validation_0-logloss:0.35831
[9]	validation_0-logloss:0.34709
[10]	validation_0-logloss:0.33712
[11]	validation_0-logloss:0.32752
[12]	validation_0-logloss:0.31866
[13]	validation_0-logloss:0.31058
[14]	validation_0-logloss:0.30302
[15]	validation_0-logloss:0.29583
[16]	validation_0-logloss:0.28902
[17]	validation_0-logloss:0.28259
[18]	validation_0-logloss:0.27659
[19]	validation_0-logloss:0.27098
[20]	validation_0-logloss:0.26584
[21]	validation_0-logloss:0.26085
[22]	validation_0-logloss:0.25618
[23]	validation_0-logloss:0.25189
[24]	validation_0-logloss:0.24778
[25]	validation_0-logloss:0.24388
[26]	validation_0-logloss:0.24032
[27]	validation_0-logloss:0.23688
[28]	validation_0-logloss:0.23365
[29]	validation_0-loglos

Parameters: { "verbose" } are not used.

  self.starting_round = model.num_boosted_rounds()


[64]	validation_0-logloss:0.18155
[65]	validation_0-logloss:0.18069
[66]	validation_0-logloss:0.18002
[67]	validation_0-logloss:0.17938
[68]	validation_0-logloss:0.17882
[69]	validation_0-logloss:0.17839
[70]	validation_0-logloss:0.17791
[71]	validation_0-logloss:0.17751
[72]	validation_0-logloss:0.17690
[73]	validation_0-logloss:0.17656
[74]	validation_0-logloss:0.17615
[75]	validation_0-logloss:0.17584
[76]	validation_0-logloss:0.17547
[77]	validation_0-logloss:0.17498
[78]	validation_0-logloss:0.17465
[79]	validation_0-logloss:0.17423
[80]	validation_0-logloss:0.17370
[81]	validation_0-logloss:0.17340
[82]	validation_0-logloss:0.17310
[83]	validation_0-logloss:0.17289
[84]	validation_0-logloss:0.17258
[85]	validation_0-logloss:0.17215
[86]	validation_0-logloss:0.17198
[87]	validation_0-logloss:0.17178
[88]	validation_0-logloss:0.17156
[89]	validation_0-logloss:0.17121
[90]	validation_0-logloss:0.17100
[91]	validation_0-logloss:0.17076
[92]	validation_0-logloss:0.17057
[93]	validatio

In [29]:
from sklearn.metrics import f1_score

pred = pipeline.predict(X_valid)
f1_score(y_valid, pred)


0.8443508032657361

In [42]:
def get_important_features(pipeline) -> pd.DataFrame:
    feature_names = (pipeline.named_steps['preprocess']).get_feature_names_out()
    importances = pipeline.named_steps['classifier'].feature_importances_

    df_importances = pd.DataFrame({
        'feature_name': feature_names,
        'importance': importances
    })

    df_importances = df_importances.sort_values(by="importance", ascending=False)

    return df_importances

In [43]:
get_important_features(pipeline)

Unnamed: 0,feature_name,importance
10,onehot__previous_loan_defaults_on_file_No,0.608127
21,numerical__monthly_loan_percent_income,0.252902
1,ordinal__person_home_ownership,0.022713
17,numerical__loan_percent_income,0.017048
9,onehot__loan_intent_VENTURE,0.015394
16,numerical__loan_int_rate,0.01477
4,onehot__loan_intent_DEBTCONSOLIDATION,0.01211
6,onehot__loan_intent_HOMEIMPROVEMENT,0.011625
13,numerical__person_income,0.009548
7,onehot__loan_intent_MEDICAL,0.00805


In [39]:
import joblib

joblib.dump(pipeline, 'loan_analyzer.pkl')

['loan_analyzer.pkl']

## History of scores
1. 0.8378735784184078
2. 0.8443508032657361

