# Loan Status

Data source: https://www.kaggle.com/zaurbegiev/my-dataset#credit_train.csv

### Dependencies and data

In [63]:
# Dependencies
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from tensorflow import keras

%matplotlib inline

In [64]:
# Data
loan_df = pd.read_csv('data/loan_status.csv')
loan_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36423 entries, 0 to 36422
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Loan_Status                   36423 non-null  object 
 1   Current_Loan_Amount           36423 non-null  int64  
 2   Term                          36423 non-null  object 
 3   Credit_Score                  36423 non-null  float64
 4   Annual_Income                 36423 non-null  float64
 5   Years_in_current_job          36423 non-null  object 
 6   Home_Ownership                36423 non-null  object 
 7   Purpose                       36423 non-null  object 
 8   Monthly_Debt                  36423 non-null  float64
 9   Years_of_Credit_History       36423 non-null  float64
 10  Months_since_last_delinquent  36423 non-null  float64
 11  Number_of_Open_Accounts       36423 non-null  int64  
 12  Number_of_Credit_Problems     36423 non-null  int64  
 13  C

In [65]:
loan_df.head()

Unnamed: 0,Loan_Status,Current_Loan_Amount,Term,Credit_Score,Annual_Income,Years_in_current_job,Home_Ownership,Purpose,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,Bankruptcies,Tax_Liens
0,Fully_Paid,99999999,Short_Term,741.0,2231892.0,8_years,Own_Home,Debt_Consolidation,29200.53,14.9,29.0,18,1,297996,750090.0,0.0,0.0
1,Fully_Paid,217646,Short_Term,730.0,1184194.0,<_1_year,Home_Mortgage,Debt_Consolidation,10855.08,19.6,10.0,13,1,122170,272052.0,1.0,0.0
2,Fully_Paid,548746,Short_Term,678.0,2559110.0,2_years,Rent,Debt_Consolidation,18660.28,22.6,33.0,4,0,437171,555038.0,0.0,0.0
3,Fully_Paid,99999999,Short_Term,728.0,714628.0,3_years,Rent,Debt_Consolidation,11851.06,16.0,76.0,16,0,203965,289784.0,0.0,0.0
4,Fully_Paid,99999999,Short_Term,740.0,776188.0,<_1_year,Own_Home,Debt_Consolidation,11578.22,8.5,25.0,6,0,134083,220220.0,0.0,0.0


### Data preprocessing

In [66]:
# Inspect `Current_Loan_Amount` outliers
loan_df.loc[loan_df['Current_Loan_Amount'] > 1e6, 'Current_Loan_Amount'].value_counts()

99999999    4918
Name: Current_Loan_Amount, dtype: int64

In [67]:
# Drop outliers
loan_df = loan_df[loan_df['Current_Loan_Amount'] < 1e6].reset_index()
loan_df.shape

(31505, 17)

In [68]:
# Categorical features
cat_feats = loan_df.dtypes[loan_df.dtypes == object].index.tolist()

# Unique counts
loan_df[cat_feats].nunique()

Loan_Status              2
Term                     2
Years_in_current_job    11
Home_Ownership           4
Purpose                  7
dtype: int64

In [69]:
# Inspect unique values for `Years_in_current_job`
loan_df['Years_in_current_job'].value_counts()

10+_years    11376
2_years       2778
3_years       2594
<_1_year      2331
5_years       2154
1_year        1974
4_years       1965
6_years       1822
7_years       1806
8_years       1437
9_years       1268
Name: Years_in_current_job, dtype: int64

In [70]:
def bucket_years(label):
    
    """ Group a `Years_in_current_job` label into 1 of 3 buckets: 0 - 3, 4 - 9, and 10+ """
    
    if '+' not in label:
        if int(label[0]) < 4:
            return '0-3'
        else:
            return '4-9'
    else:
        return '10+'

# Group `Years_in_current_job` into fewer buckets
loan_df['Years_in_current_job'] = loan_df['Years_in_current_job'].str.replace('<_', '') \
                                                                 .apply(bucket_years)
loan_df['Years_in_current_job'].value_counts()

10+    11376
4-9    10452
0-3     9677
Name: Years_in_current_job, dtype: int64

In [71]:
# One-hot encoding
ohe = OneHotEncoder(sparse=False)
loan_ohe = ohe.fit_transform(loan_df[cat_feats])
loan_ohe = pd.DataFrame(loan_ohe, columns=ohe.get_feature_names(cat_feats))
loan_ohe.head()

Unnamed: 0,Loan_Status_Fully_Paid,Loan_Status_Not_Paid,Term_Long_Term,Term_Short_Term,Years_in_current_job_0-3,Years_in_current_job_10+,Years_in_current_job_4-9,Home_Ownership_HaveMortgage,Home_Ownership_Home_Mortgage,Home_Ownership_Own_Home,Home_Ownership_Rent,Purpose_Business_Loan,Purpose_Buy_House,Purpose_Buy_a_Car,Purpose_Debt_Consolidation,Purpose_Home_Improvements,Purpose_Medical_Bills,Purpose_Other
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [73]:
# Merge data
df = loan_df.drop(columns=cat_feats) \
            .merge(loan_ohe.drop(columns=['Loan_Status_Not_Paid', 'Term_Short_Term']), 
                   left_index=True, right_index=True)
df.head()

Unnamed: 0,Current_Loan_Amount,Credit_Score,Annual_Income,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,...,Home_Ownership_Home_Mortgage,Home_Ownership_Own_Home,Home_Ownership_Rent,Purpose_Business_Loan,Purpose_Buy_House,Purpose_Buy_a_Car,Purpose_Debt_Consolidation,Purpose_Home_Improvements,Purpose_Medical_Bills,Purpose_Other
1,217646,730.0,1184194.0,10855.08,19.6,10.0,13,1,122170,272052.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
2,548746,678.0,2559110.0,18660.28,22.6,33.0,4,0,437171,555038.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
5,234124,727.0,693234.0,14211.24,24.7,46.0,10,1,28291,107052.0,...,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
6,666204,723.0,1821967.0,17612.24,22.0,34.0,15,0,813694,2004618.0,...,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
7,317108,687.0,1133274.0,9632.81,17.4,53.0,4,0,60287,126940.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0


In [74]:
# Feature/target split
X = df.drop(columns='Loan_Status_Fully_Paid')
y = df['Loan_Status_Fully_Paid']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=24)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((20442, 27), (6815, 27), (20442,), (6815,))

In [75]:
# Scale data
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
X_train_scaled.head()

Unnamed: 0,Current_Loan_Amount,Credit_Score,Annual_Income,Monthly_Debt,Years_of_Credit_History,Months_since_last_delinquent,Number_of_Open_Accounts,Number_of_Credit_Problems,Current_Credit_Balance,Maximum_Open_Credit,...,Home_Ownership_Home_Mortgage,Home_Ownership_Own_Home,Home_Ownership_Rent,Purpose_Business_Loan,Purpose_Buy_House,Purpose_Buy_a_Car,Purpose_Debt_Consolidation,Purpose_Home_Improvements,Purpose_Medical_Bills,Purpose_Other
0,-1.213104,-0.250923,-0.321829,-0.780211,-0.794393,-1.411889,-1.105349,-0.348698,-0.803736,-0.117581,...,-1.019064,-0.311019,1.22497,-0.131022,-0.084228,-0.108073,0.538631,-0.263055,-0.110593,-0.356233
1,-0.486575,3.828646,-0.31309,-1.307016,-0.59853,-0.954129,-0.904078,-0.348698,-0.628872,-0.140259,...,-1.019064,-0.311019,1.22497,-0.131022,-0.084228,-0.108073,0.538631,-0.263055,-0.110593,-0.356233
2,1.169124,-0.264465,0.294734,-0.686847,-1.321717,-1.045681,1.108638,1.509512,0.341615,0.074277,...,-1.019064,3.21524,-0.816347,-0.131022,-0.084228,-0.108073,0.538631,-0.263055,-0.110593,-0.356233
3,-0.936584,-0.261241,-0.35826,-1.51123,-1.35185,-1.137233,-1.507892,-0.348698,-0.817763,-0.182862,...,-1.019064,-0.311019,1.22497,-0.131022,-0.084228,-0.108073,0.538631,-0.263055,-0.110593,-0.356233
4,-0.532957,-0.250278,-0.562899,-0.418065,0.57665,-0.542144,-0.098992,-0.348698,-0.426252,-0.122982,...,-1.019064,-0.311019,1.22497,-0.131022,-0.084228,-0.108073,0.538631,-0.263055,-0.110593,-0.356233
