# Lending Tree Credit Risk

### Dependencies and data

In [None]:
# Dependencies
from pathlib import Path
from collections import Counter
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from imblearn.under_sampling import RandomUnderSampler, ClusterCentroids
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.combine import SMOTEENN
from imblearn.ensemble import EasyEnsembleClassifier, BalancedRandomForestClassifier

# Uncomment to list the matplotlib style sheets available in this environment
# print(mpl.style.available)
# Apply the 'Solarize_Light2' style sheet to all subsequent plots
mpl.style.use('Solarize_Light2')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

In [None]:
# Peek at the first five raw lines of the CSV, truncated to 100 characters each
raw_path = Path('data/loans_1q19.csv')
with open(raw_path) as raw_file:
    for _ in range(5):
        raw_line = raw_file.readline()
        print(raw_line[:100])

In [None]:
# Load the loan data, skipping the first line of the file so that the
# second line is read as the header row
csv_path = Path('data/loans_1q19.csv')
df = pd.read_csv(csv_path, skiprows=1, low_memory=False)
print(df.shape)
df.head(3)

### Drop unusable data

In [None]:
# Keep only the columns, and then the rows, that have at least 90% of
# their values present (i.e. drop those with more than 10% missing)
min_rows_present = df.shape[0] * 0.9
df = df.dropna(axis='columns', thresh=min_rows_present)
min_cols_present = df.shape[1] * 0.9  # uses the column count left after the drop above
df = df.dropna(axis='index', thresh=min_cols_present)
df.shape

In [None]:
# Remove columns that carry no information (fewer than two distinct values)
unique_counts = df.nunique()
const_cols = unique_counts[unique_counts < 2].index
df = df.drop(columns=const_cols)
df.shape

In [None]:
# Exclude loans whose status is still 'Issued'
not_issued = df['loan_status'].ne('Issued')
df = df[not_issued]
df.shape

### Convert all columns to numeric

In [None]:
# Work on a copy (`df_num`) that will be converted to all-numeric,
# and list the columns that are currently stored as Python objects
df_num = df.copy()
is_object = df_num.dtypes == object
obj_cols = is_object[is_object].index
df[obj_cols].head(3)

In [None]:
""" String manipulation """

# Convert `term` to numeric by stripping the literal ' months' suffix
df_num['term'] = df['term'].str.replace(' months', '', regex=False).astype(float)

# Convert `int_rate` to numeric by stripping the literal '%' sign
df_num['int_rate'] = df['int_rate'].str.replace('%', '', regex=False).astype(float)

# Convert `emp_length` to numeric by keeping the first run of digits.
# The pattern is a raw string (so `\d` is not an invalid escape sequence) and
# `expand=False` returns a Series rather than a one-column DataFrame.
# NOTE(review): a label such as '< 1 year', if present, would be extracted
# as 1 -- confirm that is the intended encoding.
df_num['emp_length'] = df['emp_length'].str.extract(r'(\d+)', expand=False).astype(float)

# Convert `revol_util` to numeric by stripping the literal '%' sign
df_num['revol_util'] = df['revol_util'].str.replace('%', '', regex=False).astype(float)

df_num[obj_cols].head(3)

In [None]:
""" Datetime manipulation """

# Month of loan issuance as a numeric feature
issue_dates = pd.to_datetime(df['issue_d'])
df_num['issue_month'] = issue_dates.dt.month

# Age of each borrower's earliest credit line, in days, measured against
# the most recent `earliest_cr_line` date found in the data
df_num['earliest_cr_line'] = pd.to_datetime(df['earliest_cr_line'])
youngest_cr = df_num['earliest_cr_line'].max()
credit_age = youngest_cr - df_num['earliest_cr_line']
df_num['oldest_cr_age'] = credit_age.dt.days

# Month of the last credit pull, with month 12 recoded to 0
# (the original note says this represents Dec 2018)
pull_month = pd.to_datetime(df['last_credit_pull_d']).dt.month
df_num['last_credit_pull_month'] = pull_month.replace(12, 0)

df_num[obj_cols].head(3)

In [None]:
""" Numeric mapping """

# Convert `grade` to numeric (A=1 ... G=7)
grade_mapping = {'A': 1, 'B': 2, 'C': 3, 'D': 4, 'E': 5, 'F': 6, 'G': 7} # num mapping
df_num['grade'] = df['grade'].map(grade_mapping).astype(float)

# Convert `home_ownership` to numeric: 0 = RENT/ANY/NONE, 1 = MORTGAGE/OWN
home_mapping = dict.fromkeys(['RENT', 'ANY', 'NONE'], 0) # num mapping
home_mapping.update(dict.fromkeys(['MORTGAGE', 'OWN'], 1)) # add 1 label
df_num['home_ownership'] = df['home_ownership'].map(home_mapping).astype(float)

# Convert `verification_status` to numeric.
# 'Source Verified' is first collapsed into 'Verified'; the mapping must then
# read the cleaned labels rather than the raw `df` column, otherwise every
# 'Source Verified' row would map to NaN.
verification = df['verification_status'].str.replace('Source ', '', regex=False) # combine verified labels
veri_mapping = {'Not Verified': 0, 'Verified': 1} # num mapping
df_num['verification_status'] = verification.map(veri_mapping).astype(float)

# Create a new column for `initial_list_status` as numeric type
init_mapping = {'f': 0, 'w': 1} # num mapping
df_num['whole_loan'] = df['initial_list_status'].map(init_mapping).astype(float)

# Create a new column for `application_type` as numeric type
app_mapping = {'Individual': 0, 'Joint App': 1} # num mapping
df_num['joint_app'] = df['application_type'].map(app_mapping).astype(float)

# Create a new column for `loan_status` as numeric type (the model target).
# Statuses missing from the mapping become NaN.
stat_mapping = dict.fromkeys(['Charged Off', 'In Grace Period', # num mapping
                              'Late (16-30 days)', 'Late (31-120 days)'], 1) # high risk
stat_mapping.update(dict.fromkeys(['Fully Paid', 'Current'], 0)) # low risk
df_num['high_risk'] = df['loan_status'].map(stat_mapping).astype(float)

df_num[obj_cols].head(3)

In [None]:
""" One-hot encoding """

# Collapse the listed `purpose` labels into three groups; labels that are
# not listed are left unchanged
purpose_groups = [
    (['debt_consolidation', 'credit_card', 'medical'], 'debt'),
    (['home_improvement', 'car', 'house', 'vacation'], 'major_purchase'),
    (['small_business', 'moving', 'renewable_energy', 'other'], '_other'),
]
grouped_purpose = df['purpose']
for raw_labels, group_label in purpose_groups:
    grouped_purpose = grouped_purpose.replace(raw_labels, group_label)
df_num['purpose'] = grouped_purpose

# One-hot encode `purpose`; `drop_first=True` drops the first label's column
df_num = pd.get_dummies(df_num, columns=['purpose'], drop_first=True)
df_num.head(3)

### Additional cleaning

In [None]:
# Remove the original text/date columns now that numeric versions exist,
# along with columns that are not used as features.
# NOTE(review): `verification_status` is dropped here even though it was
# mapped to numeric earlier -- confirm this is intended.
cols_to_drop = [
    'title', 'sub_grade', 'zip_code', 'issue_d', 'loan_status',
    'earliest_cr_line', 'addr_state', 'verification_status', 'next_pymnt_d',
    'last_credit_pull_d', 'initial_list_status', 'application_type',
]
df_num = df_num.drop(columns=cols_to_drop)
df_num.shape

In [None]:
# Discard every row that still contains at least one missing value
df_num = df_num.dropna()
df_num.shape

In [None]:
# Find highly correlated columns.
# For every column pair whose absolute correlation exceeds 0.7, the later
# column of the pair is marked for removal. The target column is never
# marked: dropping it would break the feature/target split further down.
target_col = 'high_risk'
corr_matrix = df_num.corr()  # one pass for all pairs instead of a Series.corr call per pair
col_names = corr_matrix.columns
cor_cols = []
for i in range(len(col_names) - 1):
    for j in range(i + 1, len(col_names)):
        cor = corr_matrix.iat[i, j]
        if abs(cor) > 0.7:  # NaN correlations compare False and are skipped
            print(col_names[i], col_names[j], cor)
            if col_names[j] != target_col and col_names[j] not in cor_cols:
                cor_cols.append(col_names[j])

cor_cols

In [None]:
# Remove the columns flagged as highly correlated, so that only one column
# of each correlated pair remains
df_num = df_num.drop(columns=cor_cols)
df_num.shape

In [None]:
# Find integer-valued columns: every entry must have a zero fractional part.
# Testing `== 0` (rather than counting distinct fractional parts) avoids
# truncating a column whose values all share the same non-zero fraction.
is_whole = (df_num % 1 == 0).all()
int_cols = is_whole[is_whole].index

# Convert above columns to integer type
df_num = df_num.astype({col: int for col in int_cols})

df_num.info()

In [None]:
# Renumber the rows 0..n-1 and discard the old index
df_num = df_num.reset_index(drop=True)
df_num.head(3)

### Data preprocessing

In [None]:
# Class balance of the target (`high_risk`: 1 = high risk, 0 = low risk)
target_counts = df_num['high_risk'].value_counts()
target_counts

In [None]:
# Separate the features from the `high_risk` target
X = df_num.drop(columns='high_risk').copy()
y = df_num['high_risk'].copy()

# 60/20/20 train/validation/test split, stratified on the target:
# first hold out 40% of the data, then halve the hold-out set
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, test_size=0.4, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, stratify=y_holdout, random_state=42)
X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape

In [None]:
# Class balance of the target in the train, validation and test splits
for split_target in (y_train, y_val, y_test):
    print(split_target.value_counts())

In [None]:
# # Scale data
# scaler = StandardScaler()
# X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
# X_val_scaled = pd.DataFrame(scaler.transform(X_val), columns=X_val.columns)
# X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)
# X_train_scaled.head(3)

### Baseline machine learning

In [None]:
# Baseline logistic regression fitted on the original (unresampled) training set
lr = LogisticRegression(random_state=42).fit(X_train, y_train)

# Score the model on the validation split
lr_val_pred = lr.predict(X_val)
print(classification_report(y_val, lr_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, lr_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

In [None]:
# Baseline decision tree fitted on the original (unresampled) training set
tree = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# Score the model on the validation split
tree_val_pred = tree.predict(X_val)
print(classification_report(y_val, tree_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, tree_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

### Undersampling

In [None]:
# Resample the training set with RandomUnderSampler and show the new class counts
rus = RandomUnderSampler(random_state=42)
rus_resampled = rus.fit_resample(X_train, y_train)
X_rus, y_rus = rus_resampled
Counter(y_rus)

In [None]:
# Logistic regression trained on the randomly undersampled training set
lr_rus = LogisticRegression(random_state=42).fit(X_rus, y_rus)

# Score the model on the (unresampled) validation split
lr_rus_val_pred = lr_rus.predict(X_val)
print(classification_report(y_val, lr_rus_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_rus_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, lr_rus_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

In [None]:
# Decision tree trained on the randomly undersampled training set
tree_rus = DecisionTreeClassifier(random_state=42).fit(X_rus, y_rus)

# Score the model on the (unresampled) validation split
tree_rus_val_pred = tree_rus.predict(X_val)
print(classification_report(y_val, tree_rus_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_rus_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, tree_rus_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

In [None]:
# Resample the training set with ClusterCentroids and show the new class counts
ccu = ClusterCentroids(random_state=42)
ccu_resampled = ccu.fit_resample(X_train, y_train)
X_ccu, y_ccu = ccu_resampled
Counter(y_ccu)

In [None]:
# Logistic regression trained on the cluster-centroid undersampled training set
lr_ccu = LogisticRegression(random_state=42).fit(X_ccu, y_ccu)

# Score the model on the (unresampled) validation split
lr_ccu_val_pred = lr_ccu.predict(X_val)
print(classification_report(y_val, lr_ccu_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_ccu_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, lr_ccu_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

In [None]:
# Decision tree trained on the cluster-centroid undersampled training set
tree_ccu = DecisionTreeClassifier(random_state=42).fit(X_ccu, y_ccu)

# Score the model on the (unresampled) validation split
tree_ccu_val_pred = tree_ccu.predict(X_val)
print(classification_report(y_val, tree_ccu_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_ccu_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, tree_ccu_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

### Oversampling

In [None]:
# Resample the training set with RandomOverSampler and show the new class counts
ros = RandomOverSampler(random_state=42)
ros_resampled = ros.fit_resample(X_train, y_train)
X_ros, y_ros = ros_resampled
Counter(y_ros)

In [None]:
# Logistic regression trained on the randomly oversampled training set
lr_ros = LogisticRegression(random_state=42).fit(X_ros, y_ros)

# Score the model on the (unresampled) validation split
lr_ros_val_pred = lr_ros.predict(X_val)
print(classification_report(y_val, lr_ros_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_ros_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, lr_ros_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

In [None]:
# Decision tree trained on the randomly oversampled training set
tree_ros = DecisionTreeClassifier(random_state=42).fit(X_ros, y_ros)

# Score the model on the (unresampled) validation split
tree_ros_val_pred = tree_ros.predict(X_val)
print(classification_report(y_val, tree_ros_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_ros_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, tree_ros_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

In [None]:
# Resample the training set with SMOTE and show the new class counts
smo = SMOTE(random_state=42)
smo_resampled = smo.fit_resample(X_train, y_train)
X_smo, y_smo = smo_resampled
Counter(y_smo)

In [None]:
# Logistic regression trained on the SMOTE-resampled training set
lr_smo = LogisticRegression(random_state=42).fit(X_smo, y_smo)

# Score the model on the (unresampled) validation split
lr_smo_val_pred = lr_smo.predict(X_val)
print(classification_report(y_val, lr_smo_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_smo_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, lr_smo_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

In [None]:
# Decision tree trained on the SMOTE-resampled training set
tree_smo = DecisionTreeClassifier(random_state=42).fit(X_smo, y_smo)

# Score the model on the (unresampled) validation split
tree_smo_val_pred = tree_smo.predict(X_val)
print(classification_report(y_val, tree_smo_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_smo_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, tree_smo_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

### Combination sampling

In [None]:
# Resample the training set with SMOTEENN and show the new class counts
sen = SMOTEENN(random_state=42)
sen_resampled = sen.fit_resample(X_train, y_train)
X_sen, y_sen = sen_resampled
Counter(y_sen)

In [None]:
# Logistic regression trained on the SMOTEENN-resampled training set
lr_sen = LogisticRegression(random_state=42).fit(X_sen, y_sen)

# Score the model on the (unresampled) validation split
lr_sen_val_pred = lr_sen.predict(X_val)
print(classification_report(y_val, lr_sen_val_pred))
print('Accuracy:', accuracy_score(y_val, lr_sen_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, lr_sen_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

In [None]:
# Decision tree trained on the SMOTEENN-resampled training set
tree_sen = DecisionTreeClassifier(random_state=42).fit(X_sen, y_sen)

# Score the model on the (unresampled) validation split
tree_sen_val_pred = tree_sen.predict(X_val)
print(classification_report(y_val, tree_sen_val_pred))
print('Accuracy:', accuracy_score(y_val, tree_sen_val_pred))
pd.DataFrame(
    confusion_matrix(y_val, tree_sen_val_pred),
    index=['0', '1'],
    columns=['Predicted 0', 'Predicted 1'],
)

### Sampling comparison

In [None]:
def clf_report(y_true, y_pred):

    """
    Build a flat, custom classification report from the confusion matrix and
    scikit-learn's classification report.

    In this notebook the positive class is labeled 0 (low risk) and the
    negative class is labeled 1 (high risk), so:

    * ``true_pos``  -- true 0 predicted 0
    * ``false_neg`` -- true 0 predicted 1
    * ``false_pos`` -- true 1 predicted 0
    * ``true_neg``  -- true 1 predicted 1

    The report also holds the precision, recall and F1 score of each class
    and the macro-average F1 score.

    Parameters
    ----------
    y_true : list-like
        True target labels (0 or 1)
    y_pred : list-like
        Predicted target labels (0 or 1)

    Returns
    -------
    Dict
        Custom classification report with the 11 listed values
    """

    # Pin the label set so the confusion matrix is always 2x2 and the report
    # keys are always '0' and '1', whichever labels (and label dtype) happen
    # to be present in the inputs.
    labels = [0, 1]

    # Confusion matrix and classification report
    confusion_mat = confusion_matrix(y_true, y_pred, labels=labels)
    clf_rep = classification_report(y_true, y_pred, labels=labels, output_dict=True)

    # Report values: the four confusion-matrix cells in row-major order
    # (true 0/pred 0, true 0/pred 1, true 1/pred 0, true 1/pred 1), then
    # precision, recall and F1 for class 0 and class 1, then the macro-average F1
    values = confusion_mat.ravel().tolist()
    for label in ('0', '1'):
        values.extend(clf_rep[label][metric] for metric in ('precision', 'recall', 'f1-score'))
    values.append(clf_rep['macro avg']['f1-score'])

    # Add report keys
    keys = ['true_pos', 'false_neg', 'false_pos', 'true_neg',
            'precision_pos', 'recall_pos', 'f1_pos',
            'precision_neg', 'recall_neg', 'f1_neg', 'f1_avg']
    return dict(zip(keys, values))
    

# Try the report on the baseline logistic regression's validation predictions
clf_report(y_true=y_val, y_pred=lr_val_pred)

In [None]:
# Side-by-side validation reports for every logistic regression trial
trials = ['base', 'rand_undersamp', 'centroid_undersamp', 'rand_oversamp', 'smote', 'smoteenn']
lr_preds = [
    lr_val_pred, lr_rus_val_pred, lr_ccu_val_pred,
    lr_ros_val_pred, lr_smo_val_pred, lr_sen_val_pred,
]
lr_reports = []
for trial_pred in lr_preds:
    lr_reports.append(clf_report(y_val, trial_pred))
lr_reports_df = pd.DataFrame(lr_reports, index=trials)
lr_reports_df

In [None]:
# Side-by-side validation reports for every decision tree trial
# (rows are labeled with the `trials` list defined for the logistic regression table)
tree_preds = [
    tree_val_pred, tree_rus_val_pred, tree_ccu_val_pred,
    tree_ros_val_pred, tree_smo_val_pred, tree_sen_val_pred,
]
tree_reports = []
for trial_pred in tree_preds:
    tree_reports.append(clf_report(y_val, trial_pred))
tree_reports_df = pd.DataFrame(tree_reports, index=trials)
tree_reports_df

### Features

The decision tree model with random undersampling had the highest precision on high-risk loans and the highest average F1 score, but it missed about 80% of the high-risk loans. On the other hand, the logistic regression models with oversampling were the best at catching high-risk loans, but this came at the cost of a high number of false negatives (low-risk loans incorrectly flagged as high risk, since class 0 is treated as the positive class in this notebook).

In [None]:
# Relative coefficient magnitudes for the randomly oversampled logistic
# regression: each absolute coefficient divided by the sum of all of them
lr_abs_coefs = np.abs(lr_ros.coef_[0])
lr_rel_coefs = lr_abs_coefs / lr_abs_coefs.sum()
lr_feats = sorted(zip(lr_rel_coefs, X_train.columns), reverse=True)

# Keep the features whose relative coefficient is greater than 1%
lr_feats1 = [(weight, name) for weight, name in lr_feats if weight > 0.01]
print(len(lr_feats1))
lr_feats1

In [None]:
# Relative feature importances for the randomly undersampled decision tree:
# each absolute importance divided by the sum of all of them
tree_abs_imps = np.abs(tree_rus.feature_importances_)
tree_rel_imps = tree_abs_imps / tree_abs_imps.sum()
tree_feats = sorted(zip(tree_rel_imps, X_train.columns), reverse=True)

# Keep the features whose relative importance is greater than 1%
tree_feats1 = [(weight, name) for weight, name in tree_feats if weight > 0.01]
print(len(tree_feats1))
tree_feats1

In [None]:
# Feature names that pass the 1% cut in both the logistic regression and
# the decision tree lists (column 1 of each list holds the feature name)
lr_feat_names = np.array(lr_feats1)[:, 1]
tree_feat_names = np.array(tree_feats1)[:, 1]
feats = np.intersect1d(lr_feat_names, tree_feat_names)
print(len(feats))
feats