In [3]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

----
# Read in Data

In [4]:
df_historical_clean = pd.read_csv('/Users/vinh/FS/thesis/data/df_historical_clean.csv')
df_historical_clean.loc[df_historical_clean['term'] == 36, 'term'] = 0
df_historical_clean.loc[df_historical_clean['term'] == 60, 'term'] = 1
df_historical_clean.loc[df_historical_clean['loan_status'] == 'Fully Paid', 'loan_status'] = 0
df_historical_clean.loc[df_historical_clean['loan_status'] == 'Charged Off', 'loan_status'] = 1
year = pd.to_datetime(df_historical_clean['issue_d']).dt.year

In [5]:
loans_by_year = pd.DataFrame(df_historical_clean['year'].value_counts(normalize = True))
loans_by_year = loans_by_year.sort_index()

In [6]:
loans_by_year

Unnamed: 0,year
2013,0.076454
2014,0.133622
2015,0.228453
2016,0.228604
2017,0.178281
2018,0.111822
2019,0.042763


----
# Functions to Create Transition Matrix

In [7]:
def transition_df_prep(df, year1, year2):
    df_year1 = df.loc[df['year'] == year1]
    df_both_years = df.loc[df['year'].isin([year1, year2])]
    
    X_year1 = df_year1.drop('loan_status', axis = 1)
    y_year1 = df_year1[['loan_status']]
    x_train_year1, x_test_year1, y_train_year1, y_test_year1 = train_test_split(X_year1, y_year1, test_size = 0.2, random_state = 1337, stratify = y_year1)
    temp = pd.concat([y_train_year1, x_train_year1], axis = 1)
    defaults_year1 = temp.loc[temp['loan_status'] == 1]
    completed_year1 = temp.loc[temp['loan_status'] == 0]
    defaults_train_year1, defaults_test_year1 = train_test_split(defaults_year1, test_size = 0.2, random_state = 1337)
    completed_train_year1, completed_test_year1 = train_test_split(completed_year1,
                                                                   test_size = defaults_train_year1.shape[0] / completed_year1.shape[0],
                                                                   random_state = 1337)
    year1_balanced = pd.concat([completed_test_year1, defaults_train_year1])
    year1_test = pd.concat([x_test_year1, y_test_year1], axis = 1)

    X_both_years = df_both_years.drop('loan_status', axis = 1)
    y_both_years = df_both_years[['loan_status']]
    x_train_both_years, x_test_both_years, y_train_both_years, y_test_both_years = train_test_split(X_both_years, y_both_years, test_size = 0.2, random_state = 1337, stratify = y_both_years)
    temp2 = pd.concat([y_train_both_years, x_train_both_years], axis = 1)
    defaults_both_years = temp2.loc[temp2['loan_status'] == 1]
    completed_both_years = temp2.loc[temp2['loan_status'] == 0]
    defaults_train_both_years, defaults_test_both_years = train_test_split(defaults_both_years, test_size = 0.2, random_state = 1337)
    completed_train_both_years, completed_test_both_years = train_test_split(completed_both_years,
                                                                             test_size = defaults_train_both_years.shape[0] / completed_both_years.shape[0],
                                                                             random_state = 1337)
    both_years_balanced = pd.concat([completed_test_both_years, defaults_train_both_years])
    
    return year1_balanced, both_years_balanced, year1_test

In [8]:
def transition_df_pipeline(train, x_test):
    train_copy = train.copy()
    x_test_copy = x_test.copy()
    
    # Imputations
    missing_cols = list(train_copy.columns[train_copy.isnull().any()])
    imputations_df = pd.DataFrame()
    for i in missing_cols:
        new_column = pd.Series(train_copy[i].median(), name = i)
        imputations_df = pd.concat([imputations_df, new_column], axis = 1)
    
    for i in missing_cols:
        train_copy.loc[train_copy[i].isna(), i] = imputations_df[i].item()
        x_test_copy.loc[x_test_copy[i].isna(), i] = imputations_df[i].item()
        
    # Normalization
    remove = ['loan_status', 'id', 'issue_d', 'year', 'grade', 'sub_grade', 'term']
    numerical_columns = list(train_copy.select_dtypes(include = ['float64', 'int64']).columns)
    numerical_columns = [x for x in numerical_columns if x not in remove]

    scaler = MinMaxScaler()
    train_copy[numerical_columns] = scaler.fit_transform(train_copy[numerical_columns])
    x_test_copy[numerical_columns] = scaler.transform(x_test_copy[numerical_columns])

    # One hot encoding
    categorical_columns = list(train_copy.select_dtypes(include = ['object']).columns)
    categorical_columns = [x for x in categorical_columns if x not in remove]
    categorical_columns.remove('initial_list_status')
    
    transformers = [
    ('', OneHotEncoder(handle_unknown = 'ignore', sparse = False), categorical_columns)
    ]
    ct = ColumnTransformer(transformers)
    
    train_copy_transformed = ct.fit_transform(train_copy)
    feature_names = ct.get_feature_names_out()  
    train_copy_transformed = pd.DataFrame(train_copy_transformed, columns = feature_names).set_index(train_copy.index)
    train_copy_transformed.columns = train_copy_transformed.columns.str[2:]
    
    x_test_copy_transformed = ct.transform(x_test_copy)
    x_test_copy_transformed = pd.DataFrame(x_test_copy_transformed, columns = feature_names).set_index(x_test_copy.index)
    x_test_copy_transformed.columns = x_test_copy_transformed.columns.str[2:]    
    
    # Concatenate the final dataframe
    train_f = pd.concat([train_copy[remove + numerical_columns], train_copy_transformed], axis = 1)
    test_f = pd.concat([x_test_copy[remove + numerical_columns], x_test_copy_transformed], axis = 1)
    
    return train_f, test_f

In [15]:
def create_transition_matrix(year1_train, year1_test, both_years_train, both_years_test, year1, year2):
    modeling_columns_year1 = list(year1_train.columns)[6:]
    modeling_columns_both_years  = list(both_years_train)[6:]
    
    # Build models
    xgb_clf_1 = xgb.XGBClassifier(
        n_estimators = 91,
        max_depth = 6,
        gamma = 1.6323381093108538e-06,
        reg_alpha = 0.16790562748458127, 
        reg_lambda = 1.50865612415149e-06,
        subsample = 0.9988846052233982,
        colsample_bytree = 0.672719740846669,
        verbosity = 0,
        objective = 'binary:logistic',
        booster = 'gbtree',
        random_state = 7,
        n_jobs = -1
    )

    xgb_clf_2 = xgb.XGBClassifier(
        n_estimators = 91,
        max_depth = 6,
        gamma = 1.6323381093108538e-06,
        reg_alpha = 0.16790562748458127, 
        reg_lambda = 1.50865612415149e-06,
        subsample = 0.9988846052233982,
        colsample_bytree = 0.672719740846669,
        verbosity = 0,
        objective = 'binary:logistic',
        booster = 'gbtree',
        random_state = 7,
        n_jobs = -1
    )

    # Fit models
    xgb_clf_1.fit(year1_train[modeling_columns_year1], year1_train['loan_status'],
                eval_metric = 'auc',
                verbose = False
    )

    xgb_clf_2.fit(both_years_train[modeling_columns_both_years], both_years_train['loan_status'],
                eval_metric = 'auc',
                verbose = False
    )

    # Make predictions
    pred_year1 = xgb_clf_1.predict_proba(year1_test[modeling_columns_year1])[:, 1]
    pred_both_years = xgb_clf_2.predict_proba(both_years_test[modeling_columns_both_years])[:, 1]
    
    # Create the transition matrix
    def grader(year):
        if year >= 0 and year <= 0.20:
            grade = 'A' 
        elif year > 0.20 and year <= 0.40:
            grade = 'B' 
        elif year > 0.40 and year <= 0.60:
            grade = 'C' 
        elif year > 0.60 and year <= 0.80:
            grade = 'D' 
        else:
            grade = 'E'
        
        return grade
    
    transition_matrix = year1_test[['loan_status', 'id']]
    transition_matrix = pd.concat([transition_matrix, pd.Series(pred_year1, index = year1_test.index, name = year1)], axis = 1)
    transition_matrix[f"{year1}_grade"] = transition_matrix.apply(lambda row: grader(row[year1]), axis = 1)
    transition_matrix = pd.concat([transition_matrix, pd.Series(pred_both_years, index = year1_test.index, name = f"{year1}_{year2}")], axis = 1)
    transition_matrix[f"{year1}_{year2}_grade"] = transition_matrix.apply(lambda row: grader(row[f"{year1}_{year2}"]), axis = 1)

    transition_matrix_df = pd.DataFrame()
    for i in ['A', 'B', 'C', 'D', 'E']:
        temp = transition_matrix.loc[transition_matrix[f"{year1}_grade"] == i]
        transition_matrix_df = transition_matrix_df.append(temp[f"{year1}_{year2}_grade"].value_counts(normalize = True).round(3))
    transition_matrix_df = transition_matrix_df.fillna(0)
    transition_matrix_df.index = ['A', 'B', 'C', 'D', 'E']
    
    return transition_matrix_df

In [16]:
def transition_matrix(df, year1, year2):
    year1_balanced, both_years_balanced, test = transition_df_prep(df, year1, year2)
    year1_balanced_preprocessed, year1_test = transition_df_pipeline(year1_balanced, test)
    both_years_balanced_preprocessed, both_years_test = transition_df_pipeline(both_years_balanced, test)
    t_matrix = create_transition_matrix(year1_balanced_preprocessed, year1_test, both_years_balanced_preprocessed, both_years_test, str(year1), str(year2))
    return t_matrix

----
# Create Transition Matrices

In [17]:
years = [2013, 2014, 2015, 2016, 2017, 2018, 2019]
for i in years[:-1]:
    print(f"{i} - {i+1} Transition Matrix")
    display(transition_matrix(df_historical_clean, i, i+1))

2013 - 2014 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.548,0.38,0.059,0.011,0.001
B,0.093,0.576,0.297,0.033,0.002
C,0.014,0.218,0.585,0.177,0.007
D,0.004,0.052,0.338,0.568,0.038
E,0.003,0.022,0.131,0.639,0.205


2014 - 2015 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.639,0.318,0.038,0.005,0.0
B,0.083,0.624,0.273,0.019,0.0
C,0.008,0.154,0.646,0.189,0.004
D,0.001,0.026,0.246,0.667,0.06
E,0.002,0.007,0.069,0.53,0.393


2015 - 2016 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.744,0.239,0.015,0.002,0.0
B,0.081,0.714,0.194,0.011,0.0
C,0.003,0.142,0.713,0.138,0.003
D,0.001,0.012,0.21,0.706,0.071
E,0.001,0.003,0.027,0.37,0.6


2016 - 2017 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.753,0.229,0.016,0.002,0.0
B,0.096,0.714,0.179,0.011,0.0
C,0.004,0.169,0.704,0.121,0.002
D,0.001,0.015,0.246,0.689,0.049
E,0.0,0.003,0.021,0.414,0.561


2017 - 2018 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.717,0.266,0.014,0.002,0.0
B,0.109,0.685,0.192,0.013,0.0
C,0.007,0.196,0.648,0.145,0.004
D,0.001,0.022,0.244,0.672,0.061
E,0.001,0.008,0.031,0.444,0.515


2018 - 2019 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.662,0.3,0.033,0.005,0.0
B,0.12,0.608,0.239,0.031,0.002
C,0.011,0.169,0.597,0.214,0.01
D,0.003,0.02,0.235,0.65,0.093
E,0.001,0.005,0.043,0.399,0.551
