In [317]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

import warnings
warnings.filterwarnings("ignore")

----
# Read in Data

In [12]:
df_historical_clean = pd.read_csv('/Users/vinh/FS/thesis/data/df_historical_clean.csv')
df_historical_clean.loc[df_historical_clean['term'] == 36, 'term'] = 0
df_historical_clean.loc[df_historical_clean['term'] == 60, 'term'] = 1
df_historical_clean.loc[df_historical_clean['loan_status'] == 'Fully Paid', 'loan_status'] = 0
df_historical_clean.loc[df_historical_clean['loan_status'] == 'Charged Off', 'loan_status'] = 1
year = pd.to_datetime(df_historical_clean['issue_d']).dt.year
df_historical_clean.insert(2, 'year', year)

In [23]:
loans_by_year = pd.DataFrame(df_historical_clean['year'].value_counts(normalize = True))
loans_by_year = loans_by_year.sort_index()

In [311]:
loans_by_year

Unnamed: 0,year
2007,0.000135
2008,0.000839
2009,0.002534
2010,0.0062
2011,0.011673
2012,0.02868
2013,0.072451
2014,0.126625
2015,0.21649
2016,0.216634


----
# Functions to Create Transition Matrix

In [131]:
def transition_df_prep(df, year1, year2):
    df_year1 = df.loc[df['year'] == year1]
    df_both_years = df.loc[df['year'].isin([year1, year2])]
    
    X_year1 = df_year1.drop('loan_status', axis = 1)
    y_year1 = df_year1[['loan_status']]
    x_train_year1, x_test_year1, y_train_year1, y_test_year1 = train_test_split(X_year1, y_year1, test_size = 0.2, random_state = 1337, stratify = y_year1)
    temp = pd.concat([y_train_year1, x_train_year1], axis = 1)
    defaults_year1 = temp.loc[temp['loan_status'] == 1]
    completed_year1 = temp.loc[temp['loan_status'] == 0]
    defaults_train_year1, defaults_test_year1 = train_test_split(defaults_year1, test_size = 0.2, random_state = 1337)
    completed_train_year1, completed_test_year1 = train_test_split(completed_year1,
                                                                   test_size = defaults_train_year1.shape[0] / completed_year1.shape[0],
                                                                   random_state = 1337)
    year1_balanced = pd.concat([completed_test_year1, defaults_train_year1])
    year1_test = pd.concat([x_test_year1, y_test_year1], axis = 1)

    X_both_years = df_both_years.drop('loan_status', axis = 1)
    y_both_years = df_both_years[['loan_status']]
    x_train_both_years, x_test_both_years, y_train_both_years, y_test_both_years = train_test_split(X_both_years, y_both_years, test_size = 0.2, random_state = 1337, stratify = y_both_years)
    temp2 = pd.concat([y_train_both_years, x_train_both_years], axis = 1)
    defaults_both_years = temp2.loc[temp2['loan_status'] == 1]
    completed_both_years = temp2.loc[temp2['loan_status'] == 0]
    defaults_train_both_years, defaults_test_both_years = train_test_split(defaults_both_years, test_size = 0.2, random_state = 1337)
    completed_train_both_years, completed_test_both_years = train_test_split(completed_both_years,
                                                                             test_size = defaults_train_both_years.shape[0] / completed_both_years.shape[0],
                                                                             random_state = 1337)
    both_years_balanced = pd.concat([completed_test_both_years, defaults_train_both_years])
    
    return year1_balanced, both_years_balanced, year1_test

In [133]:
def transition_df_pipeline(train, x_test):
    train_copy = train.copy()
    x_test_copy = x_test.copy()
    
    # Imputations
    missing_cols = list(train_copy.columns[train_copy.isnull().any()])
    imputations_df = pd.DataFrame()
    for i in missing_cols:
        new_column = pd.Series(train_copy[i].median(), name = i)
        imputations_df = pd.concat([imputations_df, new_column], axis = 1)
    
    for i in missing_cols:
        train_copy.loc[train_copy[i].isna(), i] = imputations_df[i].item()
        x_test_copy.loc[x_test_copy[i].isna(), i] = imputations_df[i].item()
        
    # Normalization
    remove = ['loan_status', 'id', 'issue_d', 'year', 'grade', 'sub_grade', 'term']
    numerical_columns = list(train_copy.select_dtypes(include = ['float64', 'int64']).columns)
    numerical_columns = [x for x in numerical_columns if x not in remove]

    scaler = MinMaxScaler()
    train_copy[numerical_columns] = scaler.fit_transform(train_copy[numerical_columns])
    x_test_copy[numerical_columns] = scaler.transform(x_test_copy[numerical_columns])

    # One hot encoding
    categorical_columns = list(train_copy.select_dtypes(include = ['object']).columns)
    categorical_columns = [x for x in categorical_columns if x not in remove]
    categorical_columns.remove('initial_list_status')
    
    transformers = [
    ('', OneHotEncoder(handle_unknown = 'ignore', sparse = False), categorical_columns)
    ]
    ct = ColumnTransformer(transformers)
    
    train_copy_transformed = ct.fit_transform(train_copy)
    feature_names = ct.get_feature_names_out()  
    train_copy_transformed = pd.DataFrame(train_copy_transformed, columns = feature_names).set_index(train_copy.index)
    train_copy_transformed.columns = train_copy_transformed.columns.str[2:]
    
    x_test_copy_transformed = ct.transform(x_test_copy)
    x_test_copy_transformed = pd.DataFrame(x_test_copy_transformed, columns = feature_names).set_index(x_test_copy.index)
    x_test_copy_transformed.columns = x_test_copy_transformed.columns.str[2:]    
    
    # Concatenate the final dataframe
    train_f = pd.concat([train_copy[remove + numerical_columns], train_copy_transformed], axis = 1)
    test_f = pd.concat([x_test_copy[remove + numerical_columns], x_test_copy_transformed], axis = 1)
    
    return train_f, test_f

In [305]:
def create_transition_matrix(year1_train, year1_test, both_years_train, both_years_test, year1, year2):
    modeling_columns_year1 = list(year1_train.columns)[6:]
    modeling_columns_both_years  = list(both_years_train)[6:]
    
    # Build models
    xgb_clf_1 = xgb.XGBClassifier(
        n_estimators = 99,
        max_depth = 5,
        gamma = 0.00011724653799472996,
        reg_alpha = 0.9735500996403061, 
        reg_lambda = 1.881692086718138e-07,
        subsample = 0.9550961565694804,
        colsample_bytree = 0.5414369781235469,
        verbosity = 0,
        objective = 'binary:logistic',
        booster = 'gbtree',
        random_state = 7,
        n_jobs = -1
    )

    xgb_clf_2 = xgb.XGBClassifier(
        n_estimators = 99,
        max_depth = 5,
        gamma = 0.00011724653799472996,
        reg_alpha = 0.9735500996403061, 
        reg_lambda = 1.881692086718138e-07,
        subsample = 0.9550961565694804,
        colsample_bytree = 0.5414369781235469,
        verbosity = 0,
        objective = 'binary:logistic',
        booster = 'gbtree',
        random_state = 7,
        n_jobs = -1
    )

    # Fit models
    xgb_clf_1.fit(year1_train[modeling_columns_year1], year1_train['loan_status'],
                eval_metric = 'auc',
                verbose = False
    )

    xgb_clf_2.fit(both_years_train[modeling_columns_both_years], both_years_train['loan_status'],
                eval_metric = 'auc',
                verbose = False
    )

    # Make predictions
    pred_year1 = xgb_clf_1.predict_proba(year1_test[modeling_columns_year1])[:, 1]
    pred_both_years = xgb_clf_2.predict_proba(both_years_test[modeling_columns_both_years])[:, 1]
    
    # Create the transition matrix
    def grader(year):
        if year >= 0 and year <= 0.20:
            grade = 'A' 
        elif year > 0.20 and year <= 0.40:
            grade = 'B' 
        elif year > 0.40 and year <= 0.60:
            grade = 'C' 
        elif year > 0.60 and year <= 0.80:
            grade = 'D' 
        else:
            grade = 'E'
        
        return grade
    
    transition_matrix = year1_test[['loan_status', 'id']]
    transition_matrix = pd.concat([transition_matrix, pd.Series(pred_year1, index = year1_test.index, name = year1)], axis = 1)
    transition_matrix[f"{year1}_grade"] = transition_matrix.apply(lambda row: grader(row[year1]), axis = 1)
    transition_matrix = pd.concat([transition_matrix, pd.Series(pred_both_years, index = year1_test.index, name = f"{year1}_{year2}")], axis = 1)
    transition_matrix[f"{year1}_{year2}_grade"] = transition_matrix.apply(lambda row: grader(row[f"{year1}_{year2}"]), axis = 1)

    transition_matrix_df = pd.DataFrame()
    for i in ['A', 'B', 'C', 'D', 'E']:
        temp = transition_matrix.loc[transition_matrix[f"{year1}_grade"] == i]
        transition_matrix_df = transition_matrix_df.append(temp[f"{year1}_{year2}_grade"].value_counts(normalize = True))
    transition_matrix_df = transition_matrix_df.fillna(0)
    transition_matrix_df.index = ['A', 'B', 'C', 'D', 'E']
    
    return transition_matrix_df

In [308]:
def transition_matrix(df, year1, year2):
    year1_balanced, both_years_balanced, test = transition_df_prep(df, year1, year2)
    year1_balanced_preprocessed, year1_test = transition_df_pipeline(year1_balanced, test)
    both_years_balanced_preprocessed, both_years_test = transition_df_pipeline(both_years_balanced, test)
    t_matrix = create_transition_matrix(year1_balanced_preprocessed, year1_test, both_years_balanced_preprocessed, both_years_test, str(year1), str(year2))
    return t_matrix

----
# Create Transition Matrices

In [318]:
years = [2013, 2014, 2015, 2016, 2017, 2018, 2019]
for i in years[:-1]:
    print(f"{i} - {i+1} Transition Matrix")
    display(transition_matrix(df_historical_clean, i, i+1))

2013 - 2014 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.624773,0.3498,0.023611,0.001816,0.0
B,0.085888,0.625791,0.273601,0.014355,0.000365
C,0.004928,0.198664,0.632461,0.160881,0.003066
D,0.001347,0.028788,0.321044,0.620202,0.02862
E,0.001088,0.004353,0.069641,0.722524,0.202394


2014 - 2015 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.703417,0.284378,0.011595,0.00061,0.0
B,0.078302,0.679606,0.233202,0.008667,0.000222
C,0.002901,0.147741,0.68841,0.159405,0.001543
D,0.000354,0.008316,0.217553,0.721048,0.052729
E,0.0,0.0,0.024958,0.507488,0.467554


2015 - 2016 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.760514,0.235397,0.003699,0.000389,0.0
B,0.060256,0.762925,0.174054,0.002765,0.0
C,0.001165,0.130168,0.746186,0.121783,0.000699
D,0.00016,0.004491,0.176486,0.76358,0.055282
E,0.0,0.000985,0.009189,0.339678,0.650148


2016 - 2017 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.782187,0.210259,0.007554,0.0,0.0
B,0.080042,0.762434,0.153806,0.003638,8e-05
C,0.001267,0.149432,0.744215,0.10423,0.000857
D,6.1e-05,0.004781,0.21232,0.743549,0.039289
E,0.0,0.000279,0.005294,0.405405,0.589022


2017 - 2018 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.778075,0.21565,0.006009,0.000267,0.0
B,0.090734,0.732216,0.172349,0.004531,0.00017
C,0.001734,0.169167,0.705822,0.122434,0.000844
D,7.2e-05,0.007553,0.222774,0.725651,0.043951
E,0.0,0.000802,0.010429,0.437625,0.551143


2018 - 2019 Transition Matrix


Unnamed: 0,A,B,C,D,E
A,0.702377,0.283698,0.013554,0.000371,0.0
B,0.100802,0.665867,0.220695,0.012358,0.000277
C,0.003972,0.151911,0.649346,0.190717,0.004054
D,0.000435,0.011841,0.206192,0.703422,0.07811
E,0.0,0.00261,0.014614,0.367432,0.615344
