In [113]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')
import pickle

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

import tensorflow as tf
from tensorflow.keras.models import load_model
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.backend import clear_session

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score

In [12]:
# Load the cleaned historical loans, then recode `term` and `loan_status` as 0/1 flags.
df_historical_clean = pd.read_csv('/Users/vinh/FS/thesis/data/df_historical_clean.csv')

# (column, original value, encoded value) -- applied in order, in place.
recodings = [
    ('term', 36, 0),
    ('term', 60, 1),
    ('loan_status', 'Fully Paid', 0),
    ('loan_status', 'Charged Off', 1),
]
for column, old_value, new_value in recodings:
    matches = df_historical_clean[column] == old_value
    df_historical_clean.loc[matches, column] = new_value

# Derive the issue year from `issue_d` and place it as the third column.
year = pd.to_datetime(df_historical_clean['issue_d']).dt.year
df_historical_clean.insert(2, 'year', year)

In [23]:
# Fraction of all loans that falls in each year, as a one-column frame sorted by year.
loans_by_year = (
    df_historical_clean['year']
    .value_counts(normalize = True)
    .to_frame()
    .sort_index()
)

In [85]:
def _balanced_training_frame(df):
    """Split `df` 80/20 and build a class-balanced frame from the training part.

    Steps (all splits use random_state = 1337):
      1. Stratified 80/20 train/test split on `loan_status`.
      2. Within the training part, keep 80% of the defaults (`loan_status == 1`);
         the remaining 20% of defaults is not used by this function.
      3. Draw exactly as many completed loans (`loan_status == 0`) as defaults kept.

    Returns:
        (balanced, x_test, y_test) where `balanced` has `loan_status` as its
        first column (completed rows first, then defaults), and `x_test` /
        `y_test` are the held-out features and single-column label frame.

    Raises:
        ValueError: propagated from `train_test_split`, e.g. when the training
        part does not contain more completed loans than kept defaults.
    """
    features = df.drop('loan_status', axis = 1)
    labels = df[['loan_status']]
    x_train, x_test, y_train, y_test = train_test_split(features, labels, test_size = 0.2, random_state = 1337, stratify = labels)

    train = pd.concat([y_train, x_train], axis = 1)
    defaults = train.loc[train['loan_status'] == 1]
    completed = train.loc[train['loan_status'] == 0]

    defaults_kept, _ = train_test_split(defaults, test_size = 0.2, random_state = 1337)
    # Pass the sample size as an integer count: a float fraction is multiplied
    # back by the row count and rounded up inside train_test_split, which can
    # yield one completed loan too many because of floating-point error.
    _, completed_kept = train_test_split(completed,
                                         test_size = defaults_kept.shape[0],
                                         random_state = 1337)

    balanced = pd.concat([completed_kept, defaults_kept])
    return balanced, x_test, y_test


def transition_df_prep(df, year1, year2):
    """Prepare balanced training frames for `year1` alone and for `year1` + `year2`.

    Args:
        df: frame with at least a `year` column and a 0/1 `loan_status` column.
        year1: year used for the single-year training frame and the test split.
        year2: second year added for the combined training frame.

    Returns:
        (year1_balanced, both_years_balanced, x_test_year1, y_test_year1).
        The test split always comes from `year1` only; the test split of the
        combined frame is discarded.
    """
    df_year1 = df.loc[df['year'] == year1]
    df_both_years = df.loc[df['year'].isin([year1, year2])]

    year1_balanced, x_test_year1, y_test_year1 = _balanced_training_frame(df_year1)
    both_years_balanced, _, _ = _balanced_training_frame(df_both_years)

    return year1_balanced, both_years_balanced, x_test_year1, y_test_year1

In [103]:
# Balanced training frames for 2013 alone and for 2013 + 2014, plus the held-out 2013 test features/labels.
train_balanced_2013, train_balanced_2013_2014, x_test_2013, y_test_2013 = transition_df_prep(df_historical_clean, 2013, 2014)

In [114]:
def transition_df_pipeline(train, x_test):
    """Impute missing values and min-max scale numeric features, fitting on `train` only.

    Neither input is modified; processed copies are returned.

    Imputation: every `train` column that contains missing values is filled
    with its training median, and the same median is applied to `x_test`.
    Numeric (`float64` / `int64`) columns that have gaps only in `x_test` are
    also filled with the training median so no NaN reaches the scaler.

    Normalization: all `float64` / `int64` columns of `train` except those in
    `remove` are scaled with a `MinMaxScaler` fitted on the training copy.

    Args:
        train: training frame (features, and possibly `loan_status`).
        x_test: test features; must contain every column that gets scaled.

    Returns:
        (train_copy, x_test_copy): the processed copies.
    """
    train_copy = train.copy()
    x_test_copy = x_test.copy()

    # Filling a float column with its median does not change its dtype, so the
    # numeric column list can be computed once, before imputation.
    numeric_train_cols = list(train_copy.select_dtypes(include = ['float64', 'int64']).columns)

    # Imputations
    train_gap_cols = list(train_copy.columns[train_copy.isnull().any()])
    test_only_gap_cols = [
        col for col in x_test_copy.columns[x_test_copy.isnull().any()]
        if col in numeric_train_cols and col not in train_gap_cols
    ]
    # All medians come from the training data (no leakage from the test set).
    medians = {col: train_copy[col].median() for col in train_gap_cols + test_only_gap_cols}

    for col in train_gap_cols:
        train_copy.loc[train_copy[col].isna(), col] = medians[col]
    for col, fill_value in medians.items():
        # A train-only column (e.g. the label) has nothing to fill in x_test.
        if col in x_test_copy.columns:
            x_test_copy.loc[x_test_copy[col].isna(), col] = fill_value

    # Normalization
    remove = ['loan_status', 'id', 'issue_d', 'year', 'grade', 'sub_grade', 'term']
    numerical_columns = [col for col in numeric_train_cols if col not in remove]

    # Skip scaling when there is nothing to scale (the scaler rejects empty input).
    if numerical_columns:
        scaler = MinMaxScaler()
        train_copy[numerical_columns] = scaler.fit_transform(train_copy[numerical_columns])
        x_test_copy[numerical_columns] = scaler.transform(x_test_copy[numerical_columns])

    # One hot encoding
    # TODO: not implemented yet -- non-numeric columns are returned unchanged.

    return train_copy, x_test_copy

In [115]:
# Run imputation + scaling on the balanced 2013 training frame and the 2013 test features.
# `a` is the processed training frame, `b` the processed test features.
a, b = transition_df_pipeline(train_balanced_2013, x_test_2013)

In [36]:
# One-row frame holding the median of every df_2014 column that has missing values.
# Built in a single constructor call instead of growing a frame with concat in a loop.
# NOTE(review): df_2014 is not defined in any cell visible here -- confirm it is
# created before this cell on a fresh kernel.
imputations_df = pd.DataFrame({
    col: [df_2014[col].median()]
    for col in df_2014.columns[df_2014.isnull().any()]
})

In [38]:
# Median used to impute `emp_length`; the column holds a single value, so .item() returns it as a scalar.
imputations_df['emp_length'].item()

7.0

In [34]:
# Names of the df_2014 columns that contain at least one missing value.
df_2014.columns[df_2014.isnull().any()]


Index(['emp_length', 'revol_util'], dtype='object')

In [None]:
def data_process_pipeline(df, test_set = False):
    """Unfinished placeholder for a data-processing pipeline.

    The original cell contained only the signature, which is a syntax error.
    Until the body is written, calling this raises instead of silently
    returning None.

    Args:
        df: frame to process.
        test_set: flag for test-set processing; its intended effect is not
            defined yet.

    Raises:
        NotImplementedError: always.
    """
    raise NotImplementedError('data_process_pipeline has not been implemented yet')

In [31]:
# Number of missing values in each column of df_2014.
df_2014.isna().sum()

id                          0
issue_d                     0
year                        0
grade                       0
sub_grade                   0
loan_status                 0
loan_amnt                   0
term                        0
int_rate                    0
installment                 0
emp_length              12019
home_ownership              0
annual_inc                  0
verification_status         0
purpose                     0
addr_state                  0
dti                         0
earliest_cr_line            0
fico_range_low              0
open_acc                    0
pub_rec                     0
revol_bal                   0
revol_util                125
initial_list_status         0
application_type            0
mort_acc                    0
pub_rec_bankruptcies        0
dtype: int64

In [2]:
# Read in Data
# Columns removed from every feature matrix (the label plus columns not used as features).
to_drop = ['loan_status', 'id', 'issue_d', 'year', 'grade', 'sub_grade']


def _features_and_target(frame):
    """Return (frame without the `to_drop` columns, single-column `loan_status` frame)."""
    return frame.drop(to_drop, axis = 1), frame[['loan_status']]


# Training split
train_balanced = pd.read_csv('/Users/vinh/FS/thesis/data/train_balanced.csv')
x_train_balanced, y_train_balanced = _features_and_target(train_balanced)

# Validation split
val_final = pd.read_csv('/Users/vinh/FS/thesis/data/val_final.csv')
x_val, y_val = _features_and_target(val_final)

# Test split
test_final = pd.read_csv('/Users/vinh/FS/thesis/data/test_final.csv')
x_test, y_test = _features_and_target(test_final)

In [None]:
train_2