# Models

In [384]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import math
import datetime

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.metrics import r2_score
from sklearn.preprocessing import Imputer

from sklearn.linear_model import LassoCV
from sklearn.linear_model import RidgeCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier

import statsmodels.api as sm
from statsmodels.api import OLS

In [288]:
def display_df(df, nrows=5, ncols=None):
    with pd.option_context('display.max_rows', nrows, 'display.max_columns', ncols):
        display (df)
#gets ratio of NaNs for each column
def stats_NaN(df):
    df_stats = pd.DataFrame(index=[df.columns], columns=["NaN Ratio"])
    for col in df.columns:
        df_stats["NaN Ratio"][col] = df[col].isna().sum()/len(df) #NaN ratio
    return df_stats.sort_values(by=['NaN Ratio'])

In [102]:
df_whole = pd.read_csv("../data/data_clean/clean_accepted_2007_to_2018Q2.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [104]:
stats_nan = stats_NaN(df_whole)

In [105]:
display_df(stats_nan, None)

Unnamed: 0,NaN Ratio
addr_state_DC,0.0
purpose_debt_consolidation,0.0
purpose_educational,0.0
purpose_home_improvement,0.0
purpose_house,0.0
purpose_major_purchase,0.0
purpose_medical,0.0
purpose_moving,0.0
purpose_other,0.0
purpose_credit_card,0.0


In [264]:
our_drop_list = ['funded_amnt','funded_amnt_inv','int_rate','installment','grade',
                 'pymnt_plan','zip_code','initial_list_status','out_prncp', 'application_type', 'policy_code',
                 'out_prncp_inv','total_pymnt','total_pymnt_inv','total_rec_prncp','total_rec_int',
                 'total_rec_late_fee','recoveries','collection_recovery_fee','last_pymnt_d','last_pymnt_amnt',
                 'next_pymnt_d','last_credit_pull_d','last_fico_range_high','last_fico_range_low',
                 'collections_12_mths_ex_med','mths_since_last_major_derog','acc_now_delinq','tot_coll_amt',
                 'tot_cur_bal','open_acc_6m','open_act_il','open_il_12m','open_il_24m','mths_since_rcnt_il',
                 'total_bal_il','il_util','open_rv_12m','open_rv_24m','max_bal_bc','all_util','total_rev_hi_lim',
                 'inq_fi','total_cu_tl','inq_last_12m','acc_open_past_24mths','avg_cur_bal','bc_open_to_buy',
                 'bc_util','chargeoff_within_12_mths','delinq_amnt','mo_sin_old_il_acct','mo_sin_old_rev_tl_op',
                 'mo_sin_rcnt_rev_tl_op','mo_sin_rcnt_tl','mort_acc','mths_since_recent_bc',
                 'mths_since_recent_bc_dlq','mths_since_recent_inq','mths_since_recent_revol_delinq',
                 'num_accts_ever_120_pd','num_actv_bc_tl','num_actv_rev_tl','num_bc_sats','num_bc_tl',
                 'num_il_tl','num_op_rev_tl','num_rev_accts','num_rev_tl_bal_gt_0','num_sats','num_tl_120dpd_2m',
                 'num_tl_30dpd','num_tl_90g_dpd_24m','num_tl_op_past_12m','pct_tl_nvr_dlq','percent_bc_gt_75',
                 'pub_rec_bankruptcies','tax_liens','tot_hi_cred_lim','total_bal_ex_mort','total_bc_limit',
                 'total_il_high_credit_limit','revol_bal_joint','sec_app_fico_range_high',
                 'sec_app_earliest_cr_line','sec_app_inq_last_6mths','sec_app_mort_acc','sec_app_open_acc',
                 'sec_app_revol_util','sec_app_open_act_il','sec_app_num_rev_accts',
                 'sec_app_chargeoff_within_12_mths','sec_app_collections_12_mths_ex_med',
                 'sec_app_mths_since_last_major_derog','hardship_flag','hardship_type','hardship_reason',
                 'hardship_status','deferral_term','hardship_amount','hardship_start_date','hardship_end_date',
                 'payment_plan_start_date','hardship_length','hardship_dpd','hardship_loan_status',
                 'orig_projected_additional_accrued_interest','hardship_payoff_balance_amount',
                 'hardship_last_payment_amount','disbursement_method','debt_settlement_flag',
                 'debt_settlement_flag_date','settlement_status','settlement_date','settlement_amount',
                 'settlement_percentage','settlement_term']

In [265]:
df_less_feats = df_whole.drop(columns=our_drop_list)

In [266]:
display_df(df_less_feats)

Unnamed: 0,loan_amnt,term,sub_grade,emp_length,annual_inc,issue_d,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,annual_inc_joint,dti_joint,verification_status_joint,sec_app_fico_range_low,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,loan_status_Charged Off,loan_status_Current,loan_status_Default,loan_status_Does not meet the credit policy. Status:Charged Off,loan_status_Does not meet the credit policy. Status:Fully Paid,loan_status_Fully Paid,loan_status_In Grace Period,loan_status_Late (16-30 days),loan_status_Late (31-120 days),purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,addr_state_AK,addr_state_AL,addr_state_AR,addr_state_AZ,addr_state_CA,addr_state_CO,addr_state_CT,addr_state_DC,addr_state_DE,addr_state_FL,addr_state_GA,addr_state_HI,addr_state_IA,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,num_grade
0,15000.0,1.0,C1,10.0,78000.0,2014-12-01,0.1203,0.0,1994-08-01,750.0,754.0,0.0,800.0,800.0,6.0,0.0,138008.0,29.0,17.0,,,,,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0
1,10400.0,0.0,A3,8.0,58000.0,2014-12-01,0.1492,0.0,1989-09-01,710.0,714.0,2.0,42.0,800.0,17.0,0.0,6133.0,31.6,36.0,,,,,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004087,12000.0,0.0,B3,8.0,36000.0,2018-01-01,0.1110,1.0,1998-05-01,685.0,689.0,0.0,21.0,800.0,14.0,0.0,11648.0,43.6,18.0,,,,,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.4
2004088,14000.0,0.0,C2,2.0,80000.0,2018-01-01,0.0135,0.0,2007-07-01,660.0,664.0,1.0,31.0,800.0,11.0,0.0,1461.0,4.1,21.0,,,,,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.2


## Data Augmentation

There might be some value in having a secondary applicant. We will create a new variable from a check on `sec_app_fico_range_low` to detect a secondary applicant. We will also drop the other high NaN proportion features.

In [268]:
def add_secondary(df, label='sec_app_fico_range_low'):
    df['secondary'] = df[label].apply(lambda x: int(not pd.isnull(x)))

In [269]:
add_secondary(df_less_feats)

In [270]:
drop_high_nan_feats = ['annual_inc_joint', 'dti_joint','verification_status_joint','sec_app_fico_range_low']
df_less_feats.drop(columns=drop_high_nan_feats, inplace=True)

In [271]:
display_df(df_less_feats)

Unnamed: 0,loan_amnt,term,sub_grade,emp_length,annual_inc,issue_d,dti,delinq_2yrs,earliest_cr_line,fico_range_low,fico_range_high,inq_last_6mths,mths_since_last_delinq,mths_since_last_record,open_acc,pub_rec,revol_bal,revol_util,total_acc,home_ownership_ANY,home_ownership_MORTGAGE,home_ownership_NONE,home_ownership_OTHER,home_ownership_OWN,home_ownership_RENT,verification_status_Not Verified,verification_status_Source Verified,verification_status_Verified,loan_status_Charged Off,loan_status_Current,loan_status_Default,loan_status_Does not meet the credit policy. Status:Charged Off,loan_status_Does not meet the credit policy. Status:Fully Paid,loan_status_Fully Paid,loan_status_In Grace Period,loan_status_Late (16-30 days),loan_status_Late (31-120 days),purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_educational,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,addr_state_AK,addr_state_AL,addr_state_AR,addr_state_AZ,addr_state_CA,addr_state_CO,addr_state_CT,addr_state_DC,addr_state_DE,addr_state_FL,addr_state_GA,addr_state_HI,addr_state_IA,addr_state_ID,addr_state_IL,addr_state_IN,addr_state_KS,addr_state_KY,addr_state_LA,addr_state_MA,addr_state_MD,addr_state_ME,addr_state_MI,addr_state_MN,addr_state_MO,addr_state_MS,addr_state_MT,addr_state_NC,addr_state_ND,addr_state_NE,addr_state_NH,addr_state_NJ,addr_state_NM,addr_state_NV,addr_state_NY,addr_state_OH,addr_state_OK,addr_state_OR,addr_state_PA,addr_state_RI,addr_state_SC,addr_state_SD,addr_state_TN,addr_state_TX,addr_state_UT,addr_state_VA,addr_state_VT,addr_state_WA,addr_state_WI,addr_state_WV,addr_state_WY,num_grade,secondary
0,15000.0,1.0,C1,10.0,78000.0,2014-12-01,0.1203,0.0,1994-08-01,750.0,754.0,0.0,800.0,800.0,6.0,0.0,138008.0,29.0,17.0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,2.0,0
1,10400.0,0.0,A3,8.0,58000.0,2014-12-01,0.1492,0.0,1989-09-01,710.0,714.0,2.0,42.0,800.0,17.0,0.0,6133.0,31.6,36.0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2004087,12000.0,0.0,B3,8.0,36000.0,2018-01-01,0.1110,1.0,1998-05-01,685.0,689.0,0.0,21.0,800.0,14.0,0.0,11648.0,43.6,18.0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.4,0
2004088,14000.0,0.0,C2,2.0,80000.0,2018-01-01,0.0135,0.0,2007-07-01,660.0,664.0,1.0,31.0,800.0,11.0,0.0,1461.0,4.1,21.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.2,0


As oppose to a range for FICO scores, we think it would be better to model on the average.

In [272]:
df_less_feats['fico_avg'] = df_less_feats[['fico_range_low', 'fico_range_high']].mean(axis=1)
df_less_feats.drop(columns=['fico_range_low', 'fico_range_high'], inplace=True)

`earliest_cr_line` may not be itself a useful feature (a date with no context). However it can tell us how long a person has had a credit line open, which is probably useful.

In [273]:
def timedelta_to_day(t):
    if not pd.isnull(t):
        if isinstance(t, int):
            return t
        else:
            return t.days
    else:
        return np.nan

In [274]:
df_less_feats['issue_d'] = pd.to_datetime(df_less_feats['issue_d'])
df_less_feats['earliest_cr_line'] = pd.to_datetime(df_less_feats['earliest_cr_line'])
df_less_feats['cr_line_hist'] = df_less_feats['issue_d'] - pd.to_datetime(df_less_feats['earliest_cr_line'])
df_less_feats['cr_line_hist'] = df_less_feats['cr_line_hist'].apply(timedelta_to_day)

In [275]:
df_less_feats.drop(columns=['earliest_cr_line'],inplace=True)

## Data Prep

We need to deal with NaNs. We can drop the samples for NaN features when the NaN ratio of that feature is small. For the rest, which is just `emp_length`, we'll do mean imputation to keep things simple.

In [276]:
stats_nan_less = stats_NaN(df_less_feats)

In [277]:
display_df(stats_nan_less,None)

Unnamed: 0,NaN Ratio
addr_state_AZ,0.0
addr_state_IL,0.0
addr_state_ID,0.0
addr_state_IA,0.0
addr_state_HI,0.0
addr_state_GA,0.0
addr_state_FL,0.0
addr_state_DE,0.0
addr_state_DC,0.0
addr_state_IN,0.0


In [278]:
df_less_feats.dropna(subset=['issue_d','annual_inc','delinq_2yrs','total_acc','open_acc','pub_rec','cr_line_hist',
               'inq_last_6mths','dti','revol_util'],inplace=True)

In [279]:
stats_nan_less = stats_NaN(df_less_feats)
display_df(stats_nan_less,None)

Unnamed: 0,NaN Ratio
loan_amnt,0.0
addr_state_MT,0.0
addr_state_MS,0.0
addr_state_MO,0.0
addr_state_MN,0.0
addr_state_MI,0.0
addr_state_ME,0.0
addr_state_MD,0.0
addr_state_MA,0.0
addr_state_LA,0.0


Most models can't handle datetime objects. So we will convert this to a float.

In [280]:
df_less_feats['issue_d']=df_less_feats['issue_d'].map(datetime.datetime.toordinal)

In [310]:
target = 'num_grade'
target_class = 'sub_grade'
df_train, df_test = train_test_split(df_less_feats, random_state=9001, test_size=0.2, 
                                     stratify=df_less_feats[target_class])

In [311]:
cols_to_imp = ['emp_length']

In [312]:
imp_mean = Imputer(copy=True, missing_values=np.nan, strategy='mean').fit(df_train[cols_to_imp]) #fit to training data
df_train[cols_to_imp] = imp_mean.transform(df_train[cols_to_imp])
df_test[cols_to_imp] = imp_mean.transform(df_test[cols_to_imp])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [313]:
display_df(stats_NaN(df_train),None) #should be all zeros

Unnamed: 0,NaN Ratio
loan_amnt,0
addr_state_MT,0
addr_state_MS,0
addr_state_MO,0
addr_state_MN,0
addr_state_MI,0
addr_state_ME,0
addr_state_MD,0
addr_state_MA,0
addr_state_LA,0


In [315]:
x_cols = list(set([target,target_class]).symmetric_difference(list(df_train.columns)))
x_train, x_test = df_train[x_cols], df_test[x_cols]
y_train, y_test = df_train[target], df_test[target]
y_train_class, y_test_class = df_train[target_class], df_test[target_class]

Now we can standardize the data.

In [316]:
def standardize(x, x_ref, labels):
    std = np.std(x_ref[labels])
    mean = np.mean(x_ref[labels])
    x_std = (x[labels] - mean)/std
    return x_std

In [317]:
std_labels = ['loan_amnt','total_acc','revol_util','revol_bal','pub_rec','open_acc','mths_since_last_record',
             'mths_since_last_delinq','inq_last_6mths','delinq_2yrs','dti','annual_inc','fico_avg',
             'cr_line_hist','emp_length','issue_d']
x_train_unstand = x_train.copy()
x_train_std = x_train.copy()
x_train_std[std_labels] = standardize(x_train, x_train_unstand, std_labels)

x_test_unstand = x_test.copy()
x_test_std = x_test.copy()
x_test_std[std_labels] = standardize(x_test, x_train_unstand, std_labels)

In [318]:
display_df(x_test_std, 10)

Unnamed: 0,home_ownership_NONE,home_ownership_OWN,addr_state_VT,addr_state_NJ,issue_d,verification_status_Not Verified,loan_status_Late (31-120 days),purpose_home_improvement,addr_state_CT,addr_state_AR,loan_status_Current,addr_state_OK,loan_status_Charged Off,loan_status_Does not meet the credit policy. Status:Charged Off,addr_state_NE,addr_state_RI,addr_state_MI,addr_state_CA,addr_state_MS,pub_rec,addr_state_KY,annual_inc,delinq_2yrs,loan_status_Fully Paid,addr_state_IA,open_acc,addr_state_WI,emp_length,purpose_educational,term,loan_amnt,dti,addr_state_DE,addr_state_HI,addr_state_NM,addr_state_KS,addr_state_IL,mths_since_last_record,addr_state_PA,revol_bal,purpose_credit_card,addr_state_WA,addr_state_OH,revol_util,addr_state_TN,secondary,home_ownership_OTHER,addr_state_SC,loan_status_Does not meet the credit policy. Status:Fully Paid,inq_last_6mths,addr_state_ID,loan_status_Late (16-30 days),verification_status_Verified,addr_state_WV,purpose_moving,cr_line_hist,fico_avg,addr_state_VA,addr_state_GA,addr_state_AL,addr_state_MN,home_ownership_ANY,addr_state_AK,purpose_debt_consolidation,purpose_medical,addr_state_FL,addr_state_LA,home_ownership_MORTGAGE,addr_state_NC,loan_status_In Grace Period,addr_state_ME,purpose_car,addr_state_IN,addr_state_MA,loan_status_Default,addr_state_MT,addr_state_DC,addr_state_OR,addr_state_ND,purpose_major_purchase,addr_state_AZ,total_acc,addr_state_MD,purpose_vacation,addr_state_UT,addr_state_WY,mths_since_last_delinq,addr_state_NV,purpose_house,addr_state_TX,purpose_renewable_energy,addr_state_SD,verification_status_Source Verified,addr_state_NH,purpose_wedding,addr_state_MO,addr_state_CO,purpose_small_business,home_ownership_RENT,addr_state_NY,purpose_other
1955928,0,0,0,0,1.218230,1,0,0,1,0,1,0,0,0,0,0,0,0,0,-0.350171,0,-0.029497,-0.359691,0,0,-0.648512,0,-2.863524e-01,0,0.0,-0.543836,-0.473508,0,0,0,0,0,0.441931,0,0.038388,0,0,0,1.450860,0,0,0,0,0,-0.658735,0,0,0,0,0,-1.418433,-0.695487,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1.369610,0,0,0,0,0.986977,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1033990,0,1,0,0,0.625655,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.350171,0,0.079481,0.774675,0,0,0.066341,0,-1.309037e-12,0,1.0,1.179385,0.210531,0,0,0,0,0,0.441931,0,0.715214,0,0,0,0.751638,0,0,0,0,0,0.446714,0,0,1,0,0,2.242055,-0.695487,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0.973425,0,0,0,0,-1.046213,0,0,0,0,0,0,0,0,0,0,0,0,0,0
668234,0,0,0,0,-0.611449,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.350171,0,-0.201714,-0.359691,0,0,-1.542078,0,1.131624e+00,0,0.0,0.229404,-1.024518,0,0,0,0,0,0.441931,0,-0.406442,0,0,0,1.893972,0,0,0,0,0,-0.658735,0,0,1,0,0,1.598929,-0.848969,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-1.034891,0,0,0,0,-0.913103,0,0,1,0,0,0,0,0,0,0,0,0,0,0
361666,0,0,0,1,-0.117906,0,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.350171,0,-0.126398,0.774675,0,0,-0.827225,0,-1.562531e+00,0,1.0,0.008479,-0.481066,0,0,0,0,0,0.441931,0,-0.155544,0,0,0,0.109328,0,0,0,0,0,-0.658735,0,0,1,0,0,2.121804,-0.235039,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.114413,0,0,0,0,-1.082753,0,0,0,0,0,0,0,0,0,0,0,0,0,0
972280,0,0,0,0,-2.095323,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.350171,0,-0.029497,-0.359691,1,0,-0.112372,0,5.644333e-01,0,1.0,0.737534,-0.495427,0,0,0,0,1,0.441931,0,-0.078982,0,0,0,-0.016695,0,0,0,0,0,0.446714,0,0,1,0,0,-0.079571,0.225409,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.616492,0,0,0,0,0.986977,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677515,0,0,0,0,1.117574,1,0,0,0,0,1,0,0,0,0,0,0,0,0,-0.350171,0,-0.144308,-0.359691,0,0,-0.291085,0,-1.137138e+00,0,0.0,-1.040919,0.007209,0,0,0,0,0,0.441931,0,-0.387453,0,0,0,-1.260661,0,0,0,0,0,-0.658735,0,0,0,0,0,-0.863706,0.839340,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.030733,0,0,0,0,0.986977,0,0,0,0,0,0,0,0,0,0,0,1,0,0
324922,0,0,0,0,-0.117906,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.341702,0,-0.109865,-0.359691,1,0,-0.112372,0,5.644333e-01,0,0.0,-0.543836,-0.502986,0,0,0,0,0,-2.285250,0,-0.471420,0,0,0,-0.195565,0,0,0,0,0,-0.658735,0,0,0,0,0,-0.765286,-1.002452,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0.387666,0,0,0,0,-0.889613,0,0,0,0,0,1,0,0,0,0,0,0,0,1
36925,0,0,0,0,-0.760810,0,0,0,0,0,0,0,1,0,0,0,1,0,0,-0.350171,0,-0.086903,0.774675,0,0,-0.827225,0,1.131624e+00,0,0.0,1.113107,-0.485601,0,0,0,0,0,0.441931,0,0.116519,0,0,0,-0.089869,0,0,0,0,0,-0.658735,0,0,0,0,0,1.718463,-0.695487,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.114413,0,0,0,0,-1.067093,0,0,0,0,0,1,0,0,0,0,0,0,0,0
1417463,0,0,0,0,0.869179,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1.341702,0,-0.660959,-0.359691,0,0,-0.827225,0,-1.309037e-12,0,0.0,-1.096150,0.850730,0,0,0,0,0,-2.222259,0,-0.611697,0,0,1,0.044284,0,0,0,0,0,0.446714,0,0,1,0,0,0.007396,-1.155934,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-0.867531,0,0,0,0,-1.033163,0,0,0,0,0,0,0,0,0,0,0,1,0,0


The regression to classification function:

In [None]:
def myround(x, prec=2, base=0.2):
    return round(base * round(float(x)/base),prec)
    
def num_to_subgrade(num):
    if not pd.isnull(num):
        sub = round((myround(num) - math.floor(num))/0.2)+1
        letter = {
            0:'A',
            1:'B',
            2:'C',
            3:'D',
            4:'E',
            5:'F',
            6:'G',
        }.get(math.floor(num))
        if letter == None:
            if num<0:
                letter = 'A' #if negative num
            elif num>6:
                letter = 'H' #catch class
        return letter+str(sub)
    else:
        return np.nan

## Models

### Baseline Model - Linear Regression (OLS)

In [320]:
X_train = sm.add_constant(x_train)
X_test = sm.add_constant(x_test)
model_OLS = OLS(y_train, X_train.astype(float)).fit()

In [321]:
OLS_train_pred = model_OLS.predict(X_train)
OLS_test_pred = model_OLS.predict(X_test)

In [347]:
score_OLS_train = r2_score(y_train,OLS_train_pred)
score_OLS_test = r2_score(y_test, OLS_test_pred)
acc_OLS_train = accuracy_score(y_train_class,OLS_train_pred.apply(num_to_subgrade))
acc_OLS_test = accuracy_score(y_test_class, OLS_test_pred.apply(num_to_subgrade))
print('OLS regression score on the training set is %.6f'%score_OLS_train)
print('OLS regression score on the test set is %.6f'%score_OLS_test)
print('OLS classification accuracy on the training set is %.6f'%acc_OLS_train)
print('OLS classification accuracy on the test set is %.6f'%acc_OLS_test)

OLS regression score on the training set is 0.484824
OLS regression score on the test set is 0.479960
OLS classification accuracy on the training set is 0.081235
OLS classification accuracy on the test set is 0.081606


In [260]:
mpl.rcParams['agg.path.chunksize'] = 10000
plt.scatter(OLS_test_pred)

TypeError: scatter() missing 1 required positional argument: 'y'

### Regularization Linear Models - Lasso and Ridge

In [323]:
model_Lasso = LassoCV(cv=5, random_state=9001).fit(x_train,y_train)

In [327]:
Lasso_train_pred = model_Lasso.predict(x_train)
Lasso_test_pred = model_Lasso.predict(x_test)

In [341]:
score_Lasso_train = model_Lasso.score(x_train,y_train)
score_Lasso_test = model_Lasso.score(x_test,y_test)
acc_Lasso_train = accuracy_score(y_train_class,[num_to_subgrade(x) for x in Lasso_train_pred])
acc_Lasso_test = accuracy_score(y_test_class,[num_to_subgrade(x) for x in Lasso_test_pred])

In [349]:
print('Lasso regression score on the training set is %.6f'%score_Lasso_train)
print('Lasso regression score on the test set is %.6f'%score_Lasso_test)
print('Lasso classification accuracy on the training set with Lasso is %.6f'%acc_Lasso_train)
print('Lasso classification accuracy on the test set is %.6f'%acc_Lasso_test)

Lasso regression score on the training set is 0.208287
Lasso regression score on the test set is 0.186019
Lasso classification accuracy on the training set with Lasso is 0.063976
Lasso classification accuracy on the test set is 0.063164


In [351]:
model_Ridge = RidgeCV(cv=5).fit(x_train,y_train)

Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number1.346162e-17
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number8.901389e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number8.772169e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number8.738521e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number8.559405e-18
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number8.896042e-17
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not guaranteed to be accurate.
Reciprocal condition number8.771335e-17
  overwrite_a=True).T
Ill-conditioned matrix detected. Result is not g

In [352]:
Ridge_train_pred = model_Ridge.predict(x_train)
Ridge_test_pred = model_Ridge.predict(x_test)

In [353]:
score_Ridge_train = model_Ridge.score(x_train,y_train)
score_Ridge_test = model_Ridge.score(x_test,y_test)
acc_Ridge_train = accuracy_score(y_train_class,[num_to_subgrade(x) for x in Ridge_train_pred])
acc_Ridge_test = accuracy_score(y_test_class,[num_to_subgrade(x) for x in Ridge_test_pred])

In [354]:
print('Ridge regression score on the training set is %.6f'%score_Ridge_train)
print('Ridge regression score on the test set is %.6f'%score_Ridge_test)
print('Ridge classification accuracy on the training set with Ridge is %.6f'%acc_Ridge_train)
print('Ridge classification accuracy on the test set is %.6f'%acc_Ridge_test)

Ridge regression score on the training set is 0.484824
Ridge regression score on the test set is 0.479960
Ridge classification accuracy on the training set with Ridge is 0.081235
Ridge classification accuracy on the test set is 0.081606


### Decision Tree

In [371]:
depths = {'max_depth' : [5,10,15,20,25]}

In [372]:
model_DT_grid  = GridSearchCV(DecisionTreeClassifier(), depths, cv = 5)
model_DT_grid.fit(x_train, y_train_class)

model_DT = DecisionTreeClassifier(max_depth=model_DT_grid.best_params_['max_depth'])
model_DT.fit(x_train, y_train_class)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=15,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [374]:
print("Best DT had max_depth: %i"%model_DT_grid.best_params_['max_depth'])
acc_model_DT_train = model_DT.score(x_train, y_train_class)
print('Accuracy on training set for Decision Tree classifier is %f' %acc_model_DT_train)
acc_model_DT_test = model_DT.score(x_test, y_test_class)
print('Accuracy on test set for Decision Tree classifier is %f' %acc_model_DT_test)

model_DT_sig_feats = np.argsort(model_DT.feature_importances_)[::-1]
print('\nSignificant predictors are: ')

for i in range(10):
    print(x_test.columns[model_DT_sig_feats[i]])

print('\nTotal Number of Predictors is %d' %np.sum(model_DT.feature_importances_> 0))

Best DT had max_depth: 15
Accuracy on training set for Decision Tree classifier is 0.199871
Accuracy on test set for Decision Tree classifier is 0.145280

Significant predictors are: 
fico_avg
issue_d
loan_amnt
term
revol_util
dti
annual_inc
cr_line_hist
revol_bal
open_acc

Total Number of Predictors is 98


In [379]:
model_DTR_grid  = GridSearchCV(DecisionTreeRegressor(), depths, cv = 5)
model_DTR_grid.fit(x_train, y_train)

model_DTR = DecisionTreeRegressor(max_depth=model_DTR_grid.best_params_['max_depth'])
model_DTR.fit(x_train, y_train)

DecisionTreeRegressor(criterion='mse', max_depth=15, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=None, splitter='best')

In [382]:
print("Best DTR had max_depth: %i"%model_DTR_grid.best_params_['max_depth'])
DTR_train_pred = model_DTR.predict(x_train)
acc_model_DTR_train = accuracy_score(y_train_class,[num_to_subgrade(x) for x in DTR_train_pred])
print('Accuracy on training set for Decision Tree classifier is %f' %acc_model_DTR_train)
DTR_test_pred = model_DTR.predict(x_test)
acc_model_DTR_test = accuracy_score(y_test_class,[num_to_subgrade(x) for x in DTR_test_pred])
print('Accuracy on test set for Decision Tree classifier is %f' %acc_model_DTR_test)

model_DTR_sig_feats = np.argsort(model_DTR.feature_importances_)[::-1]
print('\nSignificant predictors are: ')

for i in range(10):
    print(x_test.columns[model_DTR_sig_feats[i]])

print('\nTotal Number of Predictors is %d' %np.sum(model_DTR.feature_importances_> 0))

Best DTR had max_depth: 15
Accuracy on training set for Decision Tree classifier is 0.102237
Accuracy on test set for Decision Tree classifier is 0.090330

Significant predictors are: 
fico_avg
term
dti
inq_last_6mths
issue_d
annual_inc
purpose_credit_card
loan_amnt
loan_status_Current
revol_util

Total Number of Predictors is 97


### Random Forest

In [None]:
params = {'max_depth' : [10,12,15,17,20], 'n_estimators': [15,30,40,45,50]}

In [None]:
model_RF_grid  = GridSearchCV(RandomForestClassifier(), params, cv = 5)
model_RF_grid.fit(x_train, y_train_class)

In [None]:
model_RF = RandomForestClassifier(n_estimators=model_RF_grid.best_params_['n_estimators'],
                                  max_depth=model_RF_grid.best_params_['max_depth'])
model_RF.fit(x_train, y_train_class)

In [None]:
print("Best RF had max_depth: %i"%model_RF_grid.best_params_['max_depth'])
acc_model_RF_train = model_RF.score(x_train, y_train_class)
print('Accuracy on training set for Decision Tree classifier is %f' %acc_model_RF_train)
acc_model_RF_test = model_RF.score(x_test, y_test_class)
print('Accuracy on test set for Decision Tree classifier is %f' %acc_model_RF_test)

model_RF_sig_feats = np.argsort(model_RF.feature_importances_)[::-1]
print('\nSignificant predictors are: ')

for i in range(10):
    print(x_test.columns[model_RF_sig_feats[i]])

print('\nTotal Number of Predictors is %d' %np.sum(model_RF.feature_importances_> 0))