# Lending Club Data Cleaning Code

Part 1 of 3:
This code takes Lending Club data and cleans it for the purpose of modeling; this includes capping and flooring, imputing, and eliminating irrelevant columns. At the end of the code there are reviews of the final output to make sure we have everything we need for future scoring (Part 3) and that the build data seems to make sense. Notebook developed by Tim Horan.

### Import necessary Python Packages

In [1]:
import pandas as pd
import time
import datetime
import numpy as np 
import json

from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook

### Create DF from downloaded CSVs

In [2]:
# Raw historic Lending Club exports, one CSV per vintage; they are loaded and combined further below
# (download from https://www.lendingclub.com/info/download-data.action)

dataSetA = 'LoanStats3a_securev1.csv' #2007-2011
dataSetB = 'LoanStats3b_securev1.csv' #2012-2013
dataSetC = 'LoanStats3c_securev1.csv' #2014
dataSetD = 'LoanStats3d_securev1.csv' #2015

#Loader for a single raw Lending Club CSV export
def importCSVLendingClub(dataset, headerRows, footerRows, index):
    """
    Read one raw Lending Club historic performance CSV into a dataframe.

    Input: Path of the raw CSV, the number of leading rows to skip, the number of
    trailing rows to skip, and the column to use as the dataframe index
    Output: Dataframe indexed by `index` with the header and footer rows dropped
    """
    # skipfooter requires the python parsing engine
    readOptions = {
        'sep': ',',
        'index_col': index,
        'skiprows': headerRows,
        'skipfooter': footerRows,
        'engine': 'python',
    }
    return pd.read_csv(dataset, **readOptions)

#Load every vintage with 'id' as the index, skipping 1 header row and 2 footer rows
df_a, df_b, df_c, df_d = [
    importCSVLendingClub(rawFile, 1, 2, 'id')
    for rawFile in (dataSetA, dataSetB, dataSetC, dataSetD)
]

#Stack the 4 dataframes, drop fully duplicated rows, and keep only rows whose 'id' index is not null
df = pd.concat([df_a, df_b, df_c, df_d]).drop_duplicates()
df = df[df.index.notnull()]

### Add date data format and make all dates relative to appropriate date (mostly relative to issue date)

In [3]:
# Define function to transform and compare two dates and output the number of months between them
def timeBetweenDatesM(timeZero, timeCompare):
    """
    Number of calendar months from timeZero to timeCompare.

    Input: Two date fields as pandas Series of "%b-%Y" strings such as "Feb-2016"
    (make sure the fields can be converted with the "to_datetime" function before using)
    Output: A float Series with the whole number of calendar months between the two
    dates; negative when timeCompare is the earlier date, NaN when either date is missing
    """
    timeZeroF = pd.to_datetime(timeZero, format="%b-%Y")
    timeCompareF = pd.to_datetime(timeCompare, format="%b-%Y")
    # Both dates parse to the first day of their month, so year/month arithmetic is exact.
    # Dividing a day count by an average month length (what a 'timedelta64[M]' cast does)
    # rounds short months down (Feb -> Mar is 28 days, i.e. 0 months), and newer pandas
    # versions reject that cast altogether.
    yearGap = timeCompareF.dt.year - timeZeroF.dt.year
    monthGap = timeCompareF.dt.month - timeZeroF.dt.month
    return (yearGap * 12 + monthGap).astype('float64')

# Reference month ('today') that loan age is measured against; hardcoded here, but
# datetime.date.today().strftime("%b-%Y") could be used instead to take the run date
df['today'] = 'Feb-2016'

# Months from the issue date to each date field, stored as '<field>_R'
for dateField in ['last_pymnt_d', 'earliest_cr_line', 'next_pymnt_d', 'last_credit_pull_d']:
    df[dateField + '_R'] = timeBetweenDatesM(df['issue_d'], df[dateField])
df['loan_age'] = timeBetweenDatesM(df['issue_d'], df['today'])

#Keep loans that are at least 18 months old
df = df[df['loan_age'] >= 18]

### Review CO/Default Data and eliminate out of policy loans

In [4]:
# Flag loans whose status is one of the in policy statuses (1 = in policy, 0 = anything else)
df['inPolicy'] = df['loan_status'].isin([
'Current',
'Fully Paid',
'Charged Off',
'Late (31-120 days)',
'In Grace Period',
'Late (16-30 days)',
'Default'
]).astype('int64')

#Keep only the in policy loans
df = df[df['inPolicy'] == 1]

### Create and Add CO Flag and DQ/CO Flag

In [5]:
# Define charge off function
def chargeOffFlag(loan_status, last_pymnt_d_R, n):
    """
    Flag an account as charged off within an n month window.

    Input: The loan status; the number of months between issue and last payment
    (last_pymnt_d_R), which makes the flag time based; and n, the charge off
    period in months the flag is built for
    Output: 1 when the status is 'Charged Off' or 'Default' and the last payment
    came no later than n-6 months after issue, otherwise 0
    """
    badStatus = loan_status in ('Charged Off', 'Default')
    # A missing (NaN) last payment month never satisfies the comparison, so it yields 0
    return 1 if badStatus and last_pymnt_d_R <= (n - 6) else 0

#Build the 18 month charge off flag for every loan from its status and months to last payment
df['CO18M'] = [
    chargeOffFlag(status, lastPaid, 18)
    for status, lastPaid in zip(df['loan_status'], df['last_pymnt_d_R'])
]

### Fix data type for revol_util and int_rate (object to float)

In [6]:
#Both fields arrive as text with a '%' sign: strip the sign, convert to float, and divide by 100
for pctField in ('int_rate', 'revol_util'):
    df[pctField] = df[pctField].replace('%', '', regex=True).astype('float') / 100

### Drop Columns that we won't have at time of decisioning or judgementally want to exclude

In [7]:
#Columns to drop before modeling: post booking variables (not available at time of decisioning),
#helper columns created earlier in this notebook, and variables excluded on judgement.
#The list is only used for membership tests, so the repeated entries noted below are harmless.
irrelevantVariable = [
'funded_amnt',
'funded_amnt_inv',
'initial_list_status',
'pymnt_plan',
'out_prncp',
'out_prncp_inv',
'total_pymnt',
'total_pymnt_inv',
'total_rec_prncp',
'total_rec_int',
'total_rec_late_fee', 
'recoveries',
'collection_recovery_fee',
'last_pymnt_d',
'last_pymnt_amnt',
'next_pymnt_d', 
'last_credit_pull_d', 
'last_fico_range_high', 
'last_fico_range_low', 
'next_pymnt_d_R', 
'last_pymnt_d_R', 
'last_credit_pull_d_R',
'pymnt_plan', #Repeated entry (already listed above)
'loan_status', #Function of target variable
'today', #Not relevant
'policy_code', #Has only a single case
'inPolicy', #Has only a single case
'earliest_cr_line', #Has a lot of different date values so dropping for now
#Following metrics could be helpful in the future with text analytics
'url',
'desc',
'title',
'emp_title',
'issue_d',
#Following metrics are all nulls as of January 2016
'annual_inc_joint',
'annual_inc_joint', #Repeated entry (already listed on the previous line)
'dti_joint',
'verification_status_joint',
'open_acc_6m',
'open_il_6m',
'open_il_12m',
'open_il_24m',
'mths_since_rcnt_il',
'total_bal_il',
'il_util',
'open_rv_12m',
'open_rv_24m',
'max_bal_bc',
'all_util',
'inq_fi',
'total_fi_tl',
'inq_last_12m',
'total_cu_tl',
'open_act_il',
'revol_bal_joint',
'sec_app_fico_range_low',   
'sec_app_fico_range_high',   
'sec_app_earliest_cr_line',
'sec_app_inq_last_6mths',
'sec_app_mort_acc',
'sec_app_open_acc',
'sec_app_revol_util',
'sec_app_open_act_il',
'sec_app_num_rev_accts',    
'sec_app_chargeoff_within_12_mths',    
'sec_app_collections_12_mths_ex_med',
'sec_app_mths_since_last_major_derog',    
'deferral_term',    
'hardship_amount',    
'hardship_length',    
'hardship_dpd',    
'orig_projected_additional_accrued_interest',    
'hardship_payoff_balance_amount',   
'hardship_last_payment_amount',
'settlement_amount',    
'settlement_percentage',    
'settlement_term',    
'hardship_type',
'hardship_reason',
'hardship_status',
'hardship_start_date',
'hardship_end_date',
'payment_plan_start_date',   
'hardship_loan_status',
'debt_settlement_flag_date',    
'settlement_status',
'settlement_date',
'hardship_flag',
'debt_settlement_flag']

#Keep only the columns that are not on the exclusion list (original column order is preserved)
relevantVariable = [col for col in df.columns if col not in irrelevantVariable]
df = df[relevantVariable]

### Clean remaining variables

In [8]:
#Leveraged initial cap/floor/impute numeric variable code written by Randy C. from Capital One labs as a starting point
#for overall variable cleaning code

def createOneMinusList(df, listStart):
    """
    Input: Dataframe and a list of column names to exclude
    Output: List of the dataframe's columns that are not in listStart, in dataframe column order
    """
    return [col for col in df.columns if col not in listStart]

def cap_and_floor(series, floor=0.01, cap=0.99):
    """
    Impute, floor, and cap a numeric series (typically a column from a dataframe).

    Returns 7 items, in this order:
    1) The final imputed, floored, and capped series
    2) A 1/0 flag series for instances that were below the floor value
    3) A 1/0 flag series for instances that were above the cap value
    4) A 1/0 flag series for instances that were null and imputed
    5) The mean of the series, used as the imputation value
    6) The floor value
    7) The cap value

    Options:
    - "floor" quantile (0 to 1) at which to floor instances
    - "cap" quantile (0 to 1) at which to cap instances
    """
    # Nulls are flagged first, then replaced with the mean of the non-null values
    impute = pd.isnull(series).astype('int64')
    meanVal = series.mean()
    filled = series.fillna(meanVal)

    # Floor: anything not strictly above the floor quantile becomes the floor value
    floorVal = filled.quantile(floor)
    floored = (filled < floorVal).astype('int64')
    raised = filled.where(filled > floorVal, floorVal).astype('float64')

    # Cap: the cap quantile is taken from the already floored series
    capVal = raised.quantile(cap)
    capped = (raised > capVal).astype('int64')
    treated = raised.where(raised < capVal, capVal).astype('float64')

    return treated, floored, capped, impute, meanVal, floorVal, capVal

def char_null_imput(series):
    """
    Impute nulls in an object series (typically a column from a dataframe).

    Returns 3 items:
    1) The series with nulls replaced by the imputation value
    2) A 1/0 flag series for instances that were imputed
    3) The imputation value itself ('Null')
    """
    imputeVal = 'Null'
    # Flag the nulls before they are overwritten
    impute = pd.isnull(series).astype('int64')
    return series.fillna(imputeVal), impute, imputeVal

def cleanDF(df, donotAlterList=None):
    """
    Takes in a dataframe and a list of fields not to alter (this would likely include your target variable
    and variables you wouldn't want to cap/floor/impute; defaults to no fields)

    Outputs
    1) A final cleaned dataframe: the untouched fields, then the numeric fields (capped, floored, imputed)
    followed by their "_cap"/"_floor"/"_imputed" flag columns, then the object fields (nulls imputed)
    followed by their "_imputed" flag columns. All nulls should be imputed unless null fields come from
    the list of fields not to alter
    2) A dictionary that contains the variables treatments (cap/floor/impute values) by column
    """
    # A None default avoids sharing one mutable list object across calls
    if donotAlterList is None:
        donotAlterList = []
    d_treatment = {}
    dfDoNotAlter = df.loc[:, donotAlterList]
    alterList = createOneMinusList(df, donotAlterList)

    # .copy() so the treated values and flag columns are written to an independent frame
    dfNumeric = df.loc[:, alterList].select_dtypes(include=['number']).copy()
    for col in list(dfNumeric.columns):
        series, floored, capped, impute, meanVal, floorVal, capVal = cap_and_floor(dfNumeric[col])
        dfNumeric[col] = series
        dfNumeric[col + "_cap"] = capped
        dfNumeric[col + "_floor"] = floored
        dfNumeric[col + "_imputed"] = impute
        d_treatment[col] = {col + "_cap": capVal, col + "_floor": floorVal, col + "_imputed": meanVal}

    dfObject = df.loc[:, alterList].select_dtypes(include=['object']).copy()
    for col in list(dfObject.columns):
        series, impute, imputeVal = char_null_imput(dfObject[col])
        dfObject[col] = series
        dfObject[col + "_imputed"] = impute
        d_treatment[col] = {col + "_imputed": imputeVal}

    # Every piece was sliced from df and carries df's index, so a column-wise concat lines the rows up
    # without the join_axes argument, which pandas removed from concat. The leading empty frame
    # contributes no columns, only df's index.
    dfFinal = pd.concat([df[[]], dfDoNotAlter, dfNumeric, dfObject], axis=1)
    return dfFinal, d_treatment

#Create a list of variables not to alter in any way (no cap/floor/impute); it includes the
#target variable (CO18M) and loan_age, which the build/validation split reads later
donotAlterList = ['member_id', 'loan_amnt', 'int_rate', 'installment', 'CO18M', 'loan_age']

#Apply above functions and return updated dataframe and treatment dictionary
dfFinal, d_treatment = cleanDF(df, donotAlterList)

### Transform all object variables to mean of target variable

In [9]:
def charCleanUp(seriesName, seriesTarget, dfContainer, d_treatment_charVars):
    """
    Replace every value of a non-numeric column with the mean of the target for that value.

    Input: The name of a non-numeric series in the df, the name of the target variable in the df,
    the df itself, and a dictionary used to capture the variable treatment
    Output: None (the df column and the dictionary are both changed in place)
    """
    #Mean of the target for each unique value, rounded to 4 decimals; one grouped pass
    #replaces a full scan of the dataframe per unique value
    meanDict = dfContainer.groupby(seriesName)[seriesTarget].mean().round(4).to_dict()
    #Create a 'Missing' entry (overall target mean) in case a new value appears during scoring
    meanDict['Missing'] = float(round(dfContainer[seriesTarget].mean(), 4))
    #Apply dictionary on series; anything without a usable mean (e.g. a null value) gets the
    #'Missing' mean rather than stopping the run
    dfContainer[seriesName] = dfContainer[seriesName].map(meanDict).fillna(meanDict['Missing'])
    #Add variable specific dictionary to broader treatment dictionary
    d_treatment_charVars[seriesName] = meanDict

def dfCharCleanUp(df, d_treatment_charVars, seriesTarget, donotAlterList = []):
    """
    Run charCleanUp on every object column of the dataframe that is not in donotAlterList.

    Input: Dataframe, dictionary to capture variables treatments, the series target, and a list of
    columns not to include
    Output: None (the dataframe and the dictionary are changed in place)
    """
    candidates = df.loc[:, createOneMinusList(df, donotAlterList)]
    # The column names are fixed up front, before any column is converted
    objectCols = candidates.select_dtypes(include=['object']).columns
    for col in objectCols:
        charCleanUp(col, seriesTarget, df, d_treatment_charVars)

#Dictionary filled by dfCharCleanUp: column name -> {value: mean of CO18M for that value}
d_treatment_charVars = {}
dfCharCleanUp(dfFinal, d_treatment_charVars, 'CO18M', donotAlterList)

### Capture key variable means by Lending Club Grades to help track significant changes in data or catch errors during the scoring portion of this code

In [10]:
#Temporarily add back the original grade (taken from df, before the mean-of-target transform)
#to collect mean data at the Lending Club Grade group
dfFinal['grade_group'] = df['grade']

In [11]:
def captureVaribleMetrics(df, dictionary, variable, groupByVariable):
    """
    Capture the mean of one column for each group and store it in the dictionary.

    Input: Dataframe, dictionary to capture outputs, target column name, and group by column name
    Output: None (dictionary[variable] is set to a {group: mean} dictionary)
    """
    #Mean level of the variable for each value of the groupByVariable
    groupMeans = df.groupby(groupByVariable)[variable].mean()
    #to_dict pairs each group label with its mean; looking values up with an integer position on
    #this label-indexed result is not supported by newer pandas versions
    dictionary[variable] = groupMeans.to_dict()

#Columns that should not get a mean-by-grade entry
excludeColumns = [
    'member_id',
    'loan_age',
    'grade_group'
]

d_means_by_LC_grades = {}

# Apply definition on select columns: skip the excluded columns and the cap/floor/impute flag columns
for col in dfFinal:
    if col in excludeColumns:
        continue
    if any(tag in col for tag in ('_cap', '_floor', 'impute')):
        continue
    captureVaribleMetrics(dfFinal, d_means_by_LC_grades, col, 'grade_group')

In [12]:
#Remove 'grade_group' from final df (axis passed by keyword; newer pandas rejects a positional axis)
dfFinal = dfFinal.drop('grade_group', axis=1)

### Capture all columns in final dataset and datatype (exclude target variable) - will be leveraged as part of scoring section to make sure all needed variables are captured

In [13]:
#Record the dtype name of every column in the final dataset except the target variable
dTypeDict = {
    col: str(dfFinal[col].dtype)
    for col in dfFinal
    if col != 'CO18M'
}

### Separate data into build and validation samples

In [14]:
def createBuildAndValidationSamples(dfFinal, ageCutOff):
    """
    Split the cleaned data into build and validation samples by loan age.

    Input: The cleaned dataframe (needs a 'loan_age' column) and the age cutoff in months
    Output: Two dataframes - build (loan_age >= ageCutOff+3) and validation
    (ageCutOff <= loan_age < ageCutOff+3, i.e. 3 months of bookings)
    """
    validationCeiling = ageCutOff + 3
    age = dfFinal['loan_age']
    inBuild = age >= validationCeiling
    inValidation = (age >= ageCutOff) & (age < validationCeiling)
    return dfFinal[inBuild], dfFinal[inValidation]

In [15]:
#Generate build and validation samples with an 18 month loan age cutoff
#(validation gets loan_age 18 to 20, build gets loan_age 21 and above)
dfFinalBuild, dfFinalValidation = createBuildAndValidationSamples(dfFinal, 18)

### Save cleaned build and validation datasets and data treatment

In [16]:
#Folder prefix for every output file ('' writes to the current working directory)
writeLocation = ''

#Separator passed by keyword; newer pandas versions deprecate positional to_csv arguments
dfFinalBuild.to_csv(writeLocation+"build_step1.csv", sep=',')
dfFinalValidation.to_csv(writeLocation+"validation_step1.csv", sep=',')

#Save every treatment/review dictionary as JSON for the scoring portion of the code (Part 3)
for jsonName, jsonContent in [
    ('d_treatment_charVars.json', d_treatment_charVars),
    ('d_treatment.json', d_treatment),
    ('d_means_by_LC_grades.json', d_means_by_LC_grades),
    ('dTypeDict.json', dTypeDict),
]:
    with open(writeLocation+jsonName, 'w') as fp:
        json.dump(jsonContent, fp)

### Data cleaning code should pass the below reviews

#### Final Dataset has all numeric columns in preparation for Gradient Boosting

In [17]:
#Collect every field that is not float or int. A for/else would print the pass message even when
#failures were listed, because the else branch runs whenever the loop finishes without a break.
nonNumericFields = [i for i in dfFinal if dfFinal[i].dtype not in ['float', 'int']]
for i in nonNumericFields:
    #The dtype must be converted to text before it can be joined to the field name
    print(i + ' - ' + str(dfFinal[i].dtype))
if not nonNumericFields:
    print('Pass: All fields float or int dtype!')


Pass: All fields float or int dtype!


#### Review all variables that were imputed more than 5% of the time to get comfortable 

In [18]:
#Report the share of imputed rows for every imputation flag column above 5%
for i in dfFinal:
    #Check the name first so the mean is only computed for the flag columns
    if '_imputed' not in i:
        continue
    mean = dfFinal[i].mean()
    if mean > 0.05:
        print('Imputations: ' + i + ' - ' +  str(mean))

Imputations: mths_since_last_delinq_imputed - 0.5496270252292488
Imputations: mths_since_last_record_imputed - 0.8746192384240562
Imputations: mths_since_last_major_derog_imputed - 0.8020066201042112
Imputations: tot_coll_amt_imputed - 0.17824393025134225
Imputations: tot_cur_bal_imputed - 0.17824393025134225
Imputations: total_rev_hi_lim_imputed - 0.17824393025134225
Imputations: acc_open_past_24mths_imputed - 0.12480269027520417
Imputations: avg_cur_bal_imputed - 0.1782756053911088
Imputations: bc_open_to_buy_imputed - 0.13261589141762087
Imputations: bc_util_imputed - 0.13314645000871067
Imputations: mo_sin_old_il_acct_imputed - 0.20635297719917856
Imputations: mo_sin_old_rev_tl_op_imputed - 0.17824656984632278
Imputations: mo_sin_rcnt_rev_tl_op_imputed - 0.17824656984632278
Imputations: mo_sin_rcnt_tl_imputed - 0.17824393025134225
Imputations: mort_acc_imputed - 0.12480269027520417
Imputations: mths_since_recent_bc_imputed - 0.13177913980878775
Imputations: mths_since_recent_bc_dlq

#### Have necessary dictionary to capture build capping, flooring, and imputations to review and save for the scoring portion of the code (Part 3) 

In [19]:
#Print each treated column followed by its cap/floor/impute values
for fieldName, treatment in d_treatment.items():
    print(fieldName)
    print(treatment)
    print()

total_rev_hi_lim
{'total_rev_hi_lim_cap': 123000.0, 'total_rev_hi_lim_imputed': 29782.106916057164, 'total_rev_hi_lim_floor': 3400.0}

verification_status
{'verification_status_imputed': 'Null'}

mort_acc
{'mort_acc_floor': 0.0, 'mort_acc_imputed': 1.846621326135147, 'mort_acc_cap': 8.0}

total_il_high_credit_limit
{'total_il_high_credit_limit_imputed': 36811.90940161057, 'total_il_high_credit_limit_cap': 174095.55, 'total_il_high_credit_limit_floor': 0.0}

num_bc_sats
{'num_bc_sats_imputed': 4.652095168805436, 'num_bc_sats_cap': 12.0, 'num_bc_sats_floor': 1.0}

fico_range_high
{'fico_range_high_cap': 799.0, 'fico_range_high_imputed': 701.1033295851084, 'fico_range_high_floor': 664.0}

dti
{'dti_cap': 33.77, 'dti_imputed': 16.890464146381376, 'dti_floor': 1.61}

mo_sin_rcnt_tl
{'mo_sin_rcnt_tl_imputed': 8.505989033756371, 'mo_sin_rcnt_tl_floor': 0.0, 'mo_sin_rcnt_tl_cap': 45.0}

purpose
{'purpose_imputed': 'Null'}

mo_sin_rcnt_rev_tl_op
{'mo_sin_rcnt_rev_tl_op_floor': 0.0, 'mo_sin_rcnt

#### Have necessary dictionary to capture Char treatment to also leverage in Part 3

In [20]:
#Variables with fewer than 20 entries: print the entry count and the full mapping
for fieldName, mapping in d_treatment_charVars.items():
    if len(mapping) >= 20:
        continue
    print(fieldName + ' - ' + str(len(mapping)))
    print(mapping)
    print()

#Variables with 20 or more entries: print the entry count only
for fieldName, mapping in d_treatment_charVars.items():
    if len(mapping) < 20:
        continue
    print(fieldName + ' - ' + str(len(mapping)))
    print()

purpose - 15
{'vacation': 0.0631, 'wedding': 0.0508, 'home_improvement': 0.0457, 'house': 0.0592, 'debt_consolidation': 0.0535, 'moving': 0.0862, 'major_purchase': 0.0453, 'renewable_energy': 0.0816, 'educational': 0.0585, 'car': 0.0399, 'credit_card': 0.0392, 'small_business': 0.1062, 'Missing': 0.0517, 'other': 0.0711, 'medical': 0.0711}

grade - 8
{'D': 0.0764, 'G': 0.1469, 'F': 0.1253, 'Missing': 0.0517, 'E': 0.0971, 'C': 0.054, 'B': 0.0341, 'A': 0.017}

home_ownership - 6
{'NONE': 0.0444, 'RENT': 0.0608, 'OTHER': 0.0625, 'Missing': 0.0517, 'MORTGAGE': 0.0441, 'OWN': 0.0539}

verification_status - 4
{'Not Verified': 0.0429, 'Missing': 0.0517, 'Verified': 0.0563, 'Source Verified': 0.0551}

term - 3
{' 60 months': 0.0682, 'Missing': 0.0517, ' 36 months': 0.0457}

emp_length - 13
{'Missing': 0.0517, 'Null': 0.0676, '8 years': 0.0513, '1 year': 0.0543, '2 years': 0.0541, '3 years': 0.0516, '6 years': 0.0549, '4 years': 0.0529, '7 years': 0.052, '10+ years': 0.0452, '5 years': 0.0515, 

#### Review mean dictionary also to gut check new scoring datasets (to be incorporated into Part 3)

In [21]:
#Print each variable followed by its mean for every Lending Club grade
for fieldName, gradeMeans in d_means_by_LC_grades.items():
    print(fieldName)
    print(gradeMeans)
    print()

total_rev_hi_lim
{'D': 25554.95755221656, 'G': 27375.80085925098, 'F': 24965.863588767614, 'E': 25895.504961976796, 'C': 26871.231908690173, 'B': 29233.974430077134, 'A': 38964.59595787266}

verification_status
{'D': 0.05259049672887542, 'G': 0.055361925009661556, 'F': 0.05470652393641574, 'E': 0.05401691959389805, 'C': 0.05214694996029431, 'B': 0.050693643091334804, 'A': 0.0500547311267541}

loan_amnt
{'D': 15024.651885954285, 'G': 21292.413993042133, 'F': 18539.881007075255, 'E': 17725.704835685734, 'C': 14228.875595710882, 'B': 12899.66171853939, 'A': 13183.257829178503}

mort_acc
{'D': 1.673289273791782, 'G': 1.841581044728715, 'F': 1.6518593012250773, 'E': 1.7383026426699408, 'C': 1.7919836156046804, 'B': 1.808675232381551, 'A': 2.1237055140010135}

total_il_high_credit_limit
{'D': 35982.491146299704, 'G': 40937.70269855061, 'F': 37326.32136401208, 'E': 37307.9895804746, 'C': 35123.91867189425, 'B': 34910.02246973629, 'A': 39895.76236730003}

num_bc_sats
{'D': 4.507098904678877, '

#### Review Field types in final dataset to make sure scoring datasets will behave correctly (to be incorporated into Part 3)

In [22]:
#List the fields by dtype family: integers first, then floats, then everything else
print('Integers:')
for fieldName, typeName in dTypeDict.items():
    if 'int' in typeName:
        print(fieldName + ' - ' + typeName)
print()
print('Float:')
for fieldName, typeName in dTypeDict.items():
    if 'float' in typeName:
        print(fieldName + ' - ' + typeName)
print()
print('Other:')
for fieldName, typeName in dTypeDict.items():
    if not ('float' in typeName or 'int' in typeName):
        print(fieldName + ' - ' + typeName)

Integers:
pub_rec_imputed - int64
pct_tl_nvr_dlq_floor - int64
tot_hi_cred_lim_imputed - int64
num_il_tl_floor - int64
annual_inc_imputed - int64
open_acc_floor - int64
delinq_amnt_imputed - int64
mths_since_last_delinq_imputed - int64
percent_bc_gt_75_cap - int64
tot_hi_cred_lim_cap - int64
num_tl_90g_dpd_24m_cap - int64
bc_open_to_buy_floor - int64
num_accts_ever_120_pd_imputed - int64
sub_grade_imputed - int64
num_tl_op_past_12m_cap - int64
avg_cur_bal_floor - int64
mort_acc_floor - int64
pub_rec_bankruptcies_imputed - int64
mo_sin_old_rev_tl_op_imputed - int64
revol_bal_floor - int64
num_op_rev_tl_cap - int64
acc_now_delinq_cap - int64
total_il_high_credit_limit_imputed - int64
mo_sin_rcnt_rev_tl_op_imputed - int64
num_rev_tl_bal_gt_0_cap - int64
term_imputed - int64
num_bc_sats_floor - int64
acc_open_past_24mths_cap - int64
bc_util_cap - int64
mths_since_recent_revol_delinq_imputed - int64
inq_last_6mths_cap - int64
num_tl_op_past_12m_imputed - int64
num_rev_tl_bal_gt_0_floor - in

#### Graphically review target and Lending Club grade to make sure expected sloping is seen

In [23]:
# Have Bokeh outputs occur in the notebook (must run before the show() calls below)
output_notebook()

In [24]:
#Temporarily add back the original grade (taken from df) to help review the data by grade
dfFinal['LC_Grades'] = df['grade']

In [25]:
#Per Lending Club grade: mean of CO18M and number of rows, read below as view['CO18M']['mean'] and view['CO18M']['size']
view = dfFinal.groupby('LC_Grades').agg({'CO18M':['mean','size']})

In [26]:
#Two charts over the grades in view: p1 is a bar chart of the mean CO18M per grade,
#p2 is a line chart of the number of loans per grade in thousands
# NOTE(review): plot_width/plot_height are believed to be renamed to width/height in newer Bokeh releases - confirm against the installed version
p1 = figure(x_range=list(view.index.values), title="18M CO% by Lending Club Grades", plot_width=600, plot_height=300)
p2 = figure(x_range=list(view.index.values), title="Number of Loans by Lending Club Grade(k)", plot_width=600, plot_height=300)
p1.vbar(x=list(view.index.values), top=list(view['CO18M']['mean'].values), width=0.9)
p2.line(x=list(view.index.values), y=list((view['CO18M']['size']/1000).values))
show(p1)
show(p2)

In [27]:
#Remove 'LC_Grades' from final df (axis passed by keyword; newer pandas rejects a positional axis)
dfFinal = dfFinal.drop('LC_Grades', axis=1)