# Lending Club Scoring Code

Part 3 of 3: This code should take the model from part 2 and score a group of new accounts for investment. This code mainly focuses on appropriately cleaning the variables in order to score. Tests throughout will make sure their are not dramatic data differences between the build/validation sample compared to newly scored candidates. Written by Tim Horan.

### Import necessary Python Packages

In [1]:
import pandas as pd
import time
import datetime
import numpy as np 
import json

from sklearn.externals import joblib

from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook

### Create DF of potential investments from downloaded CSVs

### Download dictionaries to transform data

In [2]:
prospects = pd.io.parsers.read_csv('primaryMarketNotes_browseNotes_1-RETAIL_20190210.csv', sep=',', index_col=False)
prospects = prospects.set_index(['id'])
prospects.head()

Unnamed: 0_level_0,member_id,loan_amnt,funded_amnt,term,int_rate,exp_default_rate,service_fee_rate,installment,grade,sub_grade,...,sec_app_earliest_cr_line,sec_app_inq_last_6mths,sec_app_mort_acc,sec_app_open_acc,sec_app_revol_util,sec_app_open_act_il,sec_app_num_rev_accts,sec_app_chargeoff_within_12_mths,sec_app_collections_12_mths_ex_med,sec_app_mths_since_last_major_derog
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
147097089,174847575,30000.0,27600.0,36,10.72,4.53,0.99,978.19,B,B2,...,,,,,,,,,,
146104068,172789083,35000.0,32650.0,36,7.02,1.8,0.84,1081.02,A,A2,...,,,,,,,,,,
147010891,174491813,27000.0,20475.0,36,10.72,4.53,0.99,880.37,B,B2,...,,,,,,,,,,
146318450,173317157,10000.0,8175.0,36,13.56,7.25,1.16,339.65,C,C1,...,07-14-2008 17:00:00,0.0,2.0,6.0,16.9,0.0,9.0,0.0,0.0,
147134449,175007260,22500.0,19000.0,60,11.8,4.84,0.71,498.23,B,B4,...,,,,,,,,,,


In [3]:
writeLocation = ''

with open(writeLocation+'d_treatment_charVars.json', 'r') as fp:
    d_treatment_charVars = json.load(fp)
    
with open(writeLocation+'d_treatment.json', 'r') as fp:
    d_treatment = json.load(fp)

with open(writeLocation+'d_means_by_LC_grades.json', 'r') as fp:
    d_means_by_LC_grades = json.load(fp)
    
with open(writeLocation+'dTypeDict.json', 'r') as fp:
    dTypeDict = json.load(fp)

### Add date fields

In [4]:
# Converting to a Date in Words - Year in Numbers format (Jan-2015)
def timeBetweenDatesM(timeZero, timeCompare):
    timeZeroF = pd.to_datetime(timeZero,format="%b-%Y")
    timeCompareF = pd.to_datetime(timeCompare,format="%b-%Y")
    return (timeCompareF - timeZeroF).astype('timedelta64[M]')

# Add column for today
prospects['today'] = datetime.date.today().strftime("%b-%Y")

#Fix earliest_cr_line varialbe
prospects['earliest_cr_line'] = pd.to_datetime(prospects['earliest_cr_line'])
prospects['earliest_cr_line'] = prospects['earliest_cr_line'].apply(lambda x: x.strftime("%b-%Y"))

# Apply definition
prospects['earliest_cr_line_R'] = timeBetweenDatesM(prospects['today'], prospects['earliest_cr_line'])

#Add a 0 loan_age field
prospects['loan_age'] = 0

### Fix data type (object to float)

In [5]:
prospects['revol_util'] = prospects['revol_util'].apply(lambda x: None if x == ' ' else x)
prospects['int_rate'] = prospects['int_rate'].apply(lambda x: None if x == ' ' else x)
prospects['mths_since_recent_revol_delinq'] = prospects['mths_since_recent_revol_delinq'].apply(lambda x: None if x == ' ' else x)
prospects['percent_bc_gt_75'] = prospects['percent_bc_gt_75'].apply(lambda x: None if x == ' ' else x)
prospects['bc_open_to_buy'] = prospects['bc_open_to_buy'].apply(lambda x: None if x == ' ' else x)
prospects['mths_since_recent_bc'] = prospects['mths_since_recent_bc'].apply(lambda x: None if x == ' ' else x)
prospects['num_tl_120dpd_2m'] = prospects['num_tl_120dpd_2m'].apply(lambda x: None if x == ' ' else x)
prospects['mths_since_recent_bc_dlq'] = prospects['mths_since_recent_bc_dlq'].apply(lambda x: None if x == ' ' else x)
prospects['bc_util'] = prospects['bc_util'].apply(lambda x: None if x == ' ' else x)
prospects['mo_sin_old_il_acct'] = prospects['mo_sin_old_il_acct'].apply(lambda x: None if x == ' ' else x)
prospects['mths_since_recent_inq'] = prospects['mths_since_recent_inq'].apply(lambda x: None if x == ' ' else x)

In [6]:
prospects['revol_util'] = prospects.revol_util.astype('float')/100
prospects['int_rate'] = prospects.int_rate.astype('float')/100
prospects['mths_since_recent_revol_delinq'] = prospects.mths_since_recent_revol_delinq.astype('float')
prospects['percent_bc_gt_75'] = prospects.percent_bc_gt_75.astype('float')
prospects['bc_open_to_buy'] = prospects.bc_open_to_buy.astype('float')
prospects['mths_since_recent_bc'] = prospects.mths_since_recent_bc.astype('float')
prospects['num_tl_120dpd_2m'] = prospects.num_tl_120dpd_2m.astype('float')
prospects['mths_since_recent_bc_dlq'] = prospects.mths_since_recent_bc_dlq.astype('float')
prospects['bc_util'] = prospects.bc_util.astype('float')
prospects['mo_sin_old_il_acct'] = prospects.mo_sin_old_il_acct.astype('float')
prospects['mths_since_recent_inq'] = prospects.mths_since_recent_inq.astype('float')

### Fix verification_status field

In [7]:
print(prospects['is_inc_v'].value_counts())

prospects['verification_status'] = prospects['is_inc_v']

print(prospects['verification_status'].value_counts())

Not Verified       64
Source Verified    48
Verified           12
Name: is_inc_v, dtype: int64
Not Verified       64
Source Verified    48
Verified           12
Name: verification_status, dtype: int64


### Fix fields that should be numeric

In [8]:
print(prospects['mths_since_last_delinq'].describe(include = 'all'))
print() 
print(prospects['mths_since_last_record'].describe(include = 'all'))
print()
print(prospects['mths_since_last_major_derog'].describe(include = 'all'))

prospects['mths_since_last_delinq'] = prospects['mths_since_last_delinq'].apply(lambda x: None if x == ' ' else x)
prospects['mths_since_last_delinq'] = prospects.mths_since_last_delinq.astype('float')

prospects['mths_since_last_record'] = prospects['mths_since_last_record'].apply(lambda x: None if x == ' ' else x)
prospects['mths_since_last_record'] = prospects.mths_since_last_record.astype('float')

prospects['mths_since_last_major_derog'] = prospects['mths_since_last_major_derog'].apply(lambda x: None if x == 'null' else x)
prospects['mths_since_last_major_derog'] = prospects.mths_since_last_major_derog.astype('float')

print()
print(prospects['mths_since_last_delinq'].describe(include = 'all'))
print()
print(prospects['mths_since_last_record'].describe(include = 'all'))
print()
print(prospects['mths_since_last_major_derog'].describe(include = 'all'))

count     124
unique     36
top          
freq       60
Name: mths_since_last_delinq, dtype: object

count     124
unique     12
top          
freq      112
Name: mths_since_last_record, dtype: object

count    39.000000
mean     42.179487
std      21.731461
min       8.000000
25%      23.500000
50%      46.000000
75%      58.500000
max      81.000000
Name: mths_since_last_major_derog, dtype: float64

count    64.000000
mean     28.703125
std      18.106983
min       2.000000
25%      16.750000
50%      25.500000
75%      38.000000
max      78.000000
Name: mths_since_last_delinq, dtype: float64

count     12.00000
mean      89.50000
std       21.33499
min       48.00000
25%       75.75000
50%       91.00000
75%      104.75000
max      118.00000
Name: mths_since_last_record, dtype: float64

count    39.000000
mean     42.179487
std      21.731461
min       8.000000
25%      23.500000
50%      46.000000
75%      58.500000
max      81.000000
Name: mths_since_last_major_derog, dtype: float

### Convert term variable back to string

In [9]:
print(prospects['term'].value_counts())

def termClean(term):
    if term == 36:
        return '36 months'
    elif term == 60:
        return '60 months'

prospects['term'] = prospects.apply(lambda row: termClean(row['term']), axis=1)

print()
print(prospects['term'].value_counts())

36    99
60    25
Name: term, dtype: int64

36 months    99
60 months    25
Name: term, dtype: int64


### Fix purpose field based on wording changes in scoring dataset

In [10]:
print(prospects['purpose'].value_counts())

def purposeClean(purpose):
    if purpose == 'Debt consolidation':
        return 'debt_consolidation'
    elif purpose == 'Credit card refinancing':
        return 'credit_card'
    elif purpose == 'Home improvement':
        return 'home_improvement'
    elif purpose == 'Other':
        return 'other'
    elif purpose == 'Major purchase':
        return 'major_purchase'
    elif purpose == 'Business':
        return 'small_business' 
    elif purpose == 'Medical expenses':
        return 'medical'
    elif purpose == 'Vacation':
        return 'vacation'
    elif purpose == 'Car financing':
        return 'car'
    elif purpose == 'Moving and relocation':
        return 'moving'    

prospects['purpose'] = prospects.apply(lambda row: purposeClean(row['purpose']), axis=1)

print()
print(prospects['purpose'].value_counts())

Debt consolidation         75
Credit card refinancing    27
Other                       6
Home improvement            4
Major purchase              3
Moving and relocation       2
Home buying                 2
Medical expenses            2
Business                    2
Car financing               1
Name: purpose, dtype: int64

debt_consolidation    75
credit_card           27
other                  6
home_improvement       4
major_purchase         3
small_business         2
moving                 2
medical                2
car                    1
Name: purpose, dtype: int64


### Drop Unnecessary Fields

In [11]:
keys = dTypeDict.keys()
baseVariables = []

for i in keys:
    if '_imputed' not in i and '_cap' not in i and '_floor' not in i:
        baseVariables.append(i)

baseVariables
prospects = prospects[baseVariables]

### Clean data leveraging dictionaries built in part 1

In [12]:
def createOneMinusList(df, listStart):
    """
    Input: Dataframe and a list that we will bump against the df to find the one minus of the df columns
    Output: List of variables that is the one minus of the initial list
    """
    oneMinusList = []
    for col in df.columns:
        if col not in listStart:
            oneMinusList.append(col)
    return oneMinusList

def apply_cap_and_floor(series, d_treatment):
    """
    Input: Numberic series (typically column from dataframe) and a dictionary of treatments for that column
    Output: Returns 4 series
    1) The final capped, floored, and imputed initial series - 'series'
    2) A 1/0 flag for when an instance was floored - 'floored'
    3) A 1/0 flag for when an instance was capped - 'capped'
    4) A 1/0 flag for when an instance was imputed - 'imputed'
    """
    dictionary = d_treatment[series.name]
    impute = (pd.isnull(series)).apply(int)
    meanVal = dictionary[series.name+str("_imputed")]
    series = series.fillna(meanVal)
    
    floorVal = dictionary[series.name+str("_floor")]
    floored = (series < floorVal).apply(int)
    series = series.apply(lambda x: x if x > floorVal else floorVal)
    
    capVal = dictionary[series.name+str("_cap")]
    capped = (series > capVal).apply(int)
    series = series.apply(lambda x: x if x < capVal else capVal)
    
    return series, floored, capped, impute

def apply_char_null_imput(series, d_treatment):
    """
    Input: Takes in a an object series (typically column from dataframe) and a dictionary of treatments for 
    that column
    Output: Returns 2 series
    1) The final imputed initial series - 'series'
    2) A 1/0 flag for when an instance was imputed - 'imputed'
    """
    dictionary = d_treatment[series.name]
    imputeVal = dictionary[series.name+str('_imputed')]
    impute = (pd.isnull(series)).apply(int)
    series = series.fillna(imputeVal)
    
    return series, impute

def apply_cleanDF(df, d_treatment, donotAlterList = []):
    """
    Input: A dataframe, a dictionary, and a list of fields not to alter (this would likely include your 
    target variable and variables you wouldn't want to cap/floor/impute)
    Output: A final cleaned dataframe all nulls should be imputed unless null fields come from list of fields not to 
    alter
    """
    dfFinal = df[[]]
    dfDoNotAlter = df.loc[:,donotAlterList]
    alterList = createOneMinusList(df, donotAlterList)
    
    dfNumeric = df.loc[:,alterList].select_dtypes(include=['number'])
    for col in dfNumeric.columns:
        series, floor, capped, impute = apply_cap_and_floor(dfNumeric[col], d_treatment)
        dfNumeric[col] = series
        dfNumeric[col+"_cap"] = capped
        dfNumeric[col+"_floor"] = floor
        dfNumeric[col+"_imputed"] = impute
    
    dfObject = df.loc[:,alterList].select_dtypes(include=['object'])
    for col in dfObject.columns:
        series, impute = apply_char_null_imput(dfObject[col], d_treatment)
        dfObject[col] = series
        dfObject[col+"_imputed"] = impute
        
    dfFinal = pd.concat([dfFinal, dfDoNotAlter, dfNumeric, dfObject], axis=1, join_axes=[dfFinal.index])
    return dfFinal

#Create a list of variables not to alter in any way
donotAlterList = []
for e in prospects.columns:
    if e not in d_treatment.keys():
        donotAlterList.append(e)

prospectsFinal = apply_cleanDF(prospects, d_treatment, donotAlterList)

In [13]:
def apply_charCleanUp(seriesName, dfContainer, d_treatment_charVars):    
    """
    Input: A non-numberic series from a df, the df it came from, and a dictionary to provide the necessary treatment
    needed to apply to the series in the df
    Output: None
    """
    meanDict = d_treatment_charVars[seriesName]
    uniqueValue = np.unique(dfContainer[seriesName].values)
    for i in uniqueValue:
        if i not in meanDict:
            meanDict[i] = meanDict['Missing']
    dfContainer[seriesName] = dfContainer.apply(lambda row: meanDict[row[seriesName]], axis=1)

def apply_dfCharCleanUp(df, d_treatment_charVars, donotAlterList = []):
    """
    Input: Dataframe, dictionary to provide variables treatments, and a list of columns not to alter
    Output: None
    """
    alterList = createOneMinusList(df, donotAlterList)     
    dfObject = df.loc[:,alterList].select_dtypes(include=['object'])
    for col in dfObject.columns:
        if col in d_treatment_charVars:
            apply_charCleanUp(col, df, d_treatment_charVars)

apply_dfCharCleanUp(prospectsFinal, d_treatment_charVars, donotAlterList)

### Download and score model

In [14]:
writeLocation = ''

gbaModel = joblib.load(writeLocation+'gbaModel.pkl') 

In [15]:
dTypeDict = {}

for i in prospectsFinal:
    dTypeDict[i] = str(prospectsFinal[i].dtype)

In [16]:
print('Integers:')
for i in dTypeDict:
    if 'int' in dTypeDict[i]:
        print(i + ' - ' + dTypeDict[i])
print()
print('Float:')
for i in dTypeDict:
    if 'float' in dTypeDict[i]:
        print(i + ' - ' + dTypeDict[i])
print()
print('Other:')
for i in dTypeDict:
    if 'float' not in dTypeDict[i] and 'int' not in dTypeDict[i]:
        print(i + ' - ' + dTypeDict[i])

Integers:
pub_rec_bankruptcies_floor - int64
tot_hi_cred_lim_cap - int64
num_bc_sats_cap - int64
mths_since_last_delinq_floor - int64
num_actv_bc_tl_imputed - int64
total_acc_imputed - int64
fico_range_high_cap - int64
mths_since_last_record_floor - int64
delinq_2yrs_floor - int64
total_bal_ex_mort_floor - int64
mo_sin_rcnt_rev_tl_op_floor - int64
num_tl_30dpd_cap - int64
acc_open_past_24mths_cap - int64
total_acc_floor - int64
emp_length_imputed - int64
total_bal_ex_mort_imputed - int64
num_op_rev_tl_floor - int64
earliest_cr_line_R_cap - int64
acc_now_delinq_floor - int64
annual_inc_floor - int64
num_il_tl_floor - int64
application_type_imputed - int64
revol_bal_floor - int64
percent_bc_gt_75_cap - int64
mort_acc_floor - int64
verification_status_imputed - int64
pct_tl_nvr_dlq_imputed - int64
num_rev_accts_cap - int64
open_acc_floor - int64
collections_12_mths_ex_med_imputed - int64
pub_rec_floor - int64
dti_floor - int64
num_rev_tl_bal_gt_0_imputed - int64
num_rev_tl_bal_gt_0_cap - 

In [18]:
exclude = ['member_id']

relevantVariable = []
for i in prospectsFinal.columns:
    if i not in exclude:
        relevantVariable.append(i)

xTest = prospectsFinal[relevantVariable]

prospectsFinal['GB_Score'] = gbaModel.predict_proba(xTest)[:,1]

In [19]:
#Create risk adjusted return metric (Predicted CO/APR)
prospectsFinal['sub_grade_RAR'] = prospectsFinal['sub_grade']/prospectsFinal['int_rate']
prospectsFinal['GB_Score_RAR'] = prospectsFinal['GB_Score']/prospectsFinal['int_rate']

In [20]:
# Have Bokeh outputs occur in notebook
output_notebook()

In [21]:
#Calculate means by sub_grade_group
prospectsFinal['graph_RAR_decile'] = (prospectsFinal['GB_Score_RAR'].argsort().argsort()/float(len(prospectsFinal))*50).astype(int)
graph_prospectsFinal = prospectsFinal.groupby(['graph_RAR_decile'])['sub_grade_RAR','GB_Score_RAR'].mean()

#Define x and y metrics
x = list(graph_prospectsFinal.index)
y2 = list(graph_prospectsFinal['GB_Score_RAR'])
y3 = list(graph_prospectsFinal['sub_grade_RAR'])

#Create graph figure
p = figure(x_range = (0,50), plot_width=900, title="18M CO/Interest Rate by GB Model 50-tiles")

#Add lines to figure
p.line(x,y2, color='green',legend='GB Model/Interest Rate')
p.line(x,y3, color='orange',legend='Lending Club Model/Interest Rate')

#Add labeling to figure
p.xaxis.axis_label = 'GB Model/Interest Rate 50-tiles'
p.yaxis.axis_label = '18M CO/Interest Rate'


#Plot figure
show(p)

### Data cleaning code should pass the below reviews before finalizing investment

In [22]:
#Assign final data frame
finalProspects = prospectsFinal

#### Verify that final table has all needed columns

In [24]:
current = finalProspects.columns
baseMissing = []
baseAvailable = []
capfloorimputeMissing = []
capfloorimputeAvailable = []

print('---- Base Missing Variables  ----')
for i in dTypeDict:
    if i not in current and '_imputed' not in i and '_cap' not in i and '_floor' not in i:
        print(i)
        baseMissing.append(i)
    elif i in current and '_imputed' not in i and '_cap' not in i and '_floor' not in i:
        baseAvailable.append(i)

print()
print('---- Cap/Floor/Impute Missing Variables ----')
for i in dTypeDict:
    if i not in current and ('_imputed' in i or '_cap' in i or '_floor' in i):
        print(i)
        capfloorimputeMissing.append(i)
    elif i in current and ('_imputed' in i or '_cap' in i or '_floor' in i):
        capfloorimputeAvailable.append(i)
print()
print()

if len(baseMissing) == 0 and len(capfloorimputeMissing) == 0:
    print('No Missing Variables!')
else:
    print('Missing Variable Counts')
    print('Base Missing Variables: '+ str(len(baseMissing)))
    print('Cap/Floor/Impute Missing Variables: '+ str(len(capfloorimputeMissing)))

---- Base Missing Variables  ----

---- Cap/Floor/Impute Missing Variables ----


No Missing Variables!


#### Verify that final table datatypes are correct for available fields

In [25]:
baseWrongDtype = []
baseRightDtype = []
capfloorimputeWrongDtype = []
capfloorimputeRightDtype = []

print('---- Base Variables Non Numberic (assuming variable is available) ----')
for i in baseAvailable:
    if 'int' not in str(finalProspects[i].dtype) and 'float' not in str(finalProspects[i].dtype):
        print(i)
        baseWrongDtype.append(i)
    elif 'int' in str(finalProspects[i].dtype) and 'float' in str(finalProspects[i].dtype):
        baseRightDtype.append(i)
print()
print('---- Cap/Floor/Impute Variables Non Numberic (assuming variable is available) ----')
for i in capfloorimputeAvailable:
    if 'int' not in str(finalProspects[i].dtype) and 'float' not in str(finalProspects[i].dtype):
        print(i)
        capfloorimputeWrongDtype.append(i)
    elif 'int' in str(finalProspects[i].dtype) and 'float' in str(finalProspects[i].dtype):
        capfloorimputeRightDtype.append(i)
print()     
if len(baseWrongDtype) == 0 and len(capfloorimputeWrongDtype) == 0:
    print('No wrong dtypes for available fields!')
else:
    print('Wrong dType Counts')
    print('Base Missing Variables: '+ str(len(baseWrongDtype)))
    print('Cap/Floor/Impute Missing Variables: '+ str(len(capfloorimputeWrongDtype)))

---- Base Variables Non Numberic (assuming variable is available) ----

---- Cap/Floor/Impute Variables Non Numberic (assuming variable is available) ----

No wrong dtypes for available fields!


#### Verify that final fields with that are available and have reasonable mean values

In [26]:
#Define funciton to reverse the char treatment dictionary and add back grade chars
def dictReverse (output, dictionary):
    """
    Input: Post treatment output and dictionary build defining the output 
    Output: The dictionary key needed to get the output
    """
    for i in dictionary:
        if dictionary[i] == output:
            return i
#Appl function to the sub_grade variable
prospectsFinal['grade_group'] = prospectsFinal.apply(lambda row: dictReverse(row['grade'],d_treatment_charVars['grade']), axis=1)

In [70]:
#Define a mean comparison function
def meanComparison(diff,dfMean,dictionaryMeans):
    """
    Input: A tolerance of deviation from prospects and the predefined dictionary, the prospects df appropriately grouped
    and averaged, and the mean dictionary
    Output: Returns nothings
    """
    print('Difference greater than ' + str(1+diff))
    print()
    for e in dfMean.columns:
        for i in dfMean.index:
            prospects = round(float(dfMean[dfMean.index == i][e]),4)
            expected = round(dictionaryMeans[e][i],4)
            if abs(prospects) > abs(expected)*(1+diff):
                ratio = round((prospects/expected),4)
                print(e + ' - '+ i + ' - ' + str(ratio))
                print('   Prospects: ' + str(prospects))
                print('   Expected:  ' + str(expected))
                print()
            elif abs(prospects) < abs(expected)*(1-diff):
                ratio = round((prospects/expected),4)
                print(e + ' - '+ i + ' - ' + str(ratio))
                print('   Prospects: ' + str(prospects))
                print('   Expected:  ' + str(expected))
                print()

#Map out varialbes that we should check means on            
include = list(d_means_by_LC_grades.keys())
include.append('grade_group')
include.remove('CO18M')

#Define Tolerance
diff = 0.50
diff2 = 0.25

#Define dataset
prospectsFinalMean = prospectsFinal[include].groupby('grade_group').mean()

#Run Function
meanComparison(diff,prospectsFinalMean,d_means_by_LC_grades)
# meanComparison(diff2,prospectsFinalMean,d_means_by_LC_grades)



Difference greater than 1.5

total_bal_ex_mort - A - 1.5719
   Prospects: 74021.0069
   Expected:  47091.5199

total_bc_limit - B - 1.7085
   Prospects: 33963.3333
   Expected:  19879.0947

num_tl_30dpd - A - 0.0
   Prospects: 0.0
   Expected:  0.0007

num_tl_30dpd - B - 0.0
   Prospects: 0.0
   Expected:  0.0005

num_tl_30dpd - C - 0.0
   Prospects: 0.0
   Expected:  0.0004

num_tl_30dpd - D - 0.0
   Prospects: 0.0
   Expected:  0.0004

num_tl_30dpd - E - 0.0
   Prospects: 0.0
   Expected:  0.0004

total_il_high_credit_limit - A - 1.5176
   Prospects: 60545.1569
   Expected:  39895.7624

tot_coll_amt - C - 1.8169
   Prospects: 174.1613
   Expected:  95.8585

tot_coll_amt - D - 1.506
   Prospects: 147.8562
   Expected:  98.1773

pub_rec - A - 0.0
   Prospects: 0.0
   Expected:  0.0391

pub_rec - B - 0.4883
   Prospects: 0.0667
   Expected:  0.1366

bc_open_to_buy - A - 1.584
   Prospects: 25340.949
   Expected:  15997.9919

bc_open_to_buy - B - 2.4057
   Prospects: 19808.2407
   Expect

### Apply hardcuts to get to final loans

In [56]:
#Capture only loans that have outsized returns compared to Lending Club rating
returnBuffer = 10

prospectsFinal['RAR_Comp'] = (prospectsFinal['GB_Score_RAR'] < (prospectsFinal['sub_grade_RAR']*returnBuffer))

prospectsSelect = prospectsFinal[(prospectsFinal['RAR_Comp'] == True)]

In [57]:
#Join initial character variables to enable hardcuts
prospectsSelectProfile = pd.merge(prospectsSelect, prospects, how = 'left', left_index=True, right_index=True)

In [58]:
#Exclude loans that are not 36 months long, revolving balance >= $25k, loan amount >= $15k
prospectsSelectProfile2 = prospectsSelectProfile[(prospectsSelectProfile['term_y'] == '36 months') & (prospectsSelectProfile['revol_bal_x'] < 25000) & (prospectsSelectProfile['loan_amnt_x'] < 15000)]

In [59]:
#Review profiling of loans that pass cuts
prospectsSelectProfile2[['loan_amnt_x', 'grade_y', 'int_rate_y','GB_Score','sub_grade_x', 'purpose_y', 'term_y', 'annual_inc_y', 'fico_range_high_y', 'revol_bal_y']]

Unnamed: 0_level_0,loan_amnt_x,grade_y,int_rate_y,GB_Score,sub_grade_x,purpose_y,term_y,annual_inc_y,fico_range_high_y,revol_bal_y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
146318450,10000.0,C,0.1356,0.329474,0.0487,debt_consolidation,36 months,20000.0,694,1761.0
147242934,10000.0,B,0.118,0.362797,0.0372,debt_consolidation,36 months,39000.0,684,6484.0
147278803,10000.0,D,0.1797,0.456482,0.0697,medical,36 months,36886.0,669,3949.0
147469664,5000.0,E,0.2534,0.706389,0.1031,moving,36 months,30480.0,664,7798.0
147475553,8000.0,E,0.234,0.428921,0.0836,debt_consolidation,36 months,34000.0,689,7820.0
147475060,6000.0,C,0.1614,0.463457,0.0573,credit_card,36 months,45000.0,699,8405.0
147478215,12000.0,C,0.1356,0.244295,0.0487,debt_consolidation,36 months,42000.0,744,6988.0
147481458,10000.0,C,0.1356,0.3156,0.0487,debt_consolidation,36 months,56542.0,694,6760.0
147403803,8000.0,E,0.2437,0.489358,0.0921,home_improvement,36 months,35000.0,704,21925.0
147474979,12000.0,B,0.1131,0.293783,0.0345,debt_consolidation,36 months,50000.0,699,7131.0


In [60]:
print(list(prospectsSelectProfile2.index))

[146318450, 147242934, 147278803, 147469664, 147475553, 147475060, 147478215, 147481458, 147403803, 147474979, 147358154, 147477987, 147310803, 147092614, 147471956, 147302841, 147346307]
