## Lending Club Model & Deployment

In [13]:
import os

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

### Lending Club Dataset

In [14]:
# The CSV is read relative to the kernel's working directory, so launch the
# notebook from (or `%cd` into) the model directory:
#   ~/repos/yhat/demos/heroku-demos/demo-lending-club/model
# A `! cd ...` shell escape does not help here: it runs in a throwaway
# subshell and leaves the kernel's working directory unchanged.
#
# skiprows=1 drops the first line of the file, so the header row is taken
# from the second line.
df = pd.read_csv("./LoanStats3a.csv", skiprows=1)
df_head = df.head()
df_head

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,apr,int_rate,installment,grade,...,num_il_tl,mo_sin_old_il_acct,num_actv_rev_tl,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,total_rev_hi_lim,num_rev_tl_bal_gt_0,num_op_rev_tl,tot_coll_amt,policy_code
0,54734,80364,25000,25000,19080.057198,36 months,13.62%,11.89%,829.1,B,...,,,,,,,,,,1
1,55742,114426,7000,7000,672.803839,36 months,11.40%,10.71%,228.22,B,...,,,,,,,,,,1
2,57245,138150,1200,1200,1200.0,36 months,16.22%,13.11%,40.5,C,...,,,,,,,,,,1
3,57416,139635,10800,10800,10691.551053,36 months,16.17%,13.57%,366.86,C,...,,,,,,,,,,1
4,58915,153417,7500,5025,557.087228,36 months,10.77%,10.08%,162.34,B,...,,,,,,,,,,1


### Remove records that have too many null values

In [15]:
def is_poor_coverage(row):
    """Return True when a record is populated well enough to keep.

    Despite the name, the result is a *keep* mask: it is True when fewer
    than 80% of the row's fields are null, which is how the callers use it
    to filter the frame.

    Args:
        row: a pandas Series holding one record (one value per column).

    Returns:
        bool: True if the fraction of null fields is below 0.8, else False.
        A row with no fields at all returns False.
    """
    n_fields = len(row)
    if n_fields == 0:
        # Nothing to measure; treat as not keepable instead of dividing by 0.
        return False
    # Divide by the total field count. Series.count() would only count the
    # non-null fields, turning this into a null/non-null ratio and dividing
    # by zero for an all-null row.
    pct_null = float(row.isnull().sum()) / n_fields
    return bool(pct_null < 0.8)

# Keep only the rows for which is_poor_coverage returns True (axis=1 hands
# each row to the function as a Series).
# .copy() detaches the result from the original frame so that columns can be
# added to it later without pandas warning about writing to a slice.
df = df[df.apply(is_poor_coverage, axis=1)].copy()
df.head()

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,apr,int_rate,installment,grade,...,num_il_tl,mo_sin_old_il_acct,num_actv_rev_tl,mo_sin_old_rev_tl_op,mo_sin_rcnt_rev_tl_op,total_rev_hi_lim,num_rev_tl_bal_gt_0,num_op_rev_tl,tot_coll_amt,policy_code
0,54734,80364,25000,25000,19080.057198,36 months,13.62%,11.89%,829.1,B,...,,,,,,,,,,1
1,55742,114426,7000,7000,672.803839,36 months,11.40%,10.71%,228.22,B,...,,,,,,,,,,1
2,57245,138150,1200,1200,1200.0,36 months,16.22%,13.11%,40.5,C,...,,,,,,,,,,1
3,57416,139635,10800,10800,10691.551053,36 months,16.17%,13.57%,366.86,C,...,,,,,,,,,,1
4,58915,153417,7500,5025,557.087228,36 months,10.77%,10.08%,162.34,B,...,,,,,,,,,,1


### Create derived variables

In [16]:
# year_issued: the text before the first "-" in issue_d, parsed as an int.
# assign() returns a new frame, so this never writes into a filtered slice.
df = df.assign(
    year_issued=df.issue_d.apply(lambda x: int(x.split("-")[0]))
)

# Restrict to loans issued before 2012. Copy so the derived columns added to
# df_term afterwards land on an independent frame.
df_term = df[df.year_issued < 2012].copy()

In [17]:
# loan_status values that count as a bad outcome for the is_bad target.
bad_indicators = [
    "Late (16-30 days)",
    "Late (31-120 days)",
    "Default",
    "Charged Off"
]

# Detach from the parent frame before adding columns so the assignments below
# do not write to a slice.
df_term = df_term.copy()
df_term['is_rent'] = df_term.home_ownership == "RENT"
# Vectorised membership test; equivalent to checking each status against the
# list one row at a time.
df_term['is_bad'] = df_term.loan_status.isin(bad_indicators)

### Fit Regression Model

In [18]:
# Predictor columns fed to the model; is_rent is the derived boolean flag.
features = [
    'last_fico_range_low',
    'last_fico_range_high',
    'is_rent',
]

# Logistic regression of the is_bad flag on the predictors above.
glm = LogisticRegression()
glm.fit(df_term[features], df_term['is_bad'])

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [19]:
# Log-probabilities for the first five loans, one column per class.
glm.predict_log_proba(df_term[features].iloc[:5])

array([[ -1.73398053e-03,  -6.35820249e+00],
       [ -2.89878075e-02,  -3.55533886e+00],
       [ -1.43929363e-02,  -4.24820556e+00],
       [ -3.29025301e-02,  -3.43061188e+00],
       [ -2.54960081e-03,  -5.97309300e+00]])

### Converting to "FICO style" Score

In [20]:
def calculate_score(log_odds):
    """Map a log-odds value onto a FICO-style score.

    The score is 300 when ``log_odds`` is 0 and goes up by 40 points every
    time ``log_odds`` falls by ln(2), i.e. every time the odds halve.
    Works on scalars and on numpy arrays alike.
    """
    points_per_ln2 = 40 / np.log(2)
    return 300 + points_per_ln2 * (-log_odds)

In [21]:
# Column 1 of each prediction matrix is taken as the is_bad class.
log_probs = glm.predict_log_proba(df_term[features])[:, 1]
probs = glm.predict_proba(df_term[features])[:, 1]

# NOTE(review): calculate_score names its argument log_odds, but what is
# passed here is the log-probability of column 1 - confirm this is intended.
scores = calculate_score(log_probs)
scores

array([ 666.91788821,  505.17078967,  545.15460388, ...,  323.39840404,
        317.45837188,  323.39840404])

### Deployment to ScienceOps

In [22]:
from yhat import Yhat, YhatModel

class LoanModel(YhatModel):
    """Deployable wrapper that scores one loan application with ``glm``.

    Relies on the notebook-level ``glm``, ``features`` and
    ``calculate_score`` being defined.
    """
    REQUIREMENTS = [
        "openblas"
    ]

    def execute(self, data):
        """Score a single application.

        Args:
            data: mapping with ``home_ownership`` plus every column listed in
                ``features`` other than the derived ``is_rent``. The mapping
                is not modified.

        Returns:
            dict with one-element lists under ``prob_default``,
            ``decline_code`` and ``score``.

        Raises:
            KeyError: if ``home_ownership`` or a required feature is missing.
        """
        # Work on a copy so the derived is_rent flag does not leak back into
        # the caller's dict.
        record = dict(data)
        record['is_rent'] = record['home_ownership'] == "RENT"
        frame = pd.DataFrame([record])[features]

        # Column 1 of the prediction is used as the default probability.
        prob = glm.predict_proba(frame)[0][1]
        # Applications above a 30% default probability are declined.
        if prob > 0.3:
            decline_code = "Credit score too low"
        else:
            decline_code = ""

        output = {
            "prob_default": [prob],
            "decline_code": [decline_code]
        }

        # Convert the log-probability of default into a score.
        # NOTE(review): calculate_score's parameter is named log_odds -
        # confirm that feeding it a log-probability is intended.
        log_prob = glm.predict_log_proba(frame)[0][1]
        score = calculate_score(log_prob)
        output['score'] = [score]
        # Parenthesised single argument prints identically on Python 2 and 3.
        print("OUTPUT FROM LOGS: " + str(output))

        return output

In [23]:
# local test
test = {
    "last_fico_range_low": 705,
    "last_fico_range_high": 732,
    "home_ownership": "MORTGAGE"
}

lm = LoanModel()
lm.execute(test)

OUTPUT FROM LOGS: {'score': [580.02875054481365], 'decline_code': [''], 'prob_default': [0.0078086087117855895]}


{'decline_code': [''],
 'prob_default': [0.0078086087117855895],
 'score': [580.02875054481365]}

In [12]:
# Credentials are read from the environment so that no API key is stored in
# the notebook. Set YHAT_APIKEY (and optionally YHAT_USERNAME) before running;
# a missing YHAT_APIKEY raises KeyError.
# NOTE(review): a key was previously committed in this cell - rotate it.
yh = Yhat(os.environ.get("YHAT_USERNAME", "demo-master"),
          os.environ["YHAT_APIKEY"],
          "https://sandbox.yhathq.com/")
yh.deploy("LendingClub", LoanModel, globals(), True)

extracting model
model specified requirements
requirements automatically detected
 [+] yhat==1.5.0
 [+] numpy==1.11.0
 [+] pandas==0.18.1
model variables
 [+] features <type 'list'> 67.5B
 [+] glm <class 'sklearn.linear_model.logistic.LogisticRegression'> 732.8B


Transfering Model: |                            |  0% ETA:  --:--:--   0.00 B/sTransfering Model: |###                         | 11% ETA:  00:00:00   3.49 K/sTransfering Model: |########################### | 97% ETA:  00:00:00  30.32 K/sTransfering Model: |########################### | 98% ETA:  00:00:00  30.31 K/sTransfering Model: |############################|100% Time: 00:00:00  14.68 K/s


{'message': 'Model successfully uploaded. Your model will begin building momentarily. Please see https://sandbox.yhathq.com/ for more details',
 'status': 'OK'}

In [18]:
# Columns exported as sample requests: the member id, the raw model inputs
# and the loan amount.
columns = [
    "member_id",
    "last_fico_range_low", "last_fico_range_high",
    "home_ownership",
    "loan_amnt",
]
sample_input = df_term[columns]

In [53]:
import sys
import json

# Run every sample loan through the local model and record each request next
# to the response it produced.
sample_io = {"in": [], "out": []}
for _, row in sample_input.iterrows():
    row = row.to_dict()
    # Score a copy so the recorded request holds exactly the exported columns
    # even if execute() adds derived fields to the dict it receives.
    output = lm.execute(dict(row))
    sample_io['in'].append(row)
    sample_io['out'].append(output)

# One JSON string per cell so each request/response fits a single CSV field.
data = pd.DataFrame(sample_io)
data['in'] = data['in'].apply(json.dumps)
data['out'] = data['out'].apply(json.dumps)

# Write to the current user's Downloads folder instead of one specific
# user's home directory.
data.to_csv(os.path.expanduser("~/Downloads/lending-sample.csv"), index=False)

In [None]:
from sklearn.tree import DecisionTreeClassifier
features = ['last_fico_range_low', 'last_fico_range_high', 'is_rent']
glm = LogisticRegression()
glm.fit(df_term[features], df_term.is_bad)
glm.pre