In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sb
pd.set_option("max.columns", 100)
pd.set_option("max.colwidth", 100)

In [2]:
df = pd.read_csv("loan_training_data.csv")

In [3]:
data_dict = pd.read_csv("the_data_dictionary.csv")
data_dict

Unnamed: 0,dtypes,name,description
0,float64,loan_amnt,"The listed amount of the loan applied for by the borrower. If at some point in time, the credit ..."
1,object,term,The number of payments on the loan. Values are in months and can be either 36 or 60.
2,float64,installment,The monthly payment owed by the borrower if the loan originates.
3,object,grade,LC assigned loan grade
4,object,emp_length,Employment length in years. Possible values are between 0 and 10 where 0 means less than one yea...
5,object,home_ownership,The home ownership status provided by the borrower during registration or obtained from the cred...
6,float64,annual_inc,The self-reported annual income provided by the borrower during registration.
7,object,verification_status,"Indicates if income was verified by LC, not verified, or if the income source was verified"
8,object,loan_status,Current status of the loan
9,object,purpose,A category provided by the borrower for the loan request.


In [4]:
df.head()

Unnamed: 0,loan_amnt,term,installment,grade,emp_length,home_ownership,annual_inc,verification_status,loan_status,purpose,dti,delinq_2yrs,open_acc,revol_bal,total_acc,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,avg_cur_bal,bc_util,mort_acc,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_rev_tl_bal_gt_0,num_tl_90g_dpd_24m,num_tl_op_past_12m,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,fico_average
0,15000.0,36 months,485.14,B,9 years,MORTGAGE,125000.0,Not Verified,1,debt_consolidation,17.22,0.0,11.0,37651.0,24.0,0.0,87483.0,58300.0,7953.0,53.6,2.0,0.0,2.0,7.0,2.0,8.0,7.0,0.0,1.0,87483.0,8500.0,45764.0,687.0
1,12250.0,60 months,295.37,C,7 years,RENT,35000.0,Source Verified,0,credit_card,19.51,0.0,9.0,12681.0,19.0,0.0,13938.0,19345.0,1742.0,69.1,0.0,0.0,5.0,5.0,7.0,16.0,5.0,0.0,1.0,13938.0,18345.0,14793.0,722.0
2,17000.0,36 months,556.48,B,10+ years,MORTGAGE,67000.0,Source Verified,1,debt_consolidation,21.26,0.0,14.0,27320.0,33.0,0.0,43035.0,43500.0,3074.0,66.2,1.0,0.0,4.0,6.0,6.0,12.0,6.0,0.0,0.0,43035.0,31800.0,27657.0,747.0
3,8250.0,36 months,263.01,B,9 years,OWN,29000.0,Not Verified,1,debt_consolidation,24.34,0.0,14.0,8253.0,22.0,0.0,46863.0,13600.0,3347.0,71.8,0.0,0.0,4.0,5.0,4.0,4.0,5.0,0.0,0.0,46863.0,11400.0,40279.0,702.0
4,7125.0,36 months,256.06,D,9 years,RENT,87000.0,Source Verified,0,house,13.92,2.0,12.0,2426.0,19.0,1215.0,22318.0,5400.0,1860.0,47.2,0.0,0.0,5.0,7.0,7.0,8.0,8.0,0.0,2.0,22318.0,4700.0,26334.0,677.0


In [5]:
X = df.drop("loan_status", axis=1)
y = df.loan_status

In [6]:
term_dummies = pd.get_dummies(X.term, prefix="term:")
grade_dummies = pd.get_dummies(X.grade, prefix="grade:")
home_dummies = pd.get_dummies(X.home_ownership, prefix="home:")
verified_dummies = pd.get_dummies(X.verification_status, prefix="verify:")
purpose_dummies = pd.get_dummies(X.purpose, prefix="purpose")
dummies = pd.concat([grade_dummies,home_dummies,purpose_dummies, term_dummies, verified_dummies], axis=1)
X.drop(["term", "grade", "home_ownership", "verification_status", "purpose"], axis=1, inplace=True)
X = pd.concat([X,dummies], axis=1)
X["emp_length"] = X.emp_length.str.replace(" years", "").str.replace(" year", "").str.replace("< 1", "0").str.replace("+", "").str.replace("n/a", "0")
X["emp_length"] = X.emp_length.astype(float)

In [7]:
X.head()

Unnamed: 0,loan_amnt,installment,emp_length,annual_inc,dti,delinq_2yrs,open_acc,revol_bal,total_acc,tot_coll_amt,tot_cur_bal,total_rev_hi_lim,avg_cur_bal,bc_util,mort_acc,num_accts_ever_120_pd,num_actv_bc_tl,num_actv_rev_tl,num_bc_sats,num_bc_tl,num_rev_tl_bal_gt_0,num_tl_90g_dpd_24m,num_tl_op_past_12m,total_bal_ex_mort,total_bc_limit,total_il_high_credit_limit,fico_average,grade:_A,grade:_B,grade:_C,grade:_D,grade:_E,grade:_F,grade:_G,home:_MORTGAGE,home:_NONE,home:_OTHER,home:_OWN,home:_RENT,purpose_car,purpose_credit_card,purpose_debt_consolidation,purpose_home_improvement,purpose_house,purpose_major_purchase,purpose_medical,purpose_moving,purpose_other,purpose_renewable_energy,purpose_small_business,purpose_vacation,purpose_wedding,term:_ 36 months,term:_ 60 months,verify:_Not Verified,verify:_Source Verified,verify:_Verified
0,15000.0,485.14,9.0,125000.0,17.22,0.0,11.0,37651.0,24.0,0.0,87483.0,58300.0,7953.0,53.6,2.0,0.0,2.0,7.0,2.0,8.0,7.0,0.0,1.0,87483.0,8500.0,45764.0,687.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1,12250.0,295.37,7.0,35000.0,19.51,0.0,9.0,12681.0,19.0,0.0,13938.0,19345.0,1742.0,69.1,0.0,0.0,5.0,5.0,7.0,16.0,5.0,0.0,1.0,13938.0,18345.0,14793.0,722.0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0
2,17000.0,556.48,10.0,67000.0,21.26,0.0,14.0,27320.0,33.0,0.0,43035.0,43500.0,3074.0,66.2,1.0,0.0,4.0,6.0,6.0,12.0,6.0,0.0,0.0,43035.0,31800.0,27657.0,747.0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0
3,8250.0,263.01,9.0,29000.0,24.34,0.0,14.0,8253.0,22.0,0.0,46863.0,13600.0,3347.0,71.8,0.0,0.0,4.0,5.0,4.0,4.0,5.0,0.0,0.0,46863.0,11400.0,40279.0,702.0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
4,7125.0,256.06,9.0,87000.0,13.92,2.0,12.0,2426.0,19.0,1215.0,22318.0,5400.0,1860.0,47.2,0.0,0.0,5.0,7.0,7.0,8.0,8.0,0.0,2.0,22318.0,4700.0,26334.0,677.0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0


In [8]:
#WITH TEST DATA

In [9]:
dft = pd.read_csv("loan_testing_data.csv").sample(frac=.2, random_state=1234)
dft.shape

(10000, 33)

In [10]:
Xt = dft.drop("loan_status", axis=1)
yt = dft.loan_status

Xt.shape

(10000, 32)

In [11]:
term_dummies_t = pd.get_dummies(Xt.term, prefix="term:")
grade_dummies_t = pd.get_dummies(Xt.grade, prefix="grade:")
home_dummies_t = pd.get_dummies(Xt.home_ownership, prefix="home:")
verified_dummies_t = pd.get_dummies(Xt.verification_status, prefix="verify:")
purpose_dummies_t = pd.get_dummies(Xt.purpose, prefix="purpose")
dummies_t = pd.concat([grade_dummies_t,home_dummies_t,purpose_dummies_t, term_dummies_t, verified_dummies_t], axis=1)
Xt.drop(["term", "grade", "home_ownership", "verification_status", "purpose"], axis=1, inplace=True)
Xt = pd.concat([Xt,dummies_t], axis=1)
Xt["emp_length"] = Xt.emp_length.str.replace(" years", "").str.replace(" year", "").str.replace("< 1", "0").str.replace("+", "").str.replace("n/a", "0")
Xt["emp_length"] = Xt.emp_length.astype(float)

In [12]:
for i in X.columns:
    if i not in Xt.columns:
        print i
for i in X.columns:
    if i not in Xt.columns:
        X.drop(i, axis=1, inplace=True)

home:_NONE


In [13]:
import numpy as np
np.size(X.columns)

56

In [14]:
np.size(Xt.columns)

56

In [15]:
from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
Xs = scale.fit_transform(X)
Xts = scale.transform(Xt)

In [16]:
from sklearn.metrics import confusion_matrix
#Profit calculator
def profit_calculator(y_test, y_pred):
    cm = confusion_matrix(y_test, y_pred)
    tp = cm[1,1]
    fp = cm[0,1]
    return 100*tp - 1000*fp

In [17]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score



In [18]:
parameters = [{'max_depth': [1,2,3,4],'n_estimators':[10,20,50,100,150]},{'learning_rate': [.50, .1,.15]}]

X_train, X_test, y_train, y_test = train_test_split(Xs, y, test_size=0.30, random_state=101)
model = GradientBoostingClassifier(random_state=42)
f1_scorer = make_scorer(f1_score)

In [35]:
'''
param_test1 = {'n_estimators':range(20,81,10)}
gsearch1 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',subsample=0.8,random_state=10), 
param_grid = param_test1, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch1.fit(X_train,y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

param_test2 = {'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200)}
gsearch2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, n_estimators=60, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = param_test2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsearch2.fit(X_train,y_train)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_
'''

best_model = GradientBoostingClassifier(learning_rate=0.005, n_estimators=1200,max_depth=9, min_samples_split=1200, min_samples_leaf=60, subsample=0.85, random_state=10, max_features=7,
warm_start=True)
best_model.fit(X_train,y_train)
best_model

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.005, loss='deviance', max_depth=9,
              max_features=7, max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=60,
              min_samples_split=1200, min_weight_fraction_leaf=0.0,
              n_estimators=1200, presort='auto', random_state=10,
              subsample=0.85, verbose=0, warm_start=True)

In [21]:
best_model = gsearch2.best_estimator_
best_model

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=7,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=1,
              min_samples_split=600, min_weight_fraction_leaf=0.0,
              n_estimators=60, presort='auto', random_state=10,
              subsample=0.8, verbose=0, warm_start=False)

In [None]:
#grid_obj = GridSearchCV(model, parameters, cv=5, scoring= f1_scorer)
#grid_obj.fit(X_train, y_train)
#best_model = grid_obj.best_estimator_
#best_model

In [36]:
y_prob_pred = best_model.predict_proba(X_test)
threshold_adjusted_preds = np.where(y_prob_pred>=0.9, 1, 0)
predictions = threshold_adjusted_preds[:,1]
profit_calculator(y_test, predictions)

189200

In [37]:
print f1_score(y_test, predictions)

0.450857643671


In [24]:
#TEST DATA

In [38]:
yt_prob_pred = best_model.predict_proba(Xts)
threshold_adjusted_test_preds = np.where(yt_prob_pred>=0.9, 1, 0)
test_predictions1 = threshold_adjusted_test_preds[:,1]
profit_calculator(yt, test_predictions1)

85800

In [30]:
from sklearn.metrics import classification_report,confusion_matrix
print(classification_report(y_test,predictions))

             precision    recall  f1-score   support

          0       0.23      0.91      0.37      5785
          1       0.93      0.29      0.44     24215

avg / total       0.80      0.41      0.43     30000



In [31]:
print(confusion_matrix(y_test,predictions))

[[ 5254   531]
 [17163  7052]]
