# Some modeling and threshold testing 

In [45]:
# Standard imports
import sqlite3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.decomposition import PCA

import sklearn.pipeline
import sklearn.grid_search
import sklearn

%matplotlib inline

In [46]:
data = pd.read_csv('data/loans_toy.csv')

In [47]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887350 entries, 0 to 887349
Data columns (total 34 columns):
Unnamed: 0                       887350 non-null int64
annual_inc                       887350 non-null float64
dti                              887350 non-null float64
pub_rec                          887350 non-null float64
loan_amnt                        887350 non-null float64
binary_profit_loss               887350 non-null int64
profit_loss                      887350 non-null float64
earliest_cr_line_month           887350 non-null float64
earliest_cr_line_year            887350 non-null float64
issue_d_month                    887350 non-null int64
issue_d_year                     887350 non-null int64
home_ownership_MORTGAGE          887350 non-null int64
home_ownership_RENT              887350 non-null int64
home_ownership_OWN               887350 non-null int64
home_ownership_IS_OTHER_VALUE    887350 non-null int64
purpose_debt_consolidation       887350 non-null i

In [55]:
data.head()

Unnamed: 0.1,Unnamed: 0,annual_inc,dti,pub_rec,loan_amnt,binary_profit_loss,profit_loss,earliest_cr_line_month,earliest_cr_line_year,issue_d_month,...,purpose_vacation,purpose_house,purpose_wedding,purpose_renewable_energy,purpose_educational,purpose_IS_OTHER_VALUE,is_joint,grade_num,sub_grade_num,term_num
0,0,24000.0,27.65,0.0,5000.0,1,861.071414,1.0,1985.0,12,...,0,0,0,0,0,0,0,1,1,36
1,1,30000.0,1.0,0.0,2500.0,0,-1491.29,4.0,1999.0,12,...,0,0,0,0,0,0,0,2,2,60
2,2,12252.0,8.72,0.0,2400.0,1,603.653644,11.0,2001.0,12,...,0,0,0,0,0,0,0,2,2,36
3,3,49200.0,20.0,0.0,10000.0,1,2226.302212,2.0,1996.0,12,...,0,0,0,0,0,0,0,2,2,36
4,4,80000.0,17.94,0.0,3000.0,1,242.17,1.0,1996.0,12,...,0,0,0,0,0,0,0,1,1,60


# Subset data so i can test quickly

In [75]:
data_sub = data[:50000]

In [65]:
data_sub.head(1)

Unnamed: 0.1,Unnamed: 0,annual_inc,dti,pub_rec,loan_amnt,binary_profit_loss,profit_loss,earliest_cr_line_month,earliest_cr_line_year,issue_d_month,...,purpose_vacation,purpose_house,purpose_wedding,purpose_renewable_energy,purpose_educational,purpose_IS_OTHER_VALUE,is_joint,grade_num,sub_grade_num,term_num
0,0,24000.0,27.65,0.0,5000.0,1,861.071414,1.0,1985.0,12,...,0,0,0,0,0,0,0,1,1,36


In [76]:
y = data_sub['binary_profit_loss']
profit_loss = data_sub['profit_loss']

# data_sub_X = data_sub.drop('binary_profit_loss',axis=1)
# data_sub_X = data_sub_X.drop('profit_loss',axis=1)


In [66]:
data_sub.columns

Index([u'Unnamed: 0', u'annual_inc', u'dti', u'pub_rec', u'loan_amnt',
       u'binary_profit_loss', u'profit_loss', u'earliest_cr_line_month',
       u'earliest_cr_line_year', u'issue_d_month', u'issue_d_year',
       u'home_ownership_MORTGAGE', u'home_ownership_RENT',
       u'home_ownership_OWN', u'home_ownership_IS_OTHER_VALUE',
       u'purpose_debt_consolidation', u'purpose_credit_card',
       u'purpose_home_improvement', u'purpose_other',
       u'purpose_major_purchase', u'purpose_small_business', u'purpose_car',
       u'purpose_medical', u'purpose_moving', u'purpose_vacation',
       u'purpose_house', u'purpose_wedding', u'purpose_renewable_energy',
       u'purpose_educational', u'purpose_IS_OTHER_VALUE', u'is_joint',
       u'grade_num', u'sub_grade_num', u'term_num'],
      dtype='object')

In [77]:
X = data_sub[[u'Unnamed: 0', u'annual_inc', u'dti', u'pub_rec', u'loan_amnt', u'earliest_cr_line_month',
       u'earliest_cr_line_year', u'issue_d_month', u'issue_d_year',
       u'home_ownership_MORTGAGE', u'home_ownership_RENT',
       u'home_ownership_OWN', u'home_ownership_IS_OTHER_VALUE',
       u'purpose_debt_consolidation', u'purpose_credit_card',
       u'purpose_home_improvement', u'purpose_other',
       u'purpose_major_purchase', u'purpose_small_business', u'purpose_car',
       u'purpose_medical', u'purpose_moving', u'purpose_vacation',
       u'purpose_house', u'purpose_wedding', u'purpose_renewable_energy',
       u'purpose_educational', u'purpose_IS_OTHER_VALUE', u'is_joint',
       u'grade_num', u'sub_grade_num', u'term_num']]

In [69]:
X.head(1)

Unnamed: 0.1,Unnamed: 0,annual_inc,dti,pub_rec,loan_amnt,earliest_cr_line_month,earliest_cr_line_year,issue_d_month,issue_d_year,home_ownership_MORTGAGE,...,purpose_vacation,purpose_house,purpose_wedding,purpose_renewable_energy,purpose_educational,purpose_IS_OTHER_VALUE,is_joint,grade_num,sub_grade_num,term_num
0,0,24000.0,27.65,0.0,5000.0,1.0,1985.0,12,2011,0,...,0,0,0,0,0,0,0,1,1,36


In [72]:
y.head(1)

0    1
Name: binary_profit_loss, dtype: int64

# Function for thresholding

In [78]:
def adjust_threshold(thresh,pred_probas):
    predictions = []
    for pred in pred_probas:
        if pred > thresh:
            predictions.append(1)
        else:
            predictions.append(0)
    return predictions

# Pipeline with gridsearch for random forest

need to see about how to incorporate pca

In [26]:
# Split up the data
X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=0.33, random_state=42)

In [43]:
select = sklearn.feature_selection.SelectKBest(k=5)
clf = sklearn.ensemble.RandomForestClassifier()
pca = PCA(n_components=20)

# Define the steps for the pipeline
steps = [('pca', pca),
        ('feature_selection', select),
        ('random_forest', clf)]

# Declare the pipeline
pipeline_rf = sklearn.pipeline.Pipeline(steps)

# Define grid search parameters for the pipeline (need to match k best to the # of features)
# also adjust criterion, max_features
### NOTE the number of features in feature_selection ALWAYS has to be greater than the lowest value of 
### n_components in pca (because n_components is the # of components pca returns from the original set)
parameters_rf = dict(pca__n_components=[20,30], 
                      feature_selection__k=[10,'all'], 
                      random_forest__n_estimators=[100],
                      random_forest__min_samples_split=[10])

# perform the gridsearch with the pipelined model
model_rf = sklearn.grid_search.GridSearchCV(pipeline_rf, param_grid=parameters_rf)

model_rf.fit(X_train, y_train)

preds_probas = model_rf.predict_proba(X_test)
raw_preds = model_rf.predict(X_test)

# do thresholding here
thresh = 0.7
thresh_preds = adjust_threshold(thresh, preds_probas[:,1])

report_default = sklearn.metrics.classification_report(y_test, raw_preds)
report_custom = sklearn.metrics.classification_report(y_test,thresh_preds)

print report_default
print
print report_custom
print
print model_rf.best_estimator_

             precision    recall  f1-score   support

          0       0.63      0.39      0.48      3794
          1       0.84      0.93      0.88     12706

avg / total       0.79      0.81      0.79     16500


             precision    recall  f1-score   support

          0       0.49      0.61      0.54      3794
          1       0.87      0.81      0.84     12706

avg / total       0.79      0.77      0.77     16500


Pipeline(steps=[('pca', PCA(copy=True, n_components=30, whiten=False)), ('feature_selection', SelectKBest(k='all', score_func=<function f_classif at 0x116bcfb18>)), ('random_forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', m...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])


In [89]:
pd.DataFrame(thresh_preds,y_test)

Unnamed: 0_level_0,0
binary_profit_loss,Unnamed: 1_level_1
1,1
1,1
1,1
1,0
1,0
0,0
1,1
0,0
1,1
1,1


In [84]:
result = pd.DataFrame

numpy.ndarray

In [82]:
profit_loss_pred = pred_probas

pandas.core.series.Series

# Logistic Regression Pipeline and grid-search