In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

%matplotlib inline

In [11]:
df = pd.read_csv("data/loan.csv")

In [12]:
# Works to drop columsn with less than 800,000 obs
columns = []
for column in df.columns:
    if df[column].count() > 800000:
        columns.append(column)
data = df[columns]

In [13]:
def discount_correction(amount):
    '''Correction for time-value of money goes here.
    Probably involves loan origination date and term...'''
    pass

df['profit_loss'] = df['total_pymnt'] - df['loan_amnt'].apply(discount_correction)

def binary_profit(value):
    if value > 0:
        return 1
    else:
        return 0
df['profit_loss'] = df['total_pymnt'] - df['loan_amnt']
df['binary_profit_loss'] = df['profit_loss'].apply(binary_profit)

In [15]:
df.drop('loan_status',axis=1,inplace=True)

In [17]:
import pandas as pd
import numpy as np
import re

def getyear(datestr):
    if type(datestr) == str:
        if len(datestr) > 7:
            return int(datestr[-4:])
    else:
        return np.NaN
        
def getmonth(datestr):
    if type(datestr) == str:
        if len(datestr) > 7:
            months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
            return months.index(datestr[:3]) + 1
        else:
            return np.NaN

def getemp(time_input):
    if type(time_input) == str:
        if time_input == 'n/a':
            return np.NaN
        else:
            num = int(re.sub("[^0-9]", "", time_input))
            if '<' in time_input:
                num = num - 1
        return int(num)
    else:
        return time_input
    
def getterm(term_input):
    if type(term_input) == str:
        if '6' in term_input:
            return 36 + 24*('60' in term_input)
    else:
        return np.NaN

def getgrade(grade):
    if type(grade) == str:
        letters = 'ABCDEFGHIJK'
        A = letters.find(grade[0])*6
        if len(grade) > 1:
            B = int(grade[1:])
        else:
            B = 0
            A = A/6
        return A + B
    else:
        return grade

def makebinaries(dataframe, column_name, max_values = False):
    names = dataframe[column_name].value_counts().index
    newcolnames, n = [], len(names)
    if max_values > 0 and max_values < n:
        names = names[:max_values]
    for name in names:
        newcolname = column_name + '_' + name
        dataframe[newcolname] = (dataframe[column_name] == name) + 0
        newcolnames.append(newcolname)
    if max_values < n:
        newcolname = column_name + '_IS_OTHER_VALUE'
        dataframe[newcolname] = 1 - (dataframe[column_name].isin(names))
        newcolnames.append(newcolname)
    return newcolnames

date_column_names = ['earliest_cr_line', 'issue_d','last_pymnt_d',
                     'next_pymnt_d', 'last_credit_pull_d']
date_column_names_pre_loan = ['earliest_cr_line', 'issue_d']
new_column_names_all, new_column_names_features = [], []
for name in date_column_names:
    nm, ny = name+'_month', name+'_year'
    df[nm] = df[name].apply(getmonth,1)
    df[ny] = df[name].apply(getyear,1)
    new_column_names_all.extend([nm, ny])
    if name in date_column_names_pre_loan:
        new_column_names_features.extend([nm, ny])
        
numerical_names = []
numerical_names.extend(makebinaries(df, 'home_ownership', max_values = 3))
numerical_names.extend(makebinaries(df, 'purpose'))

df['is_joint'] = (df['application_type'] == 'JOINT') + 0
df['grade_num'] = df['grade'].apply(getgrade,1)
df['sub_grade_num'] = df['grade'].apply(getgrade,1)
df['term_num'] = df['term'].apply(getterm,1)
df['months_loan_to_last_pay'] = (df.last_pymnt_d_year-df.issue_d_year)*12 + (df.last_pymnt_d_month-df.issue_d_month)
numerical_names.extend(['is_joint', 'grade_num', 'sub_grade_num','term_num'])

new_column_names_features.extend(numerical_names)
new_column_names_all.extend(numerical_names)
new_column_names_all.append('months_loan_to_last_pay')

In [31]:
# Columns without loan knowledge
columns = ['annual_inc','dti','pub_rec', 'loan_amnt','binary_profit_loss']

columns.extend(new_column_names_features)

In [32]:
columns

['annual_inc',
 'dti',
 'pub_rec',
 'loan_amnt',
 'binary_profit_loss',
 'earliest_cr_line_month',
 'earliest_cr_line_year',
 'issue_d_month',
 'issue_d_year',
 'home_ownership_MORTGAGE',
 'home_ownership_RENT',
 'home_ownership_OWN',
 'home_ownership_IS_OTHER_VALUE',
 'purpose_debt_consolidation',
 'purpose_credit_card',
 'purpose_home_improvement',
 'purpose_other',
 'purpose_major_purchase',
 'purpose_small_business',
 'purpose_car',
 'purpose_medical',
 'purpose_moving',
 'purpose_vacation',
 'purpose_house',
 'purpose_wedding',
 'purpose_renewable_energy',
 'purpose_educational',
 'purpose_IS_OTHER_VALUE',
 'is_joint',
 'grade_num',
 'sub_grade_num',
 'term_num']

In [33]:
data = df[columns]

In [34]:
data.dropna(inplace=True)
data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


Unnamed: 0,annual_inc,dti,pub_rec,loan_amnt,binary_profit_loss,earliest_cr_line_month,earliest_cr_line_year,issue_d_month,issue_d_year,home_ownership_MORTGAGE,...,purpose_vacation,purpose_house,purpose_wedding,purpose_renewable_energy,purpose_educational,purpose_IS_OTHER_VALUE,is_joint,grade_num,sub_grade_num,term_num
0,24000.0,27.65,0.0,5000.0,1,1.0,1985.0,12,2011,0,...,0,0,0,0,0,0,0,1,1,36
1,30000.0,1.0,0.0,2500.0,0,4.0,1999.0,12,2011,0,...,0,0,0,0,0,0,0,2,2,60
2,12252.0,8.72,0.0,2400.0,1,11.0,2001.0,12,2011,0,...,0,0,0,0,0,0,0,2,2,36
3,49200.0,20.0,0.0,10000.0,1,2.0,1996.0,12,2011,0,...,0,0,0,0,0,0,0,2,2,36
4,80000.0,17.94,0.0,3000.0,1,1.0,1996.0,12,2011,0,...,0,0,0,0,0,0,0,1,1,60


In [35]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 887350 entries, 0 to 887378
Data columns (total 32 columns):
annual_inc                       887350 non-null float64
dti                              887350 non-null float64
pub_rec                          887350 non-null float64
loan_amnt                        887350 non-null float64
binary_profit_loss               887350 non-null int64
earliest_cr_line_month           887350 non-null float64
earliest_cr_line_year            887350 non-null float64
issue_d_month                    887350 non-null int64
issue_d_year                     887350 non-null int64
home_ownership_MORTGAGE          887350 non-null int64
home_ownership_RENT              887350 non-null int64
home_ownership_OWN               887350 non-null int64
home_ownership_IS_OTHER_VALUE    887350 non-null int64
purpose_debt_consolidation       887350 non-null int64
purpose_credit_card              887350 non-null int64
purpose_home_improvement         887350 non-null int

In [39]:
data.to_csv('data/loans_toy.csv')

In [9]:
# columns = ['annual_inc', 'application_type', 'dti', 'emp_length', 'mortgage_home',
#            'owns_home', 'rents_home', 'other_home', 'term', 'pub_rec', 'loan_amnt','binary_profit_loss']
# X = df[columns]
# y = df['binary_pro']

KeyError: "['mortgage_home' 'owns_home' 'rents_home' 'other_home'] not in index"

In [11]:
y = pd.to_numeric(y)

In [12]:
y.dtype

dtype('int64')

# Trying out pipelines

In [14]:
import sklearn.pipeline
import sklearn.grid_search
import sklearn

In [15]:
select = sklearn.feature_selection.SelectKBest(k=5)
clf = sklearn.ensemble.RandomForestClassifier()

steps = [('feature_selection', select),
        ('random_forest', clf)]

pipeline = sklearn.pipeline.Pipeline(steps)

X_train, X_test, y_train, y_test = sklearn.cross_validation.train_test_split(X, y, test_size=0.33, random_state=42)

pipeline.fit(X_train, y_train)

preds_probas = pipeline.predict_proba(X_test)
preds = pipeline.predict(X_test)

report = sklearn.metrics.classification_report(y_test, preds )



In [16]:
print report

             precision    recall  f1-score   support

          0       0.27      0.16      0.20     14936
          1       0.83      0.90      0.86     66544

avg / total       0.73      0.77      0.74     81480



# Pipeline with grid search over random forest

In [17]:
parameters = dict(feature_selection__k=[5,10],
                 random_forest__n_estimators=[100,200,300,400,500,600],
                 random_forest__min_samples_split=[5,10,15,20,50,100])

cv = sklearn.grid_search.GridSearchCV(pipeline,param_grid=parameters)

cv.fit(X_train, y_train)
preds = cv.predict(X_test)
preds_proba = cv.predict_proba(X_test)
report = sklearn.metris.classification_report(y_test,preds)
print report

KeyboardInterrupt: 

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [14]:
random_forest = RandomForestClassifier(n_estimators=500,random_state=42, max_depth=10)
random_forest = random_forest.fit(X_train, y_train)
random_forest.score(X_test,y_test)


sorted(zip(X.columns,random_forest.feature_importances_), key=lambda tup: tup[1], reverse=True)

[('term', 0.28726025689307833),
 ('dti', 0.2260654480383589),
 ('annual_inc', 0.21421347168383265),
 ('loan_amnt', 0.14794474721423856),
 ('emp_length', 0.043808245394782661),
 ('pub_rec', 0.025468918266896831),
 ('rents_home', 0.025436104853232142),
 ('mortgage_home', 0.022735812792165622),
 ('owns_home', 0.0055394504956541041),
 ('other_home', 0.0015191347054284558),
 ('application_type', 8.4096623321016688e-06)]

In [15]:
random_forest.score(X_test,y_test)

0.81676184304672417

In [45]:
preds_proba = random_forest.predict_proba(X_test)
preds_abs = random_forest.predict(X_test)

In [40]:
preds_proba = random_forest.predict_proba(X_test)
preds = []
for pred in preds_proba[:,1]:
    if pred > 0.8:
        preds.append(1)
    else:
        preds.append(0)

In [24]:
from sklearn.metrics import confusion_matrix, classification_report

In [46]:
print type(preds_abs)
print type(preds)

<type 'numpy.ndarray'>
<type 'list'>


In [47]:
preds = np.asarray(preds)

In [48]:
preds.shape

(74073,)

In [49]:
preds_abs.shape

(74073,)

In [41]:
confusion_matrix(y_test, preds)

array([[ 6617,  6972],
       [16110, 44374]])

In [55]:
classification_report(y_test,preds_abs,target_names=['bad','good'])

'             precision    recall  f1-score   support\n\n        bad       0.60      0.00      0.01     13589\n       good       0.82      1.00      0.90     60484\n\navg / total       0.78      0.82      0.74     74073\n'

            precision    recall   f1-score   support
            
bad           0.60        0.00      0.01     13589

good          0.82        1.00      0.90     60484

avg/total     0.78        0.82        0.74       74073

In [53]:
classification_report(y_test,preds,target_names=['bad','good'])

'             precision    recall  f1-score   support\n\n        bad       0.29      0.49      0.36     13589\n       good       0.86      0.73      0.79     60484\n\navg / total       0.76      0.69      0.71     74073\n'

In [51]:
pd.crosstab(y_test, preds_abs, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,48,13541,13589
1,32,60452,60484
All,80,73993,74073


In [50]:
pd.crosstab(y_test, preds, rownames=['True'], colnames=['Predicted'], margins=True)

Predicted,0,1,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,6617,6972,13589
1,16110,44374,60484
All,22727,51346,74073


In [54]:
44374/float(60484)

0.7336485682163878

In [52]:
6617/float(13589)

0.48693796453013466