In [1]:
## Import warnings. Supress warnings (for  matplotlib)
import warnings
warnings.filterwarnings("ignore")

In [2]:
## Import analysis modules
import pandas as p
from pandas.tools.plotting import scatter_matrix
from numpy import nan, isnan, mean, std
from sklearn.cross_validation import train_test_split, cross_val_score, KFold, LeaveOneOut, LeavePOut, StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler, Binarizer, Imputer, \
LabelEncoder, OneHotEncoder, scale
from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, roc_curve, auc

## Import datetime module
from datetime import datetime

## Import visualization modules
import matplotlib.pyplot as plt
import matplotlib.mlab as mlab

In [3]:
## Read in file
data = p.read_csv('F:\\DePaul\\IS467\\Week5\\loan.csv',delimiter='~}',na_values='nan',)

In [4]:
## Count of instances and features
rows, columns = data.shape
print data.shape

(42535, 112)


In [5]:
## Get basic statistics for continuous features
numeric = data.describe(include=['number']).T.reset_index()
numeric.rename(columns={'index':'feature'},inplace=True)
numeric.insert(1,'missing',(rows - numeric['count'])/ float(rows))

In [6]:
## How many features can we eliminate?
drop = numeric[(numeric['missing']==1) | (numeric['std']==0)]

In [7]:
## Drop the unhelpful features from the base and numeric table
data = data.drop(drop['feature'],axis=1)
numeric = numeric[~numeric['feature'].isin(drop['feature'])]

In [8]:
## Get basic statistics for discrete features
discrete = data.describe(include=['object']).T.reset_index()
discrete.rename(columns={'index':'feature'},inplace=True)
discrete.insert(1,'missing',(rows - discrete['count'])/ float(rows))

In [9]:
## How many features can we eliminate?
ddrop = discrete[(discrete['missing']>.6) | (discrete['unique']==1)]

In [10]:
## Drop unhelpful features from the base table
data = data.drop(ddrop['feature'],axis=1)
discrete = discrete[~discrete['feature'].isin(ddrop['feature'])]

In [11]:
## How many columns do we have left?
data.shape

(42535, 51)

In [12]:
## Check missing cases in delinq_2yrs
data['delinq_2yrs'].isnull().any()

True

In [13]:
## Impute missing cases using scikit learn
imp = Imputer(missing_values='NaN',strategy='median',axis=0)
data['delinq_2yrs'] = imp.fit_transform(data['delinq_2yrs'].reshape(-1,1))

In [14]:
## Check missing cases in delinq_2yrs
data['delinq_2yrs'].isnull().any()

False

In [15]:
## Address by stripping leading space
data['term'] = data['term'].str.strip()

In [16]:
## Keep only those loan statuses where fully paid or charged off
data = data[data['loan_status'].isin(['Fully Paid','Charged Off'])]

In [17]:
## Can also impute missing cases using pandas
data.fillna(data.median(),inplace=True)

Unnamed: 0,id,member_id,loan_amnt,funded_amnt,funded_amnt_inv,term,int_rate,installment,grade,sub_grade,...,total_rec_late_fee,recoveries,collection_recovery_fee,last_pymnt_d,last_pymnt_amnt,last_credit_pull_d,acc_now_delinq,delinq_amnt,pub_rec_bankruptcies,tax_liens
0,1077501,1296599,5000,5000,4975.000000,36 months,10.65%,162.87,B,B2,...,0.00,0.00,0.0000,Jan-2015,171.62,Jul-2016,0.0,0.0,0.0,0.0
1,1077430,1314167,2500,2500,2500.000000,60 months,15.27%,59.83,C,C4,...,0.00,117.08,1.1100,Apr-2013,119.66,Sep-2013,0.0,0.0,0.0,0.0
2,1077175,1313524,2400,2400,2400.000000,36 months,15.96%,84.33,C,C5,...,0.00,0.00,0.0000,Jun-2014,649.91,Jul-2016,0.0,0.0,0.0,0.0
3,1076863,1277178,10000,10000,10000.000000,36 months,13.49%,339.31,C,C1,...,16.97,0.00,0.0000,Jan-2015,357.48,Apr-2016,0.0,0.0,0.0,0.0
5,1075269,1311441,5000,5000,5000.000000,36 months,7.90%,156.46,A,A4,...,0.00,0.00,0.0000,Jan-2015,161.03,Jan-2016,0.0,0.0,0.0,0.0
6,1069639,1304742,7000,7000,7000.000000,60 months,15.96%,170.08,C,C5,...,0.00,0.00,0.0000,May-2016,1313.76,May-2016,0.0,0.0,0.0,0.0
7,1072053,1288686,3000,3000,3000.000000,36 months,18.64%,109.43,E,E1,...,0.00,0.00,0.0000,Jan-2015,111.34,Dec-2014,0.0,0.0,0.0,0.0
8,1071795,1306957,5600,5600,5600.000000,60 months,21.28%,152.39,F,F2,...,0.00,189.06,2.0900,Apr-2012,152.39,Aug-2012,0.0,0.0,0.0,0.0
9,1071570,1306721,5375,5375,5350.000000,60 months,12.69%,121.45,B,B5,...,0.00,269.29,2.5200,Nov-2012,121.45,Mar-2013,0.0,0.0,0.0,0.0
10,1070078,1305201,6500,6500,6500.000000,60 months,14.65%,153.45,C,C3,...,0.00,0.00,0.0000,Jun-2013,1655.54,Dec-2015,0.0,0.0,0.0,0.0


In [18]:
## Leverage regular expressions to clean revol_util and int_rate
data['revol_util'].replace('%','',regex=True,inplace=True)
data['int_rate'].replace('%','',regex=True,inplace=True)

In [19]:
## Create a final frame
final_frame = data.loc[:,['loan_amnt','term','int_rate','annual_inc','installment','verification_status','grade','revol_util','loan_status']]

In [20]:
## Double check feature data types 
final_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38954 entries, 0 to 39785
Data columns (total 9 columns):
loan_amnt              38954 non-null int64
term                   38954 non-null object
int_rate               38954 non-null object
annual_inc             38954 non-null float64
installment            38954 non-null float64
verification_status    38954 non-null object
grade                  38954 non-null object
revol_util             38904 non-null object
loan_status            38954 non-null object
dtypes: float64(2), int64(1), object(6)
memory usage: 3.0+ MB


In [21]:
## Convert revol_util to numeric 
final_frame['revol_util'] = p.to_numeric(final_frame['revol_util'])
final_frame['int_rate'] = p.to_numeric(final_frame['int_rate'])

In [22]:
## Fill na (when these were objects, fillna didn't have an impact)
final_frame['revol_util'].fillna(final_frame['revol_util'].median(),inplace=True)
final_frame['int_rate'].fillna(final_frame['int_rate'].median(),inplace=True)

In [23]:
## Check data types one more time
final_frame.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 38954 entries, 0 to 39785
Data columns (total 9 columns):
loan_amnt              38954 non-null int64
term                   38954 non-null object
int_rate               38954 non-null float64
annual_inc             38954 non-null float64
installment            38954 non-null float64
verification_status    38954 non-null object
grade                  38954 non-null object
revol_util             38954 non-null float64
loan_status            38954 non-null object
dtypes: float64(4), int64(1), object(4)
memory usage: 3.0+ MB


In [24]:
## Scikit learn estimators require numeric features
term_map = {'36 months':0,'60 months':1}
grade_map = {'A':6,'B':5,'C':4,'D':3,'E':2,'F':1,'G':0}
status_map = {'Fully Paid':0,'Charged Off':1}
verified_map = {'Not Verified':0,'Verified':1,'Source Verified':2}

In [25]:
## Convert categorical features to numeric using mapping function
final_frame['term'] = final_frame['term'].map(term_map)
final_frame['grade'] = final_frame['grade'].map(grade_map)
final_frame['loan_status'] = final_frame['loan_status'].map(status_map)
final_frame['verification_status'] = final_frame['verification_status'].map(verified_map)

In [26]:
## Checkout new data 
final_frame.head()

Unnamed: 0,loan_amnt,term,int_rate,annual_inc,installment,verification_status,grade,revol_util,loan_status
0,5000,0,10.65,24000.0,162.87,1,5,83.7,0
1,2500,1,15.27,30000.0,59.83,2,4,9.4,1
2,2400,0,15.96,12252.0,84.33,0,4,98.5,0
3,10000,0,13.49,49200.0,339.31,2,4,21.0,0
5,5000,0,7.9,36000.0,156.46,2,6,28.3,0


In [27]:
## Let's play around with scale; check feature mean and std
print '%0.2f, %0.2f' % (final_frame['installment'].mean(), final_frame['installment'].std())

322.90, 208.56


In [28]:
## Scale the feature using standard scaler
scaled = scale(final_frame['installment'])

In [29]:
## Check mean and standard deviation
print '%0.2f, %0.2f' % (scaled.mean(), scaled.std())

0.00, 1.00


In [30]:
## Scale the feature using MinMaxScaler 
minmax = MinMaxScaler().fit_transform(final_frame['installment'].reshape(-1, 1))

In [31]:
## See feature is now scaled to [0,1]
minmax, min(minmax), max(minmax)

(array([[ 0.11413726],
        [ 0.03423032],
        [ 0.05322993],
        ..., 
        [ 0.10946103],
        [ 0.10832881],
        [ 0.18591702]]), array([ 0.]), array([ 1.]))

In [32]:
## Check out annual income
final_frame['annual_inc'].values 

array([  24000.,   30000.,   12252., ...,  100000.,  200000.,   22000.])

In [33]:
## Binarize a feature using Binarizer
binar =  Binarizer(threshold=25000).fit_transform(final_frame['annual_inc'].reshape(-1, 1))

In [34]:
## See the feature is now binary
binar, min(binar), max(binar)

(array([[ 0.],
        [ 1.],
        [ 0.],
        ..., 
        [ 1.],
        [ 1.],
        [ 0.]]), array([ 0.]), array([ 1.]))

In [35]:
## Scale feature using RobustScaler
rb = RobustScaler().fit_transform(final_frame['annual_inc'].reshape(-1, 1))

In [36]:
## See the feature is now transformed
rb, min(rb), max(rb)

(array([[-0.83333333],
        [-0.69047619],
        [-1.11304762],
        ..., 
        [ 0.97619048],
        [ 3.35714286],
        [-0.88095238]]), array([-1.30952381]), array([ 141.45238095]))

In [37]:
## Seperate input features from target feature
x = final_frame.drop('loan_status',1).as_matrix()
y = final_frame['loan_status'].as_matrix()

In [38]:
## Take a look at x
x

array([[  5.00000000e+03,   0.00000000e+00,   1.06500000e+01, ...,
          1.00000000e+00,   5.00000000e+00,   8.37000000e+01],
       [  2.50000000e+03,   1.00000000e+00,   1.52700000e+01, ...,
          2.00000000e+00,   4.00000000e+00,   9.40000000e+00],
       [  2.40000000e+03,   0.00000000e+00,   1.59600000e+01, ...,
          0.00000000e+00,   4.00000000e+00,   9.85000000e+01],
       ..., 
       [  5.00000000e+03,   0.00000000e+00,   8.07000000e+00, ...,
          0.00000000e+00,   6.00000000e+00,   1.94000000e+01],
       [  5.00000000e+03,   0.00000000e+00,   7.43000000e+00, ...,
          0.00000000e+00,   6.00000000e+00,   7.00000000e-01],
       [  7.50000000e+03,   0.00000000e+00,   1.37500000e+01, ...,
          0.00000000e+00,   2.00000000e+00,   5.15000000e+01]])

In [39]:
## Take a look at y
y

array([0, 1, 0, ..., 0, 0, 0], dtype=int64)

In [40]:
## Lets's try to extract components via PCA 
pca = PCA(n_components=3)
pca.fit(x)

PCA(copy=True, n_components=3, whiten=False)

In [41]:
## Percentage of variance explained by each of the selected components.
print(['%0.2f' % z for z in pca.explained_variance_ratio_]) 

['0.99', '0.01', '0.00']


In [42]:
## Transform x
x_transformed = pca.transform(x)

In [43]:
## Take a look at x
x_transformed

array([[ -4.49781217e+04,  -4.68505440e+03,   3.85182952e+00],
       [ -3.90598022e+04,  -7.37443346e+03,  -3.70343575e+01],
       [ -5.68021395e+04,  -6.91533625e+03,  -5.23776410e+00],
       ..., 
       [  3.09842534e+04,  -7.07618121e+03,  -9.46438557e+00],
       [  1.30934753e+05,  -1.02222145e+04,  -1.78521134e+01],
       [ -4.68984313e+04,  -2.12181401e+03,   2.97045985e+01]])

In [44]:
## Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.3,random_state=1)

In [45]:
## Take a look at the shape
x_train.shape, y_train.shape

((27267L, 8L), (27267L,))

In [46]:
## Splitting the data via KFold (shuffle data before split)
kf = KFold(len(x),n_folds=15,shuffle=True)

In [47]:
## Show indices
for train_index, test_index in kf:
   print("TRAIN:", train_index[0:5], "TEST:", test_index[0:5])

('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([ 11,  26,  43,  79, 101]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([12, 28, 57, 61, 88]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([ 7, 21, 29, 36, 72]))
('TRAIN:', array([0, 1, 3, 4, 5]), 'TEST:', array([ 2, 20, 39, 49, 63]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([22, 27, 30, 33, 41]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([23, 31, 60, 69, 91]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([  9, 113, 126, 155, 174]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([24, 37, 42, 44, 48]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([14, 17, 51, 64, 71]))
('TRAIN:', array([1, 2, 3, 4, 5]), 'TEST:', array([ 0,  6, 10, 19, 38]))
('TRAIN:', array([0, 2, 5, 6, 7]), 'TEST:', array([ 1,  3,  4,  8, 58]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([ 5, 13, 16, 35, 52]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([62, 83, 84, 92, 95]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:

In [48]:
## Splitting the data via LeaveOneOut
loo = LeaveOneOut(10)

In [49]:
## Show indices
for train_index, test_index in loo:
   print("TRAIN:", train_index[0:5], "TEST:", test_index)

('TRAIN:', array([1, 2, 3, 4, 5]), 'TEST:', array([0]))
('TRAIN:', array([0, 2, 3, 4, 5]), 'TEST:', array([1]))
('TRAIN:', array([0, 1, 3, 4, 5]), 'TEST:', array([2]))
('TRAIN:', array([0, 1, 2, 4, 5]), 'TEST:', array([3]))
('TRAIN:', array([0, 1, 2, 3, 5]), 'TEST:', array([4]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([5]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([6]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([7]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([8]))
('TRAIN:', array([0, 1, 2, 3, 4]), 'TEST:', array([9]))


In [50]:
## Get 10 samples 
lpo = LeavePOut(10,p=5)

In [51]:
## Show indices
for train_index, test_index in lpo:
   print("TRAIN:", train_index, "TEST:", test_index)

('TRAIN:', array([5, 6, 7, 8, 9]), 'TEST:', array([0, 1, 2, 3, 4]))
('TRAIN:', array([4, 6, 7, 8, 9]), 'TEST:', array([0, 1, 2, 3, 5]))
('TRAIN:', array([4, 5, 7, 8, 9]), 'TEST:', array([0, 1, 2, 3, 6]))
('TRAIN:', array([4, 5, 6, 8, 9]), 'TEST:', array([0, 1, 2, 3, 7]))
('TRAIN:', array([4, 5, 6, 7, 9]), 'TEST:', array([0, 1, 2, 3, 8]))
('TRAIN:', array([4, 5, 6, 7, 8]), 'TEST:', array([0, 1, 2, 3, 9]))
('TRAIN:', array([3, 6, 7, 8, 9]), 'TEST:', array([0, 1, 2, 4, 5]))
('TRAIN:', array([3, 5, 7, 8, 9]), 'TEST:', array([0, 1, 2, 4, 6]))
('TRAIN:', array([3, 5, 6, 8, 9]), 'TEST:', array([0, 1, 2, 4, 7]))
('TRAIN:', array([3, 5, 6, 7, 9]), 'TEST:', array([0, 1, 2, 4, 8]))
('TRAIN:', array([3, 5, 6, 7, 8]), 'TEST:', array([0, 1, 2, 4, 9]))
('TRAIN:', array([3, 4, 7, 8, 9]), 'TEST:', array([0, 1, 2, 5, 6]))
('TRAIN:', array([3, 4, 6, 8, 9]), 'TEST:', array([0, 1, 2, 5, 7]))
('TRAIN:', array([3, 4, 6, 7, 9]), 'TEST:', array([0, 1, 2, 5, 8]))
('TRAIN:', array([3, 4, 6, 7, 8]), 'TEST:', arra

In [52]:
## Count the number of splits
print len(lpo)

252


In [53]:
## Splitting via StratifiedShuffleSplit
sss = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=1)

In [54]:
## Show indices
for train_index, test_index in sss:
   print("TRAIN:", train_index[0:3], "TEST:", test_index[0:3])

('TRAIN:', array([12747, 33149, 28034], dtype=int64), 'TEST:', array([20861, 35208, 37632], dtype=int64))
('TRAIN:', array([13771, 22368, 19473], dtype=int64), 'TEST:', array([10011, 10817, 30226], dtype=int64))
('TRAIN:', array([10067, 24441,  7808], dtype=int64), 'TEST:', array([18242, 37624,  5061], dtype=int64))


In [55]:
## Create estimator
clf = DecisionTreeClassifier()

In [56]:
## Fit the model using training set 
clf.fit(x_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [57]:
## Check accuracy score
print '%0.2f' % clf.score(x_test,y_test)

0.76


In [58]:
## Run 10 fold cross validation
cvs = cross_val_score(clf,x,y,cv=10)

In [59]:
## Show cross validation scores
cvs

array([ 0.75      ,  0.75616016,  0.75590349,  0.74537988,  0.72631579,
        0.73530167,  0.7283697 ,  0.75969191,  0.77021823,  0.76508344])

In [60]:
## Show cross validation score mean and std
print '%0.2f, %0.2f' % (cvs.mean(), cvs.std())

0.75, 0.01


In [61]:
## Check feature importances ("mean decrease impurity")
clf.feature_importances_

array([ 0.08706964,  0.02865672,  0.15935965,  0.21420771,  0.19365493,
        0.05103071,  0.01836172,  0.24765892])

In [62]:
## Plot feature importances
plt.title("Feature importances")
plt.bar(range(x.shape[1]), clf.feature_importances_,
       color="r", align="center")
plt.xticks(range(x.shape[1]),[y for y in final_frame.columns if y != 'loan_status'])
plt.show()

In [63]:
## Predict y given test set
predictions = clf.predict(x_test)

In [64]:
## Take a look at the confusion matrix ([TN,FN],[FP,TP])
confusion_matrix(y_test,predictions)

array([[8463, 1473],
       [1369,  382]])

In [65]:
## Accuracy score
print '%0.2f' % precision_score(y_test, predictions)

0.21


In [66]:
## Recall score
print '%0.2f' % recall_score(y_test, predictions)

0.22


In [67]:
## Print classification report
print classification_report(y_test, predictions)

             precision    recall  f1-score   support

          0       0.86      0.85      0.86      9936
          1       0.21      0.22      0.21      1751

avg / total       0.76      0.76      0.76     11687



In [68]:
## Get data to plot ROC Curve
fp, tp, th = roc_curve(y_test, predictions)
roc_auc = auc(fp, tp)

In [69]:
## Plot ROC Curve
plt.title('ROC Curve')
plt.plot(fp, tp, 'b',
label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
plt.plot([0,1],[0,1],'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()