In [99]:
import pipeline as pl

In [100]:
# Name of the target (label) column.
y = 'SeriousDlqin2yrs'

# Load the data in this cell so it runs on a fresh kernel. `df` was otherwise
# only created by a later cell, so "Restart & Run All" failed here with a
# NameError on `df`.
df = pl.read_data('credit-data.csv', 'csv')

# Predictor names: every column except the target, in original column order.
cols = list(df.columns.values)
X = [x for x in cols if x != y]

# `features` is bound to the same list object as `X` (an alias, not a copy).
# NOTE(review): columns added to `df` later in Step 4 are never appended to
# this list -- confirm whether the engineered columns were meant to be used
# as model inputs.
features = X

# Steps 1 & 2: Read and Explore Data

In [101]:
# Read the credit dataset through the pipeline helper, then show its overview
# (the captured output starts with a summary-statistics table).
data_path, data_format = 'credit-data.csv', 'csv'
df = pl.read_data(data_path, data_format)
pl.data_overview(df)

___ Summary Statistics ___
       SeriousDlqin2yrs  RevolvingUtilizationOfUnsecuredLines            age  \
count     150000.000000                         150000.000000  150000.000000   
mean           0.066840                              6.048438      52.295207   
std            0.249746                            249.755371      14.771866   
min            0.000000                              0.000000       0.000000   
25%            0.000000                              0.029867      41.000000   
50%            0.000000                              0.154181      52.000000   
75%            0.000000                              0.559046      63.000000   
max            1.000000                          50708.000000     109.000000   

             zipcode  NumberOfTime30-59DaysPastDueNotWorse      DebtRatio  \
count  150000.000000                         150000.000000  150000.000000   
mean    60648.810013                              0.421033     353.005076   
std        56.748197 

In [102]:
# One graph per predictor column; the target column is excluded because it is
# not part of `X`.
for feature_name in X:
    pl.make_graph(df, feature_name)

Plotting RevolvingUtilizationOfUnsecuredLines
Plotting age
Plotting zipcode
Plotting NumberOfTime30-59DaysPastDueNotWorse
Plotting DebtRatio
Plotting MonthlyIncome
Plotting NumberOfOpenCreditLinesAndLoans
Plotting NumberOfTimes90DaysLate
Plotting NumberRealEstateLoansOrLines
Plotting NumberOfTime60-89DaysPastDueNotWorse
Plotting NumberOfDependents


# Step 3: Pre-Process Data

In [103]:
# Column -> fill strategy passed to the pipeline's null-filling helper.
# The helper's return value is not used; the null counts displayed below come
# out as zero, so it presumably fills `df` in place -- verify against pipeline.py.
fill_plan = (
    ('MonthlyIncome', 'mean'),
    ('NumberOfDependents', 'median'),
)
for column, strategy in fill_plan:
    pl.fill_null(df, column, strategy)

# Remaining null count per column (displayed as the cell output).
df.isnull().sum()

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
zipcode                                 0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

# Step 4: Generate Features/Predictors

In [104]:
# Bin NumberOfDependents into three labelled bins (cut_type='uniform'), then
# binarize the resulting 'NumberOfDependents_discretize' column.
dependents_labels = ['low', 'med', 'high']
dependents_binned = pl.discretize_cont_var(
    df,
    'NumberOfDependents',
    num_bins=3,
    cut_type='uniform',
    labels=dependents_labels,
)
df = pl.binarize_categ_var(dependents_binned, 'NumberOfDependents_discretize')

In [105]:
# Bin MonthlyIncome into five labelled bins (cut_type='quantile'), then
# binarize the resulting 'MonthlyIncome_discretize' column.
income_labels = ['low_level', 'med minus', 'med_level', 'med plus', 'high_level']
income_binned = pl.discretize_cont_var(
    df,
    'MonthlyIncome',
    num_bins=5,
    cut_type='quantile',
    labels=income_labels,
)
df = pl.binarize_categ_var(income_binned, 'MonthlyIncome_discretize')

In [106]:
# Preview the first five rows, including the newly generated columns.
df.head(n=5)

Unnamed: 0_level_0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,zipcode,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,...,NumberOfDependents_discretize,low,med,high,MonthlyIncome_discretize,low_level,med minus,med_level,med plus,high_level
PersonID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0.766127,45,60644,2,0.802982,9120.0,13,0,6,...,low,1,0,0,high_level,0,0,0,0,1
2,0,0.957151,40,60637,0,0.121876,2600.0,4,0,0,...,low,1,0,0,low_level,1,0,0,0,0
3,0,0.65818,38,60601,1,0.085113,3042.0,2,1,0,...,low,1,0,0,low_level,1,0,0,0,0
4,0,0.23381,30,60601,0,0.03605,3300.0,5,0,0,...,low,1,0,0,low_level,1,0,0,0,0
5,0,0.907239,49,60625,1,0.024926,63588.0,7,0,1,...,low,1,0,0,high_level,0,0,0,0,1


# Step 5: Build Classifier

In [122]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [123]:
# Split into train/test parts using the predictor names in `X` and target `y`.
# NOTE(review): 0.2 looks like the held-out fraction (the reported accuracies
# are consistent with a 30,000-row test set out of 150,000) -- confirm against
# pl.split_data.
split_parts = pl.split_data(df, X, y, 0.2)
X_train, X_test, y_train, y_test = split_parts

In [129]:
# Candidate models, each constructed with its default arguments.
# The last entry was misspelled `GradientBopostingClassifier`, which raised
# NameError; it now matches the class imported from sklearn.ensemble.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    GradientBoostingClassifier(),
]

In [130]:
# Run the pipeline's test and predict helpers for every candidate classifier.
# A stray trailing `d` after the predict_model call made this cell a
# SyntaxError; it is removed here.
for method in classifiers:
    pl.test_model(X_train, y_train, features, method)
    # NOTE(review): `y` is the target column *name*, whereas pl.eval_model is
    # given `y_test` in the same position -- confirm which one
    # pl.predict_model expects.
    pl.predict_model(X_train, y_train, X_test, y, features, method)

# Step 6: Evaluate Classifier

In [132]:
# Evaluate each candidate on the held-out split, in the order of `classifiers`.
# The captured output is one accuracy/recall/precision triple per classifier,
# unlabelled -- presumably printed by pl.eval_model in this same order.
for clf in classifiers:
    pl.eval_model(X_train, y_train, X_test, y_test, features, clf)

Accuracy score is: 0.9329
Recall score is: 0.014392059553349877
Precision score is: 0.5178571428571429
Accuracy score is: 0.9304666666666667
Recall score is: 0.012903225806451613
Precision score is: 0.21138211382113822
Accuracy score is: 0.8955666666666666
Recall score is: 0.2823821339950372
Precision score is: 0.2522163120567376
Accuracy score is: 0.9332666666666667
Recall score is: 0.15285359801488835
Precision score is: 0.5107794361525705
Accuracy score is: 0.9365
Recall score is: 0.2044665012406948
Precision score is: 0.5770308123249299
