In [82]:
import pandas as pd

# 1. Read Data

In [101]:
def read_data(filename, filetype):
    '''
    Read data from a file into a pandas DataFrame.

        filename: string, path of the file to read
        filetype: string, one of 'csv', 'xls' or 'json'

    Returns the loaded DataFrame. For 'csv' the first column of the file is
    used as the index.
    Raises ValueError when filetype is not one of the supported values
    (previously the function fell through and returned None).
    '''
    if filetype == 'csv':
        return pd.read_csv(filename, index_col=0)
    if filetype == 'xls':
        return pd.read_excel(filename)
    if filetype == 'json':
        return pd.read_json(filename)
    # Fail loudly: a silent None only surfaces later as a confusing AttributeError.
    raise ValueError(
        "Unsupported filetype {!r}; expected 'csv', 'xls' or 'json'".format(filetype)
    )


In [102]:
# Load the credit dataset; read_data's 'csv' branch uses the first column as the index.
df = read_data('credit-data.csv', 'csv')

In [103]:
# train_test_split is only imported further down in this notebook, so import it
# here as well; otherwise this cell raises NameError on a fresh kernel.
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows.
# NOTE(review): train/test are not referenced again in the visible cells.
train, test = train_test_split(df, test_size=0.2)

# 2. Explore Data

In [104]:
# Preview the first rows to check that the columns and index loaded as expected.
df.head()

Unnamed: 0_level_0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,zipcode,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
PersonID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,0.766127,45,60644,2,0.802982,9120.0,13,0,6,0,2.0
2,0,0.957151,40,60637,0,0.121876,2600.0,4,0,0,0,1.0
3,0,0.65818,38,60601,1,0.085113,3042.0,2,1,0,0,0.0
4,0,0.23381,30,60601,0,0.03605,3300.0,5,0,0,0,0.0
5,0,0.907239,49,60625,1,0.024926,63588.0,7,0,1,0,0.0


In [109]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,zipcode,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
count,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0,150000.0
mean,0.06684,6.048438,52.295207,60648.810013,0.421033,353.005076,6670.221,8.45276,0.265973,1.01824,0.240387,0.737413
std,0.249746,249.755371,14.771866,56.748197,4.192781,2037.818523,12880.45,5.145951,4.169304,1.129771,4.155179,1.107021
min,0.0,0.0,0.0,60601.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.029867,41.0,60625.0,0.0,0.175074,3903.0,5.0,0.0,0.0,0.0,0.0
50%,0.0,0.154181,52.0,60629.0,0.0,0.366508,6600.0,8.0,0.0,1.0,0.0,0.0
75%,0.0,0.559046,63.0,60644.0,0.0,0.868254,7400.0,11.0,0.0,2.0,0.0,1.0
max,1.0,50708.0,109.0,60804.0,98.0,329664.0,3008750.0,58.0,98.0,54.0,98.0,20.0


In [110]:
# Count missing values per column.
# NOTE(review): the recorded output shows 0 nulls even though this cell comes
# before the fill step -- presumably it was re-run after df had already been
# imputed in place; confirm with Restart Kernel -> Run All.
df.isnull().sum()

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
zipcode                                 0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

# 3. Pre-Process Data

In [111]:
def fill_null(df, col_name, fill_method):
    '''
    Fill null values of the specified column in the dataframe.

        df: pandas DataFrame, modified in place
        col_name: string, column whose nulls are filled
        fill_method: 'mean' or 'median', statistic of the column used as the fill value

    Returns the same DataFrame object that was passed in.
    Raises ValueError for any other fill_method (previously a silent no-op).
    '''
    if fill_method == 'mean':
        fill_value = df[col_name].mean()
    elif fill_method == 'median':
        fill_value = df[col_name].median()
    else:
        raise ValueError(
            "Unsupported fill_method {!r}; expected 'mean' or 'median'".format(fill_method)
        )
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col_name]: that chained form acts on an intermediate object and does
    # not update df when pandas copy-on-write is active.
    df[col_name] = df[col_name].fillna(fill_value)
    return df

In [112]:
# Impute missing values: mean for MonthlyIncome, median for NumberOfDependents.
# The return values are discarded; these calls rely on fill_null updating df itself.
fill_null(df, 'MonthlyIncome', 'mean')
fill_null(df, 'NumberOfDependents', 'median')
# Re-check the per-column null counts after imputation.
df.isnull().sum()

SeriousDlqin2yrs                        0
RevolvingUtilizationOfUnsecuredLines    0
age                                     0
zipcode                                 0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberOfTimes90DaysLate                 0
NumberRealEstateLoansOrLines            0
NumberOfTime60-89DaysPastDueNotWorse    0
NumberOfDependents                      0
dtype: int64

# 4. Generate Features / Predictors

In [113]:
# Show only the first rows instead of dumping the whole 150,000-row frame.
df.head(10)

Unnamed: 0_level_0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,zipcode,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents
PersonID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,1,0.766127,45,60644,2,0.802982,9120.000000,13,0,6,0,2.0
2,0,0.957151,40,60637,0,0.121876,2600.000000,4,0,0,0,1.0
3,0,0.658180,38,60601,1,0.085113,3042.000000,2,1,0,0,0.0
4,0,0.233810,30,60601,0,0.036050,3300.000000,5,0,0,0,0.0
5,0,0.907239,49,60625,1,0.024926,63588.000000,7,0,1,0,0.0
6,0,0.213179,74,60629,0,0.375607,3500.000000,3,0,1,0,1.0
7,0,0.305682,57,60637,0,5710.000000,6670.221237,8,0,3,0,0.0
8,0,0.754464,39,60625,0,0.209940,3500.000000,8,0,0,0,0.0
9,0,0.116951,27,60804,0,46.000000,6670.221237,2,0,0,0,0.0
10,0,0.189169,57,60629,0,0.606291,23684.000000,9,0,4,0,2.0


In [114]:
def discretize_cont_var(df, col_name, num_bins, cut_type, labels=None):
    '''
    Discretize a continuous variable of the DataFrame.

        df: pandas DataFrame, modified in place
        col_name: string, column to discretize
        num_bins: integer, number of bins
        cut_type: string, 'quantile' (pd.qcut) or 'uniform' (pd.cut)
        labels: optional list of strings naming the bins; by default the
            bins are labelled with their intervals

    The binned values are stored in a new column named col_name + '_discretize'.
    Returns the same DataFrame object that was passed in.
    Raises ValueError for any other cut_type (previously df came back unchanged).
    '''
    if cut_type == 'quantile':
        binned = pd.qcut(df[col_name], num_bins, labels=labels)
    elif cut_type == 'uniform':
        binned = pd.cut(df[col_name], num_bins, labels=labels)
    else:
        raise ValueError(
            "Unsupported cut_type {!r}; expected 'quantile' or 'uniform'".format(cut_type)
        )
    df[col_name + '_discretize'] = binned
    return df

In [115]:
# Bin age into 4 equal-width intervals; the result lands in the new column 'age_discretize'.
df = discretize_cont_var(df, 'age', num_bins=4, cut_type='uniform')
# Optional check of how many rows fall in each bin (left disabled):
# df.groupby(['age_discretize']).count()

In [116]:
def binarize_categ_var(df, col_name):
    '''
    Take a categorical variable and create binary/dummy variables from it.

        df: pandas DataFrame (not modified)
        col_name: string, categorical variable to binarize

    Returns a new DataFrame: df with one dummy column per category joined on.
    The dummy columns are named with the string form of each category.
    '''
    dummies = pd.get_dummies(df[col_name])
    # Categories produced by pd.cut/pd.qcut are Interval objects; normalise the
    # column names to plain strings so they can be selected by name,
    # e.g. '(27.25, 54.5]'.
    dummies.columns = [str(category) for category in dummies.columns]
    return df.join(dummies)

In [117]:
# One-hot encode the age bins; the dummy columns are joined onto df.
df = binarize_categ_var(df, 'age_discretize')

In [118]:
# Show only the first rows (now including the age-bin columns) instead of the whole frame.
df.head(10)

Unnamed: 0_level_0,SeriousDlqin2yrs,RevolvingUtilizationOfUnsecuredLines,age,zipcode,NumberOfTime30-59DaysPastDueNotWorse,DebtRatio,MonthlyIncome,NumberOfOpenCreditLinesAndLoans,NumberOfTimes90DaysLate,NumberRealEstateLoansOrLines,NumberOfTime60-89DaysPastDueNotWorse,NumberOfDependents,age_discretize,"(-0.109, 27.25]","(27.25, 54.5]","(54.5, 81.75]","(81.75, 109]"
PersonID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1,1,0.766127,45,60644,2,0.802982,9120.000000,13,0,6,0,2.0,"(27.25, 54.5]",0,1,0,0
2,0,0.957151,40,60637,0,0.121876,2600.000000,4,0,0,0,1.0,"(27.25, 54.5]",0,1,0,0
3,0,0.658180,38,60601,1,0.085113,3042.000000,2,1,0,0,0.0,"(27.25, 54.5]",0,1,0,0
4,0,0.233810,30,60601,0,0.036050,3300.000000,5,0,0,0,0.0,"(27.25, 54.5]",0,1,0,0
5,0,0.907239,49,60625,1,0.024926,63588.000000,7,0,1,0,0.0,"(27.25, 54.5]",0,1,0,0
6,0,0.213179,74,60629,0,0.375607,3500.000000,3,0,1,0,1.0,"(54.5, 81.75]",0,0,1,0
7,0,0.305682,57,60637,0,5710.000000,6670.221237,8,0,3,0,0.0,"(54.5, 81.75]",0,0,1,0
8,0,0.754464,39,60625,0,0.209940,3500.000000,8,0,0,0,0.0,"(27.25, 54.5]",0,1,0,0
9,0,0.116951,27,60804,0,46.000000,6670.221237,2,0,0,0,0.0,"(-0.109, 27.25]",1,0,0,0
10,0,0.189169,57,60629,0,0.606291,23684.000000,9,0,4,0,2.0,"(54.5, 81.75]",0,0,1,0


# 5. Build Classifier

In [119]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

In [120]:
def split_data(df, X, y, test_size, random_state=None):
    '''
    Split data into training and test sets.

        df: pandas DataFrame
        X: list of strings, feature column names
        y: string, outcome variable column name
        test_size: proportion of the total data used as test dataset
        random_state: optional seed passed to train_test_split; supply an
            integer to get the same split on every run (default None keeps
            the original unseeded behaviour)

    Returns X_train, X_test, y_train, y_test.
    '''
    return train_test_split(
        df[X], df[y], test_size=test_size, random_state=random_state
    )

In [121]:
# Outcome variable; every other column of df is kept as a candidate predictor.
y = 'SeriousDlqin2yrs'
cols = list(df.columns.values)
# Note: this also keeps the interval-valued 'age_discretize' column; the
# explicit features list defined later leaves it out.
X = [x for x in cols if x != y]

In [122]:
# Hold out 20% of the rows as the test set.
X_train, X_test, y_train, y_test = split_data(df, X, y, 0.2)

In [123]:
def test_model(X_train, y_train, features, method):
    '''
    Build classifiers chosen by the user
        X_train, y_train: Pandas DataFrame
        features: list of strings, variables we care about
        method: LogisticRegression(), KNeighborsClassifier(), DecisionTreeClassifier()
    '''
    X = X_train[features]
    y = y_train
    return method.fit(X, y)

In [124]:
# List the column names now present in df.
df.columns

Index(['SeriousDlqin2yrs', 'RevolvingUtilizationOfUnsecuredLines', 'age',
       'zipcode', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
       'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
       'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
       'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
       'age_discretize', '(-0.109, 27.25]', '(27.25, 54.5]', '(54.5, 81.75]',
       '(81.75, 109]'],
      dtype='object')

In [127]:
# Predictors used for modelling: the original predictor columns plus the four
# age-bin dummies. The outcome and the raw 'age_discretize' column are left out.
# NOTE(review): the dummy names are hard-coded from the bin edges, so they must
# be updated if the binning of age changes.
features = ['RevolvingUtilizationOfUnsecuredLines', 'age',
       'zipcode', 'NumberOfTime30-59DaysPastDueNotWorse', 'DebtRatio',
       'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
       'NumberOfTimes90DaysLate', 'NumberRealEstateLoansOrLines',
       'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents',
        '(-0.109, 27.25]', '(27.25, 54.5]', '(54.5, 81.75]',
       '(81.75, 109]']

In [128]:
# Fit a decision tree with default settings on the selected features; the
# value returned by fit is shown as the cell output.
test_model(X_train, y_train, features, DecisionTreeClassifier())

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [131]:
def predict_model(X_train, y_train, X_test, y, features, method, out_path='predictions.csv'):
    '''
    Fit the chosen classifier, predict outcomes for the test data, and write
    the test rows together with their predictions to a csv file.

        X_train: pandas DataFrame of training predictors
        y_train: training outcomes aligned with X_train
        X_test: test pandas DataFrame (left unmodified)
        y: string, outcome variable name, e.g. 'SeriousDlqin2yrs'; used as the
            name of the prediction column in the output
        features: list of strings, variables we care about
        method: estimator instance, e.g. LogisticRegression(),
            KNeighborsClassifier(), DecisionTreeClassifier()
        out_path: path of the csv file to write (default 'predictions.csv')

    Returns None.
    '''
    method.fit(X_train[features], y_train)
    y_pred = method.predict(X_test[features])
    # Work on a copy: assigning the prediction column directly on X_test would
    # add an outcome-named column to the caller's test set (and can trigger
    # SettingWithCopyWarning when X_test is itself a slice).
    df_pred = X_test.copy()
    df_pred[y] = y_pred
    df_pred.to_csv(out_path, header=True)

In [133]:
# Fit a fresh decision tree and write its test-set predictions to predictions.csv.
predict_model(X_train, y_train, X_test, y, features, DecisionTreeClassifier())

# 6. Evaluate Classifier

In [134]:
from sklearn import metrics

In [141]:
def eval_model(X_train, y_train, X_test, y_test, features, method):
    '''
    Fit the chosen classifier on the training data and print its accuracy,
    recall and precision on the test data.

        X_train, X_test: pandas DataFrames of predictors
        y_train, y_test: outcomes aligned with X_train / X_test
        features: list of strings, columns used for fitting and predicting
        method: estimator instance exposing fit and predict

    Returns None; the three scores are only printed.
    '''
    method.fit(X_train[features], y_train)
    predicted = method.predict(X_test[features])
    # Compute every score before printing anything.
    scores = (
        metrics.accuracy_score(y_test, predicted),
        metrics.recall_score(y_test, predicted),
        metrics.precision_score(y_test, predicted),
    )
    templates = (
        'Accuracy score is: {}',
        'Recall score is: {}',
        'Precision score is: {}',
    )
    for template, score in zip(templates, scores):
        print(template.format(score))

In [142]:
# Evaluate a default logistic regression on the held-out test set.
# NOTE(review): the recorded recall is below 1% while accuracy is about 93%.
# With roughly 6.7% positives (mean of SeriousDlqin2yrs in the describe()
# output) this looks like the model predicting the majority class almost
# everywhere, so accuracy alone is misleading here -- confirm.
eval_model(X_train, y_train, X_test, y_test, features, LogisticRegression())

Accuracy score is: 0.9317
Recall score is: 0.0078125
Precision score is: 0.48484848484848486
