Using logistic regression on all variables availble:
* we delete sevaral column by the following criterias:
    * over 10% of data are nan or na
    * column with string value
    * meaningless column: analyticsmatchkey
* then delete the row with nan value in any column
* split data into training data and testing data
* run logistic regression using all variebles left and training data
* then test the result on test data

Result:
* we can see the accuracy for training data is 72.7% and drop to 66.1% in testing data which is reasonable
* we can imporve the result by choosing important features using PCA in the future
* if we select the most 50 features by f value, we get 
        ['LINKT001', 'LINKT004', 'AADM10', 'AADM12', 'AADM11', 'AADM13',
       'at103s', 'bc02s', 'bc36s', 'br02s', 'br36s', 'g051s', 'g212s', 'g215a',
       'g215b', 'g224b', 'g224c', 'g228s', 'g230s', 'g250b', 'g250c', 'g251b',
       'g251c', 'g305s', 'g960s', 're02s', 're03s', 're12s', 're24s', 's062s',
       's068b', 's071a', 's071b', 's073a', 's073b', 'trv07', 'trv08', 'trv10',
       'rvlr07', 'rvlr08', 'rvlr09', 'paymnt10', 'paymnt11', 'cv13', 'cv14',
       'CV_Auto', 'CV_NA', 'CV_BK', 'Vtg3', 'MSCORE']
* the accuracy becomes 69.2% after features selection

In [3]:
# read clean data
import pandas as pd
import numpy as np
data = pd.read_csv("clean_data.csv")

# seperate data into two buckets depends on states of performance
good = ['1-30 DPD', 'Current', 'Paid', 'Matured']
bad = ['Balance Owed', 'Assigned for Repossession', 'Recovered', '90+ DPD', 'Bankruptcy']
for item in good:
    data.loc[data.loc[:, 'PERFORMANCE'] == item, 'default'] = 0
for item in bad:
    data.loc[data.loc[:, 'PERFORMANCE'] == item, 'default'] = 1  
data = data.loc[((data.loc[:, 'default'] == 0) | (data.loc[:, 'default'] == 1)), :]
# generate output
data.to_csv("cleaned_data.csv", index=False)

# delete column if over 90% are the same or nan
row_num = data.shape[0]
for col_name in data.columns.tolist():
    non_nan = data.loc[:, col_name].count()
    if non_nan < row_num * 0.9:
        data = data.drop(col_name, 1)
# delete non meaningful column( for example: "analyticsmatchkey")
data = data.drop('analyticsmatchkey', 1)

print("Total size after cleaning ", data.shape)
print("good size = ", data.loc[data.loc[:, 'default'] == 0, :].shape)
print("bad size = ", data.loc[data.loc[:, 'default'] == 1, :].shape)

  interactivity=interactivity, compiler=compiler, result=result)


Total size after cleaning  (4972, 1070)
good size =  (3167, 1070)
bad size =  (1805, 1070)


In [4]:
# delete all column with string data type
for col_name in data.columns.tolist():
    if data.loc[:, col_name].dtype == 'object':
        data = data.drop(col_name, 1)
# clean all row with nan value
data.dropna(inplace=True)

In [5]:
# seperate for training data and testing data
train_data = data.sample(frac=0.7)
train_data.sort_index(inplace=True)
test_data = data[~data.index.isin(train_data.index)]
print("Number of training data = ", len(train_data))
print("Nuber of testing data = ", len(test_data))

Number of training data =  3174
Nuber of testing data =  1361


In [6]:
import matplotlib.pyplot as plt
from sklearn import linear_model, datasets
logreg = linear_model.LogisticRegression(C=1e5)
X = train_data.iloc[:, :-1]
Y = train_data.iloc[:, -1]
logreg.fit(X, Y)
# check the accuracy on the training set
print("The accuracy rate on training data is ", logreg.score(X, Y))
# test on test data
print("The accurary rate on test data is ", logreg.score(test_data.iloc[:, :-1], test_data.iloc[:, -1]))

The accuracy rate on training data is  0.77063642092
The accurary rate on test data is  0.649522409993


In [7]:
# examine the coefficients
logreg.coef_

array([[  1.16752943e-03,  -1.02174445e-04,   2.60673020e-04, ...,
         -1.80872196e-05,   3.71460324e-05,  -4.79075441e-06]])

In [8]:
# select the most important 20 factors
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
X_new = SelectKBest(f_classif, k=50)
X_new.fit_transform(X, Y)

Y = train_data.iloc[:, -1]
train_data = train_data.loc[:, X_new.get_support()]

In [9]:
train_data.columns

Index(['LINKT001', 'LINKT004', 'AADM10', 'AADM12', 'AADM11', 'AADM13',
       'at103s', 'bc02s', 'bc03s', 'bc36s', 'br02s', 'br36s', 'g051s', 'g212s',
       'g215a', 'g215b', 'g224c', 'g230s', 'g250c', 'g251c', 'g305s', 'g960s',
       're02s', 're03s', 're36s', 's061s', 's068a', 's068b', 's071a', 's071b',
       's073a', 's073b', 'trv01', 'trv07', 'trv08', 'trv10', 'rvlr07',
       'rvlr08', 'rvlr09', 'paymnt10', 'cv13', 'walshrs2', 'bcpmtnum',
       'CV_Auto', 'CV_NA', 'CV_BK', 'Vtg3', 'FICO08', 'FICO08_Auto', 'MSCORE'],
      dtype='object')

In [10]:
logreg3 = linear_model.LogisticRegression(C=1e5)
X = train_data
logreg3.fit(X, Y)
# check the accuracy on the training set
print("The accuracy rate on training data is ", logreg3.score(X, Y))

The accuracy rate on training data is  0.701953371141
