In [2]:
import pandas as pd
df=pd.read_csv('train_indessa.csv')

In [3]:
df.drop(['mths_since_last_delinq','mths_since_last_major_derog','mths_since_last_record','desc',
         'verification_status_joint','batch_enrolled','sub_grade','title','zip_code','addr_state',
        'emp_title'],axis=1,inplace=True)

In [4]:
df['emp_length'].replace('n/a', '0', inplace=True)
df['emp_length'].replace(to_replace='\+ years', value='', regex=True, inplace=True)
df['emp_length'].replace(to_replace=' years', value='', regex=True, inplace=True)
df['emp_length'].replace(to_replace='< 1 year', value='0', regex=True, inplace=True)
df['emp_length'].replace(to_replace=' year', value='', regex=True, inplace=True)

df['emp_length'].fillna(df.emp_length.median(),inplace=True)

df['emp_length'] = pd.to_numeric(df['emp_length'], errors='coerce')

In [5]:
df['last_week_pay'] = df['last_week_pay'].str.extract('(\d+)', expand=False)
df['last_week_pay'].fillna(df['last_week_pay'].median(), inplace=True)
df['last_week_pay'] = pd.to_numeric(df['last_week_pay'], errors='coerce')

In [6]:
cols = ['loan_amnt', 'funded_amnt', 'int_rate',  'annual_inc', 'dti',  'open_acc', 'revol_bal', 'revol_util', 'total_acc',
        'total_rec_int','tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']
for col in cols:
    print('Imputation with Median: %s' % (col))
    df[col].fillna(df[col].median(), inplace=True)

Imputation with Median: loan_amnt
Imputation with Median: funded_amnt
Imputation with Median: int_rate
Imputation with Median: annual_inc
Imputation with Median: dti
Imputation with Median: open_acc
Imputation with Median: revol_bal
Imputation with Median: revol_util
Imputation with Median: total_acc
Imputation with Median: total_rec_int
Imputation with Median: tot_coll_amt
Imputation with Median: tot_cur_bal
Imputation with Median: total_rev_hi_lim


In [7]:
cols = ['acc_now_delinq', 'collections_12_mths_ex_med',
       'delinq_2yrs','inq_last_6mths','pub_rec']
for col in cols:
    print('Imputation with Zero: %s' % (col))
    df[col].fillna(0, inplace=True)

Imputation with Zero: acc_now_delinq
Imputation with Zero: collections_12_mths_ex_med
Imputation with Zero: delinq_2yrs
Imputation with Zero: inq_last_6mths
Imputation with Zero: pub_rec


In [8]:
df['term'].replace(to_replace=' months', value='', regex=True, inplace=True)
df['term'] = pd.to_numeric(df['term'], errors='coerce')

In [9]:
import numpy as np
# Encode Label for Classifier
from sklearn.preprocessing import LabelEncoder
le1 = {}
le=['home_ownership','application_type','initial_list_status','pymnt_plan','purpose','initial_list_status','grade','verification_status']

for col in le:
    le1[col] = LabelEncoder()
    df[col] = le1[col].fit_transform(df[col])
    le1[col].classes_ = np.append(le1[col].classes_, 'other')
    
    print('Encoded: ', col)

Encoded:  home_ownership
Encoded:  application_type
Encoded:  initial_list_status
Encoded:  pymnt_plan
Encoded:  purpose
Encoded:  initial_list_status
Encoded:  grade
Encoded:  verification_status


In [10]:
# Interest paid so far
df['int_paid'] = df['total_rec_int'] + df['total_rec_late_fee']

# Total number of available/unused 'credit lines'
df['avl_lines'] = df['total_acc'] - df['open_acc']

df['sanctinoned'] = (df['funded_amnt'] + df['funded_amnt_inv']).div(2)

df['total_repayment_progress'] = ((df['last_week_pay']/(df['term']/12*52+1))*100) + ((df['recoveries']/df['funded_amnt_inv']) * 100)

df.insert(0, 'payment_completion', (df['last_week_pay']/(df['term']/12*52+1))*100)

df['loan_to_income'] = df['annual_inc']/df['funded_amnt_inv']


In [11]:
df.replace([np.inf, -np.inf], 0, inplace=True)

In [12]:
df.isnull().sum()

payment_completion              0
member_id                       0
loan_amnt                       0
funded_amnt                     0
funded_amnt_inv                 0
term                            0
int_rate                        0
grade                           0
emp_length                      0
home_ownership                  0
annual_inc                      0
verification_status             0
pymnt_plan                      0
purpose                         0
dti                             0
delinq_2yrs                     0
inq_last_6mths                  0
open_acc                        0
pub_rec                         0
revol_bal                       0
revol_util                      0
total_acc                       0
initial_list_status             0
total_rec_int                   0
total_rec_late_fee              0
recoveries                      0
collection_recovery_fee         0
collections_12_mths_ex_med      0
application_type                0
last_week_pay 

In [13]:
df.fillna(0,inplace=True)

In [14]:
df.isnull().sum()

payment_completion            0
member_id                     0
loan_amnt                     0
funded_amnt                   0
funded_amnt_inv               0
term                          0
int_rate                      0
grade                         0
emp_length                    0
home_ownership                0
annual_inc                    0
verification_status           0
pymnt_plan                    0
purpose                       0
dti                           0
delinq_2yrs                   0
inq_last_6mths                0
open_acc                      0
pub_rec                       0
revol_bal                     0
revol_util                    0
total_acc                     0
initial_list_status           0
total_rec_int                 0
total_rec_late_fee            0
recoveries                    0
collection_recovery_fee       0
collections_12_mths_ex_med    0
application_type              0
last_week_pay                 0
acc_now_delinq                0
tot_coll

In [15]:
from sklearn.model_selection import train_test_split
X=df.drop(['loan_status','member_id'],axis=1)
y=df[['loan_status']]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.20,random_state =123)

In [16]:
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = norm.transform(X_train)

# transform testing dataabs
X_test_norm = norm.transform(X_test)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score

In [18]:
rf = RandomForestClassifier(n_estimators=180,min_samples_leaf=3,max_features=0.5, verbose=5, n_jobs=-1)
rf.fit(X_train_norm, y_train)

preds = rf.predict(X_test_norm)
fpr1, tpr1, thr1 = roc_curve(y_test['loan_status'], preds)
auc1 = roc_auc_score(y_test['loan_status'], preds)

print(auc1)
rf.score(X_test_norm, y_test)

  
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.


building tree 1 of 180
building tree 2 of 180
building tree 3 of 180
building tree 4 of 180
building tree 5 of 180
building tree 6 of 180
building tree 7 of 180
building tree 8 of 180
building tree 9 of 180
building tree 10 of 180
building tree 11 of 180
building tree 12 of 180
building tree 13 of 180
building tree 14 of 180


[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   36.9s


building tree 15 of 180
building tree 16 of 180
building tree 17 of 180
building tree 18 of 180
building tree 19 of 180
building tree 20 of 180
building tree 21 of 180
building tree 22 of 180
building tree 23 of 180
building tree 24 of 180
building tree 25 of 180
building tree 26 of 180
building tree 27 of 180
building tree 28 of 180
building tree 29 of 180
building tree 30 of 180
building tree 31 of 180
building tree 32 of 180
building tree 33 of 180
building tree 34 of 180
building tree 35 of 180
building tree 36 of 180
building tree 37 of 180
building tree 38 of 180
building tree 39 of 180
building tree 40 of 180
building tree 41 of 180
building tree 42 of 180
building tree 43 of 180
building tree 44 of 180
building tree 45 of 180
building tree 46 of 180
building tree 47 of 180
building tree 48 of 180
building tree 49 of 180
building tree 50 of 180
building tree 51 of 180
building tree 52 of 180
building tree 53 of 180
building tree 54 of 180
building tree 55 of 180
building tree 56

[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:  3.4min


building tree 68 of 180
building tree 69 of 180
building tree 70 of 180
building tree 71 of 180
building tree 72 of 180
building tree 73 of 180
building tree 74 of 180
building tree 75 of 180
building tree 76 of 180
building tree 77 of 180
building tree 78 of 180
building tree 79 of 180
building tree 80 of 180
building tree 81 of 180
building tree 82 of 180
building tree 83 of 180
building tree 84 of 180
building tree 85 of 180
building tree 86 of 180
building tree 87 of 180
building tree 88 of 180
building tree 89 of 180
building tree 90 of 180
building tree 91 of 180
building tree 92 of 180
building tree 93 of 180
building tree 94 of 180
building tree 95 of 180
building tree 96 of 180
building tree 97 of 180
building tree 98 of 180
building tree 99 of 180
building tree 100 of 180
building tree 101 of 180
building tree 102 of 180
building tree 103 of 180
building tree 104 of 180
building tree 105 of 180
building tree 106 of 180
building tree 107 of 180
building tree 108 of 180
buildin

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  7.3min


building tree 158 of 180
building tree 159 of 180
building tree 160 of 180
building tree 161 of 180
building tree 162 of 180
building tree 163 of 180
building tree 164 of 180
building tree 165 of 180
building tree 166 of 180
building tree 167 of 180
building tree 168 of 180
building tree 169 of 180
building tree 170 of 180
building tree 171 of 180
building tree 172 of 180
building tree 173 of 180
building tree 174 of 180
building tree 175 of 180
building tree 176 of 180
building tree 177 of 180
building tree 178 of 180
building tree 179 of 180
building tree 180 of 180


[Parallel(n_jobs=-1)]: Done 180 out of 180 | elapsed:  8.4min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    3.5s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.


0.744598815241591


[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:    0.2s
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed:    3.0s
[Parallel(n_jobs=4)]: Done 180 out of 180 | elapsed:    3.5s finished


0.8720582987434968

In [19]:
fi = list(zip(X_train.columns.values, rf.feature_importances_))
fi = sorted(fi, key=lambda x: -x[1])
qw=pd.DataFrame(fi, columns=["Feature","Importance"])

In [20]:
feat=qw['Feature'].tolist()

In [21]:
df1=df[feat[:20]]
X=df1
y=df[['loan_status']]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.30,random_state =123)

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = norm.transform(X_train)

# transform testing dataabs
X_test_norm = norm.transform(X_test)

In [22]:
from sklearn import svm
svc_clf = svm.SVC(random_state = 7)
svc_clf.fit(X_train_norm, y_train)
svc_score = svc_clf.score(X_test_norm, y_test)
svc_score

  return f(**kwargs)


0.7966868884172568

In [23]:
preds

array([0, 0, 0, ..., 1, 0, 0])

In [22]:
X_train.shape

(372699, 20)

In [24]:
import tensorflow as tf
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(20,)),
    keras.layers.Dense(100, activation=tf.nn.relu),
	keras.layers.Dense(100, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train_norm, y_train, epochs=50, batch_size=64)

test_loss, test_acc = model.evaluate(X_test_norm, y_test)
print('Test accuracy:', test_acc)

Train on 372699 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 0.86394453


In [25]:
feat_40_train=['payment_completion',
 'int_rate',
 'tot_cur_bal',
 'last_week_pay',
 'total_rev_hi_lim',
 'dti',
 'revol_util',
 'initial_list_status',
 'revol_bal',
 'annual_inc',
 'int_paid',
 'total_rec_int',
 'grade',
 'avl_lines',
 'total_acc',
 'open_acc',
 'recoveries',
 'emp_length',
 'funded_amnt_inv',
 'sanctinoned',
 'loan_amnt',
 'funded_amnt',
 'verification_status',
 'inq_last_6mths',
 'term',
 'purpose',
 'collection_recovery_fee',
 'tot_coll_amt',
 'home_ownership',
 'delinq_2yrs',
 'pub_rec',
 'total_rec_late_fee',
 'collections_12_mths_ex_med',
 'loan_to_income',
 'acc_now_delinq',
 'pymnt_plan',
 'total_repayment_progress',
 'application_type',
'loan_status']

In [26]:
feat_40_test=['payment_completion',
 'int_rate',
 'tot_cur_bal',
 'last_week_pay',
 'total_rev_hi_lim',
 'dti',
 'revol_util',
 'initial_list_status',
 'revol_bal',
 'annual_inc',
 'int_paid',
 'total_rec_int',
 'grade',
 'avl_lines',
 'total_acc',
 'open_acc',
 'recoveries',
 'emp_length',
 'funded_amnt_inv',
 'sanctinoned',
 'loan_amnt',
 'funded_amnt',
 'verification_status',
 'inq_last_6mths',
 'term',
 'purpose',
 'collection_recovery_fee',
 'tot_coll_amt',
 'home_ownership',
 'delinq_2yrs',
 'pub_rec',
 'total_rec_late_fee',
 'collections_12_mths_ex_med',
 'loan_to_income',
 'acc_now_delinq',
 'pymnt_plan',
 'total_repayment_progress',
 'application_type']

In [27]:
df1=df[feat_40_train]
X=df1.drop(['loan_status'],axis=1)
y=df[['loan_status']]
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=.30,random_state =123)

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = norm.transform(X_train)

# transform testing dataabs
X_test_norm = norm.transform(X_test)

In [29]:
import tensorflow as tf
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(38,)),
    keras.layers.Dense(100, activation=tf.nn.relu),
	keras.layers.Dense(100, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

model.fit(X_train_norm, y_train, epochs=50, batch_size=64)

test_loss, test_acc = model.evaluate(X_test_norm, y_test)
print('Test accuracy:', test_acc)

Train on 372699 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Test accuracy: 0.85355824


In [None]:
import tensorflow as tf
from tensorflow import keras

model = keras.Sequential([
    keras.layers.Flatten(input_shape=(38,)),
    keras.layers.Dense(100, activation=tf.nn.relu),
	keras.layers.Dense(100, activation=tf.nn.relu),
    keras.layers.Dense(1, activation=tf.nn.sigmoid),
])

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['AUC'])

model.fit(X_train_norm, y_train, epochs=50, batch_size=64)

test_loss, test_acc = model.evaluate(X_test_norm, y_test)
print('Test accuracy:', test_acc)

Train on 372699 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

In [None]:
from sklearn.metrics import roc_auc_score
y_pred = model.predict_proba(X_test_norm)
roc_auc_score(y_test, y_pred)