In [18]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.preprocessing import LabelEncoder
df_train=pd.read_csv('train_indessa.csv')
df_test=pd.read_csv('test_indessa.csv')

In [19]:
df_train.drop(['mths_since_last_delinq','mths_since_last_major_derog','mths_since_last_record','desc',
         'verification_status_joint','batch_enrolled','sub_grade','title','zip_code','addr_state',
        'emp_title'],axis=1,inplace=True)
df_test.drop(['mths_since_last_delinq','mths_since_last_major_derog','mths_since_last_record','desc',
         'verification_status_joint','batch_enrolled','sub_grade','title','zip_code','addr_state',
        'emp_title'],axis=1,inplace=True)

In [20]:
df_train['emp_length'].replace('n/a', '0', inplace=True)
df_train['emp_length'].replace(to_replace='\+ years', value='', regex=True, inplace=True)
df_train['emp_length'].replace(to_replace=' years', value='', regex=True, inplace=True)
df_train['emp_length'].replace(to_replace='< 1 year', value='0', regex=True, inplace=True)
df_train['emp_length'].replace(to_replace=' year', value='', regex=True, inplace=True)
df_train['emp_length'].fillna(df_train.emp_length.median(),inplace=True)
df_train['emp_length'] = pd.to_numeric(df_train['emp_length'], errors='coerce')

df_test['emp_length'].replace('n/a', '0', inplace=True)
df_test['emp_length'].replace(to_replace='\+ years', value='', regex=True, inplace=True)
df_test['emp_length'].replace(to_replace=' years', value='', regex=True, inplace=True)
df_test['emp_length'].replace(to_replace='< 1 year', value='0', regex=True, inplace=True)
df_test['emp_length'].replace(to_replace=' year', value='', regex=True, inplace=True)
df_test['emp_length'].fillna(df_test.emp_length.median(),inplace=True)
df_test['emp_length'] = pd.to_numeric(df_test['emp_length'], errors='coerce')

In [21]:
df_train['last_week_pay'] = df_train['last_week_pay'].str.extract('(\d+)', expand=False)
df_train['last_week_pay'].fillna(df_train['last_week_pay'].median(), inplace=True)
df_train['last_week_pay'] = pd.to_numeric(df_train['last_week_pay'], errors='coerce')

df_test['last_week_pay'] = df_test['last_week_pay'].str.extract('(\d+)', expand=False)
df_test['last_week_pay'].fillna(df_test['last_week_pay'].median(), inplace=True)
df_test['last_week_pay'] = pd.to_numeric(df_test['last_week_pay'], errors='coerce')

In [22]:
cols = ['loan_amnt', 'funded_amnt', 'int_rate',  'annual_inc', 'dti',  'open_acc', 'revol_bal', 'revol_util', 'total_acc',
        'total_rec_int','tot_coll_amt', 'tot_cur_bal', 'total_rev_hi_lim']
for col in cols:
    print('Train:Imputation with Median: %s' % (col))
    df_train[col].fillna(df_train[col].median(), inplace=True)
    
for col in cols:
    print('Test: Imputation with Median: %s' % (col))
    df_test[col].fillna(df_test[col].median(), inplace=True)

Train:Imputation with Median: loan_amnt
Train:Imputation with Median: funded_amnt
Train:Imputation with Median: int_rate
Train:Imputation with Median: annual_inc
Train:Imputation with Median: dti
Train:Imputation with Median: open_acc
Train:Imputation with Median: revol_bal
Train:Imputation with Median: revol_util
Train:Imputation with Median: total_acc
Train:Imputation with Median: total_rec_int
Train:Imputation with Median: tot_coll_amt
Train:Imputation with Median: tot_cur_bal
Train:Imputation with Median: total_rev_hi_lim
Test: Imputation with Median: loan_amnt
Test: Imputation with Median: funded_amnt
Test: Imputation with Median: int_rate
Test: Imputation with Median: annual_inc
Test: Imputation with Median: dti
Test: Imputation with Median: open_acc
Test: Imputation with Median: revol_bal
Test: Imputation with Median: revol_util
Test: Imputation with Median: total_acc
Test: Imputation with Median: total_rec_int
Test: Imputation with Median: tot_coll_amt
Test: Imputation with Med

In [23]:
cols = ['acc_now_delinq', 'collections_12_mths_ex_med',
       'delinq_2yrs','inq_last_6mths','pub_rec']
for col in cols:
    print('Train: Imputation with Zero: %s' % (col))
    df_train[col].fillna(0, inplace=True)
    
for col in cols:
    print('Test: Imputation with Zero: %s' % (col))
    df_test[col].fillna(0, inplace=True)

Train: Imputation with Zero: acc_now_delinq
Train: Imputation with Zero: collections_12_mths_ex_med
Train: Imputation with Zero: delinq_2yrs
Train: Imputation with Zero: inq_last_6mths
Train: Imputation with Zero: pub_rec
Test: Imputation with Zero: acc_now_delinq
Test: Imputation with Zero: collections_12_mths_ex_med
Test: Imputation with Zero: delinq_2yrs
Test: Imputation with Zero: inq_last_6mths
Test: Imputation with Zero: pub_rec


In [24]:
df_train['term'].replace(to_replace=' months', value='', regex=True, inplace=True)
df_train['term'] = pd.to_numeric(df_train['term'], errors='coerce')

df_test['term'].replace(to_replace=' months', value='', regex=True, inplace=True)
df_test['term'] = pd.to_numeric(df_test['term'], errors='coerce')

In [25]:
le1 = {}
le=['home_ownership','application_type','initial_list_status','pymnt_plan','purpose','initial_list_status','grade','verification_status']

for col in le:
    le1[col] = LabelEncoder()
    df_train[col] = le1[col].fit_transform(df_train[col])
    le1[col].classes_ = np.append(le1[col].classes_, 'other')
    
    print('Train: Encoded: ', col)
    
for col in le:
    le1[col] = LabelEncoder()
    df_test[col] = le1[col].fit_transform(df_test[col])
    le1[col].classes_ = np.append(le1[col].classes_, 'other')
    
    print('Test :Encoded: ', col)

Train: Encoded:  home_ownership
Train: Encoded:  application_type
Train: Encoded:  initial_list_status
Train: Encoded:  pymnt_plan
Train: Encoded:  purpose
Train: Encoded:  initial_list_status
Train: Encoded:  grade
Train: Encoded:  verification_status
Test :Encoded:  home_ownership
Test :Encoded:  application_type
Test :Encoded:  initial_list_status
Test :Encoded:  pymnt_plan
Test :Encoded:  purpose
Test :Encoded:  initial_list_status
Test :Encoded:  grade
Test :Encoded:  verification_status


In [26]:
# Interest paid so far
df_train['int_paid'] = df_train['total_rec_int'] + df_train['total_rec_late_fee']

# Total number of available/unused 'credit lines'
df_train['avl_lines'] = df_train['total_acc'] - df_train['open_acc']

df_train['sanctinoned'] = (df_train['funded_amnt'] + df_train['funded_amnt_inv']).div(2)

df_train['total_repayment_progress'] = ((df_train['last_week_pay']/(df_train['term']/12*52+1))*100) + ((df_train['recoveries']/df_train['funded_amnt_inv']) * 100)

df_train.insert(0, 'payment_completion', (df_train['last_week_pay']/(df_train['term']/12*52+1))*100)

df_train['loan_to_income'] = df_train['annual_inc']/df_train['funded_amnt_inv']



# Interest paid so far
df_test['int_paid'] = df_test['total_rec_int'] + df_test['total_rec_late_fee']

# Total number of available/unused 'credit lines'
df_test['avl_lines'] = df_test['total_acc'] - df_test['open_acc']

df_test['sanctinoned'] = (df_test['funded_amnt'] + df_test['funded_amnt_inv']).div(2)

df_test['total_repayment_progress'] = ((df_test['last_week_pay']/(df_test['term']/12*52+1))*100) + ((df_test['recoveries']/df_test['funded_amnt_inv']) * 100)

df_test.insert(0, 'payment_completion', (df_test['last_week_pay']/(df_test['term']/12*52+1))*100)

df_test['loan_to_income'] = df_test['annual_inc']/df_test['funded_amnt_inv']

In [27]:
df_train.replace([np.inf, -np.inf], 0, inplace=True)

df_test.replace([np.inf, -np.inf], 0, inplace=True)

In [28]:
df_train.fillna(0,inplace=True)

df_test.fillna(0,inplace=True)

In [29]:
feat=['payment_completion',
 'int_rate',
 'tot_cur_bal',
 'last_week_pay',
 'total_rev_hi_lim',
 'dti',
 'revol_util',
 'initial_list_status',
 'revol_bal',
 'annual_inc',
 'int_paid',
 'total_rec_int',
 'grade',
 'avl_lines',
 'total_acc',
 'open_acc',
 'recoveries',
 'emp_length',
 'funded_amnt_inv',
 'sanctinoned',
  'loan_status']

In [30]:
feat_test=['payment_completion',
 'int_rate',
 'tot_cur_bal',
 'last_week_pay',
 'total_rev_hi_lim',
 'dti',
 'revol_util',
 'initial_list_status',
 'revol_bal',
 'annual_inc',
 'int_paid',
 'total_rec_int',
 'grade',
 'avl_lines',
 'total_acc',
 'open_acc',
 'recoveries',
 'emp_length',
 'funded_amnt_inv',
 'sanctinoned']

In [31]:
from sklearn.model_selection import train_test_split
df_train=df_train[feat]
df_test=df_test[feat_test]
X=df_train.drop(['loan_status'],axis=1)
y=df_train[['loan_status']]
X_train, y_train,X_test= X,y,df_test

In [32]:
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = norm.transform(X_train)

# transform testing dataabs
X_test_norm = norm.transform(X_test)

In [34]:
import xgboost as xgb
gbm = xgb.XGBClassifier(
    #learning_rate = 0.02,
     n_estimators= 2000,
     max_depth= 4,
     min_child_weight= 2,
     #gamma=1,
     gamma=0.9,                        
     subsample=0.8,
     colsample_bytree=0.8,
     objective= 'binary:logistic',
     nthread= -1,
     scale_pos_weight=1).fit(X_train_norm, y_train)
xgb_predictions = gbm.predict(X_test_norm)
#preds = cbc_clf.predict(X_test_norm)
#fpr1, tpr1, thr1 = roc_curve(y_test['loan_status'], xgb_predictions)
#auc1 = roc_auc_score(y_test['loan_status'], xgb_predictions)
#print(auc1)

  return f(**kwargs)


In [37]:

from collections import Counterxgb_predictions

array([0, 0, 0, ..., 0, 0, 0])

In [35]:
sub=pd.read_csv('sample_submission.csv')

In [36]:
sub.head()

Unnamed: 0,member_id,loan_status
0,11937648,0.5
1,38983318,0.5
2,27999917,0.5
3,61514932,0.5
4,59622821,0.5
