In [27]:
import warnings
warnings.filterwarnings('ignore')
import os
import pandas as pd
import numpy as np
import seaborn as sns
import scipy
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline, make_union, FeatureUnion
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelBinarizer, FunctionTransformer,LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, auc
from sklearn_pandas import DataFrameMapper, gen_features
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
current_palette = sns.color_palette('colorblind')
import category_encoders as ce

In [28]:
def check_path(path):
    '''
    This function sets the current path of the working Directory
    '''
    if os.getcwd() == path:
        print("Current Directory is ",path)
        return
    elif os.path.exists(path):
        os.chdir('C:/Users/shuklas/Downloads/Participants_Data_PLD')
    else :
        os.mkdir('C:/Users/shuklas/Downloads/Participants_Data_PLD')
    print("Current Directory Changed to :-")
    return os.getcwd()

In [29]:
check_path('C:\\Users\\shuklas\\Downloads\\Participants_Data_PLD')

Current Directory is  C:\Users\shuklas\Downloads\Participants_Data_PLD


In [49]:
# Read the Data from the training data set and display the fist 5 rows
df = pd.read_csv("train.csv")
df.columns = df.columns.str.replace(' ','_')
df.columns = df.columns.map(str.lower)
df.drop(["payment_plan","accounts_delinquent","id"],inplace=True,axis=1)
df.head()

Unnamed: 0,loan_amount,funded_amount,funded_amount_investor,term,batch_enrolled,interest_rate,grade,sub_grade,employment_duration,home_ownership,...,total_received_late_fee,recoveries,collection_recovery_fee,collection_12_months_medical,application_type,last_week_pay,total_collection_amount,total_current_balance,total_revolving_credit_limit,loan_status
0,10000,32236,12329.36286,59,BAT2522922,11.135007,B,C4,MORTGAGE,176346.6267,...,0.102055,2.498291,0.793724,0,INDIVIDUAL,49,31,311301,6619,0
1,3609,11940,12191.99692,59,BAT1586599,12.237563,C,D3,RENT,39833.921,...,0.036181,2.377215,0.974821,0,INDIVIDUAL,109,53,182610,20885,0
2,28276,9311,21603.22455,59,BAT2136391,12.545884,F,D4,MORTGAGE,91506.69105,...,18.77866,4.316277,1.020075,0,INDIVIDUAL,66,34,89801,26155,0
3,11170,6954,17877.15585,59,BAT2428731,16.731201,C,C3,MORTGAGE,108286.5759,...,0.044131,0.10702,0.749971,0,INDIVIDUAL,39,40,9189,60214,0
4,16890,13226,13539.92667,59,BAT5341619,15.0083,C,D4,MORTGAGE,44234.82545,...,19.306646,1294.818751,0.368953,0,INDIVIDUAL,18,430,126029,22579,0


In [31]:
# Reduce Loan Titles, by merging them under blanket categories: based on Domain Understanding
loan_dict = {'debt consolidation':'debt_consolidation',
'credit card refinancing': 'debt_consolidation',
'home improvement':'home_loan',
'credit consolidation':'debt_consolidation',
'green loan':'green_loan',
'other':'personal',
'moving and relocation':'personal',
'credit cards':'debt_consolidation',
'medical expenses':'medical',
'refinance':'debt_consolidation',
'credit card consolidation':'debt_consolidation',
'lending club':'personal',
'debt consolidation loan':'debt_consolidation',
'major purchase':'personal',
'vacation':'personal',
'business':'business', 
'credit card payoff':'debt_consolidation',
'credit card':'debt_consolidation',
'credit card refi':'debt_consolidation',
'personal loan':'personal',
'cc refi':'debt_consolidation',
'consolidate':'debt_consolidation',
'medical':'medical',
'loan 1':'personal',
'consolidation':'debt_consolidation',
'card consolidation':'debt_consolidation',
'car financing':'car_loan',
'debt':'personal',
'home buying':'home_loan',
'freedom':'personal',
'consolidated':'debt_consolidation',
'get out of debt':'personal', 
'consolidation loan':'debt_consolidation',
'dept consolidation':'debt_consolidation',
'personal':'personal',
'cards':'debt_consolidation',
'bathroom':'home_loan', 
'refi':'debt_consolidation',
'credit card loan':'debt_consolidation',
'credit card debt':'debt_consolidation',
'house':'home_loan',
'debt consolidation 2013':'debt_consolidation',
'debt loan':'debt_consolidation',
'cc refinance':'debt_consolidation', 
'home':'home_loan',
'cc consolidation':'debt_consolidation',
'credit card refinance':'debt_consolidation',
'credit loan':'personal', 
'payoff':'personal',
'bill consolidation':'debt_consolidation', 
'credit card paydown':'debt_consolidation',
'credit card pay off':'debt_consolidation',
'get debt free':'personal',
'myloan':'personal',
'credit pay off':'debt_consolidation', 
'my loan':'personal',
'loan':'personal',
'bill payoff':'personal',
'cc-refinance':'debt_consolidation',
'debt reduction':'personal', 
'medical loan':'medical',
'wedding loan':'personal', 
'credit':'debt_consolidation', 
'pay off bills':'personal', 
'refinance loan':'debt_consolidation',
'debt payoff':'personal',
'car loan':'car_loan',
'pay off':'personal',
'pool':'home_loan',
'credit payoff':'personal',
'credit card refinance loan':'debt_consolidation',
'cc loan':'debt_consolidation',
'debt free':'personal', 
'conso':'debt_consolidation',
'home improvement loan':'home_loan',
'loan consolidation':'debt_consolidation',
'lending loan':'personal',
'relief':'medical',
'cc':'debt_consolidation',
'loan1':'personal',
'getting ahead':'personal', 
'home loan':'home_loan',
'bills':'personal'}

In [50]:
# Map the loan titles with loan dict
df.loan_title = df.loan_title.map(str.lower)
df.loan_title = df.loan_title.map(loan_dict)

In [51]:
# Display the Default Percentage for all Loan Titles
df['loan_status'].groupby(df.loan_title).value_counts(normalize = True).rename('prop').reset_index()

Unnamed: 0,loan_title,loan_status,prop
0,business,0,0.923497
1,business,1,0.076503
2,car_loan,0,0.868056
3,car_loan,1,0.131944
4,debt_consolidation,0,0.907788
5,debt_consolidation,1,0.092212
6,green_loan,0,0.911111
7,green_loan,1,0.088889
8,home_loan,0,0.903121
9,home_loan,1,0.096879


In [52]:
def lastfill_outliers(df):
    '''
    This function fills the outliers with Q3+1.5*IQR values
    '''
    df.loc[df["total_received_late_fee"]>1]['total_received_late_fee']=1
    df.loc[df["debit_to_income"]>1]['debit_to_income']=1
    df.loc[df["recoveries"]>10]['recoveries']=10
    df.loc[df["collection_recovery_fee"]>2]["collection_recovery_fee"]=2
    df.loc[df["total_collection_amount"]>80]["total_collection_amount"]=80

In [58]:
def financed(df):
    '''
    If loan amount is less than funded amount(investor), the application is underfinanced
    '''
    under_financed = 1 if df['loan_amount'] < df["funded_amount_investor"]  else 0
    return under_financed

under_financed = FunctionTransformer(lambda x: x.apply(financed, axis=1).to_numpy().reshape(-1,1))


def preprocess(df):
    '''
    Group together similar columns for PCA
    '''
    recoveries = ['recoveries', 'collection_recovery_fee','collection_12_months_medical','total_collection_amount']
    credit_limit = [ 'revolving_balance','total_current_balance', 'total_revolving_credit_limit']
    loan = ['loan_amount','funded_amount','funded_amount_investor']
    to_scale = list(set([col for col in df.columns if df[col].dtype !="object"])-set(recoveries)-set(credit_limit) -set(loan))
    to_scale.remove("public_record") 
    to_scale.remove("loan_status")
    return recoveries,credit_limit,loan,to_scale

In [54]:
lastfill_outliers(df) #Handle Outliers for training set 
recoveries,credit_limit,loan,to_scale = preprocess(df) #Generate similar colums for PCA

In [55]:
# Map all data transformations for the pipeline
# This ensures that Pre-processing can be a part of the pipeline
mapper = DataFrameMapper([ (['employment_duration'], OneHotEncoder(handle_unknown='ignore'), {'alias': 'employment_duration'}),
                          (['verification_status'], OneHotEncoder(handle_unknown='ignore'), {'alias': 'verification_status'}),
                          (['initial_list_status'], OneHotEncoder(handle_unknown='ignore'), {'alias': 'initial_list_status'}),
                          (['application_type'], OneHotEncoder(handle_unknown='ignore'), {'alias': 'application_type'}),
                          (['loan_title'], OneHotEncoder(handle_unknown='ignore'), {'alias': 'loan_title'}),
                          (['grade'], LabelEncoder(), {'alias': 'grade'}),
                          (['public_record'], LabelEncoder(), {'alias': 'public_record'}),
                          (recoveries, make_pipeline(StandardScaler(),PCA(2)), {'alias': 'recoveries'}),
                          (credit_limit, make_pipeline(StandardScaler(),PCA(1)), {'alias': 'credit_limit'}),
                          (loan, make_pipeline(StandardScaler(),PCA(1)), {'alias': 'loan'}),
                          (to_scale, StandardScaler(), {'alias': 'scaled'}),
                          (['sub_grade','batch_enrolled'], make_pipeline(OneHotEncoder(handle_unknown='ignore'), TruncatedSVD(n_components=3)), {'alias': 'svd'})
                          ])

In [59]:
all_features = FeatureUnion([('mapper', mapper),('under_financed', under_financed)])
y = df["loan_status"]
X=df.drop(["loan_status"],axis=1)
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=2020)

In [60]:
# LGB Model with hyper parameter tuning
pipe = Pipeline([
    ('featurize', all_features),
    ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100))),
    ('lgb', LGBMClassifier())])

#pipe_smote = make_pipeline

param_grid ={  
             'lgb__min_child_weight': [1e-5, 1e-3, 1e-2, 1e-1, 1, 1e1, 1e2, 1e3, 1e4],
             'lgb__max_depth': [5, 10, 15],
             'lgb__learning_rate': [0.1, 0.05]}

#need to fit pipeline and print out support metrics in order to get selected features (only needed once)
#pipe.fit(X,y)
#support = pipe.named_steps['feature_selection'].get_support()

#calling pipeline with gridsearch
kf = StratifiedKFold(n_splits=5)
lgb = GridSearchCV(pipe, param_grid,cv=kf, n_jobs=-1)
lgb.fit(X, y)


print("CV scores for best parameters (CV score=%0.5f):" % lgb.best_score_)
print('Best parameters: ', lgb.best_params_)
print('Train Accuracy :', lgb.best_estimator_.score(X,y))

predicted_lgb = lgb.best_estimator_.predict(X_test)
print('See below the classification report based on Test data')
print(classification_report(y_test, predicted_lgb))

#computing default probabilities for risk management on train data
default_proba_lgb = lgb.best_estimator_.predict_proba(X)[:, 1]
default_proba_lgb_test = lgb.best_estimator_.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y, default_proba_lgb)
print('AUC score for LightGBM on train data is',round(auc(fpr, tpr),4))

fpr_test, tpr_test, thresholds = roc_curve(y_test, default_proba_lgb_test)
print('AUC score for LightGBM on test data is',round(auc(fpr_test, tpr_test),4))

CV scores for best parameters (CV score=0.90730):
Best parameters:  {'lgb__learning_rate': 0.1, 'lgb__max_depth': 5, 'lgb__min_child_weight': 1e-05}
Train Accuracy : 0.9073559384843431
See below the classification report based on Test data
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     12256
           1       0.00      0.00      0.00      1237

    accuracy                           0.91     13493
   macro avg       0.45      0.50      0.48     13493
weighted avg       0.83      0.91      0.86     13493

AUC score for LightGBM on train data is 0.7946
AUC score for LightGBM on test data is 0.5034


In [61]:
# Logistic Regression with Hyperparameter tuning
pipe = Pipeline([
    ('featurize', all_features),
    ('feature_selection', SelectFromModel(RandomForestClassifier(n_estimators=100))),
    ('lm', LogisticRegression())])



param_grid = {
    'lm__C': np.logspace(-4, 4, 4),
    'lm__penalty':['l1','l2']
}

#need to fit pipeline and print out support metrics in order to get selected features (only needed once)
#pipe.fit(X,y)
#support = pipe.named_steps['feature_selection'].get_support()

#calling pipeline with gridsearch
log_reg = GridSearchCV(pipe, param_grid,cv=5, n_jobs=-1)
log_reg.fit(X, y)


print("CV scores for best parameters (CV score=%0.5f):" % log_reg.best_score_)
print('Best parameters: ', log_reg.best_params_)
print('Train Accuracy :', log_reg.best_estimator_.score(X,y))

predicted_lr = log_reg.best_estimator_.predict(X_test)
print('See below the classification report based on Test data')
print(classification_report(y_test, predicted_lr))

#computing default probabilities for risk management on train data
default_proba_lr = log_reg.best_estimator_.predict_proba(X)[:, 1]
default_proba_lr_test = log_reg.best_estimator_.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y, default_proba_lr)
print('AUC score for Logistic Regression on train data is',round(auc(fpr, tpr),4))

fpr_test, tpr_test, thresholds = roc_curve(y_test, default_proba_lr_test)
print('AUC score for Logistic Regression on test data is',round(auc(fpr_test, tpr_test),4))


CV scores for best parameters (CV score=0.90728):
Best parameters:  {'lm__C': 0.0001, 'lm__penalty': 'l2'}
Train Accuracy : 0.9072818232351306
See below the classification report based on Test data
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     12256
           1       0.00      0.00      0.00      1237

    accuracy                           0.91     13493
   macro avg       0.45      0.50      0.48     13493
weighted avg       0.83      0.91      0.86     13493

AUC score for Logistic Regression on train data is 0.5194
AUC score for Logistic Regression on test data is 0.5014


In [65]:
test = pd.read_csv("test.csv")

In [66]:
test.columns = test.columns.str.replace(' ','_')
test.columns = test.columns.map(str.lower)
test.drop(["payment_plan","accounts_delinquent","id","loan_status"],inplace=True,axis=1)
test.loan_title = test.loan_title.map(str.lower)
test.loan_title = test.loan_title.map(loan_dict)
lastfill_outliers(test)
test.head()

Unnamed: 0,loan_amount,funded_amount,funded_amount_investor,term,batch_enrolled,interest_rate,grade,sub_grade,employment_duration,home_ownership,...,total_received_interest,total_received_late_fee,recoveries,collection_recovery_fee,collection_12_months_medical,application_type,last_week_pay,total_collection_amount,total_current_balance,total_revolving_credit_limit
0,17120,10365,16025.08269,59,BAT2575549,12.163926,A,D1,RENT,76468.8219,...,4469.449851,0.088031,8.425776,0.731797,0,INDIVIDUAL,135,24,475442,4364
1,7133,11650,12615.7956,59,BAT2833642,6.564296,B,E3,MORTGAGE,38079.01344,...,993.90753,0.041237,6.157008,0.992918,0,INDIVIDUAL,56,1,72412,2573
2,25291,25825,11621.28083,59,BAT1761981,14.7299,A,C3,MORTGAGE,51275.93268,...,729.113379,0.021745,5.705077,0.28158,0,INDIVIDUAL,3,26,284825,19676
3,30781,9664,15375.82351,59,BAT5341619,10.523767,A,A2,RENT,68867.98965,...,715.867091,0.092398,2.469688,0.959162,0,INDIVIDUAL,21,32,40842,7226
4,8878,9419,7176.647582,58,BAT4694572,9.997013,C,B3,OWN,91556.85423,...,248.572854,0.010354,2.127835,0.402315,0,INDIVIDUAL,104,33,90825,26145


In [67]:
default_proba_lgb_test = lgb.best_estimator_.predict_proba(test)[:, 1]

In [68]:
submission = pd.DataFrame(default_proba_lgb_test, columns = ['Loan Status'])
submission.to_csv("submission-lgbfe3-rf.csv", index=False)

In [69]:
default_proba_lr_test = log_reg.best_estimator_.predict_proba(test)[:, 1]
submission = pd.DataFrame(default_proba_lr_test, columns = ['Loan Status'])
submission.to_csv("submission-lrfe3-rf.csv", index=False)

In [None]:
pipe = Pipeline([
    ('featurize', all_features),
    ('feature_selection', SelectFromModel(LogisticRegression(C=1, penalty='l2'))),
    ('MLPClassifier', MLPClassifier(random_state = 42))])

param_grid = [{
    'MLPClassifier__solver': ['lbfgs', 'sgd', 'adam'],
    'MLPClassifier__max_iter': [100,200,300,500],
    'MLPClassifier__activation' : ['relu','logistic','tanh'],
    'MLPClassifier__hidden_layer_sizes':[(2,), (4,),(2,2),(4,4),(4,2),(10,10),(2,2,2)],}]

#need to fit pipeline and print out support metrics in order to get selected features (only needed once)
pipe.fit(X,y)
support = pipe.named_steps['feature_selection'].get_support()

#calling pipeline with gridsearch
mlp = GridSearchCV(pipe, param_grid,cv=5, n_jobs=-1)
mlp.fit(X, y)



print("CV scores for best parameters (CV score=%0.5f):" % mlp.best_score_)
print('Best parameters: ',mlp.best_params_)
print('Train Accuracy :', mlp.best_estimator_.score(X,y))

predicted_mlp = mlp.best_estimator_.predict(X_test)
print('See below the classification report based on Test data')
print(classification_report(y_test, predicted_mlp))

#computing default probabilities for risk management on train data
default_proba_mlp = mlp.best_estimator_.predict_proba(X)[:, 1]
default_proba_mlp_test = mlp.best_estimator_.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y, default_proba_mlp)
print('AUC score for MLP on train data is',round(auc(fpr, tpr),4))

fpr_test, tpr_test, thresholds = roc_curve(y_test, default_proba_mlp_test)
print('AUC score for MLP on test data is',round(auc(fpr_test, tpr_test),4))

In [None]:
pipe = Pipeline([
    ('featurize', all_features),
    ('feature_selection', SelectFromModel(LogisticRegression(C=1, penalty='l2'))),
    ('xgb', XGBClassifier())])

param_grid ={ 'xgb__eta':[0.01,0.015, 0.025, 0.05, 0.1],
'xgb__gamma':[0.05,0.06,0.07,0.08,0.09,0.1,0.3,0.5,0.7,0.9,1.0],
'xgb__max_depth':[3, 5, 7, 9, 12, 15, 17, 25],
'xgb__min_child_weight':[1, 3, 5, 7],
'xgb__subsample':[0.6, 0.7, 0.8, 0.9, 1.0],
'xgb__colsample_bytree':[0.6, 0.7, 0.8, 0.9, 1.0],
'xgb__lambda':[0.01,0.02,0.03,0.04,0.05,0.06,0.07,0.08,0.09,0.1, 1.0],
'xgb__alpha':[0, 0.1, 0.5, 1.0]}

#need to fit pipeline and print out support metrics in order to get selected features (only needed once)
pipe.fit(X,y)
support = pipe.named_steps['feature_selection'].get_support()

#calling pipeline with gridsearch
xgb = GridSearchCV(pipe, param_grid,cv=5, n_jobs=-1)
xgb.fit(X, y)


print("CV scores for best parameters (CV score=%0.5f):" % xgb.best_score_)
print('Best parameters: ', xgb.best_params_)
print('Train Accuracy :', xgb.best_estimator_.score(X,y))

predicted_xgb = xgb.best_estimator_.predict(X_test)
print('See below the classification report based on Test data')
print(classification_report(y_test, predicted_xgb))

#computing default probabilities for risk management on train data
default_proba_xgb = xgb.best_estimator_.predict_proba(X)[:, 1]
default_proba_xgb_test = xgb.best_estimator_.predict_proba(X_test)[:, 1]

fpr, tpr, thresholds = roc_curve(y, default_proba_xgb)
print('AUC score for xgboost on train data is',round(auc(fpr, tpr),4))

fpr_test, tpr_test, thresholds = roc_curve(y_test, default_proba_xgb_test)
print('AUC score for xgboost on test data is',round(auc(fpr_test, tpr_test),4))



In [72]:
def datamapper(df,type="train",cols=[]):
    if type=="train":
        X_map = mapper.fit_transform(df)
        cols = mapper.transformed_names_
        cols[cols.index('scaled_0'):cols.index('scaled_0')+len(to_scale)] = to_scale     
    else:
        X_map = mapper.transform(df)
    X_df = pd.DataFrame(X_map,columns=cols)
    return X_df,cols