In [1]:
# Load the packages needed for this part
# create spark and sparkcontext objects
from pyspark.sql import SparkSession
import numpy as np

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext

import pyspark
from pyspark.ml import feature, regression, Pipeline, classification, pipeline, evaluation
from pyspark.ml.classification import LogisticRegression
from pyspark.sql import functions as fn, Row
from pyspark.sql.functions import when, regexp_extract, col
from pyspark import sql
from pyspark.sql.functions import *

from pyspark.ml.classification import DecisionTreeClassifier
from pyspark.ml.feature import VectorIndexer

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import sys

In [2]:
pd.set_option('display.max_columns', 500)
loan_df = spark.read.csv('/FileStore/tables/loan_default.csv', header=True, inferSchema=True)

In [3]:
#Creating a copy of the original dataframe
loan_copy_df = loan_df
loan_copy_df = loan_copy_df
loan_copy_df.toPandas().head()

Unnamed: 0,id,year,issue_d,final_d,emp_length_int,home_ownership,home_ownership_cat,income_category,annual_inc,income_cat,loan_amount,term,term_cat,application_type,application_type_cat,purpose,purpose_cat,interest_payments,interest_payment_cat,loan_condition,loan_condition_cat,interest_rate,grade,grade_cat,dti,total_pymnt,total_rec_prncp,recoveries,installment,region
0,1077501,2011,01/12/2011,1012015,10.0,RENT,1,Low,24000,1,5000,36 months,1,INDIVIDUAL,1,credit_card,1,Low,1,Good Loan,0,10.65,B,2,27.65,5861.071414,5000.0,0.0,162.87,munster
1,1077430,2011,01/12/2011,1042013,0.5,RENT,1,Low,30000,1,2500,60 months,2,INDIVIDUAL,1,car,2,High,2,Bad Loan,1,15.27,C,3,1.0,1008.71,456.46,117.08,59.83,leinster
2,1077175,2011,01/12/2011,1062014,10.0,RENT,1,Low,12252,1,2400,36 months,1,INDIVIDUAL,1,small_business,3,High,2,Good Loan,0,15.96,C,3,8.72,3003.653644,2400.0,0.0,84.33,cannught
3,1076863,2011,01/12/2011,1012015,10.0,RENT,1,Low,49200,1,10000,36 months,1,INDIVIDUAL,1,other,4,High,2,Good Loan,0,13.49,C,3,20.0,12226.30221,10000.0,0.0,339.31,ulster
4,1075358,2011,01/12/2011,1012016,1.0,RENT,1,Low,80000,1,3000,60 months,2,INDIVIDUAL,1,other,4,Low,1,Good Loan,0,12.69,B,2,17.94,3242.17,2233.1,0.0,67.79,ulster


#Exploratory Data Analysis

In [5]:
#Checking for null values
loan_copy_df.select([count(when(isnan(c), c)).alias(c) for c in loan_copy_df.columns]).toPandas().head()

Unnamed: 0,id,year,issue_d,final_d,emp_length_int,home_ownership,home_ownership_cat,income_category,annual_inc,income_cat,loan_amount,term,term_cat,application_type,application_type_cat,purpose,purpose_cat,interest_payments,interest_payment_cat,loan_condition,loan_condition_cat,interest_rate,grade,grade_cat,dti,total_pymnt,total_rec_prncp,recoveries,installment,region
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [6]:
loan_copy_df.groupBy('year').count().show()
loan_copy_df.groupBy('home_ownership', 'home_ownership_cat').count().sort('home_ownership_cat').show()
loan_copy_df.groupBy('income_category', 'income_cat').count().sort('income_cat').show()
loan_copy_df.groupBy('interest_payments', 'interest_payment_cat').count().sort('interest_payment_cat').show()
loan_copy_df.groupBy('term', 'term_cat').count().sort('term_cat').show()
loan_copy_df.groupBy('application_type', 'application_type_cat').count().sort('application_type_cat').show()
loan_copy_df.groupBy('purpose', 'purpose_cat').count().sort('purpose_cat').show()
loan_copy_df.groupBy('grade', 'grade_cat').count().sort('grade_cat').show()

In [7]:
#List of Column names present in the dataframe and their types
loan_copy_df.dtypes

In [8]:
#Descriptive Statistics applied on our Loan Dataset
loan_copy_df.toPandas().describe()

Unnamed: 0,id,year,final_d,emp_length_int,home_ownership_cat,annual_inc,income_cat,loan_amount,term_cat,application_type_cat,purpose_cat,interest_payment_cat,loan_condition_cat,interest_rate,grade_cat,dti,total_pymnt,total_rec_prncp,recoveries,installment
count,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0,887379.0
mean,32465130.0,2014.021761,1047089.0,6.050564,2.09913,75027.59,1.196702,14755.264605,1.300045,1.000576,4.874621,1.475629,0.075987,13.24674,2.798403,18.157039,7558.826684,5757.706424,45.919243,436.717127
std,22827340.0,1.261741,45551.49,3.507405,0.944839,64698.15,0.442542,8435.455601,0.458278,0.02399,2.381156,0.499406,0.264977,4.381867,1.312599,17.190626,7871.243336,6625.441046,409.693874,244.186593
min,54734.0,2007.0,1012008.0,0.5,1.0,0.0,1.0,500.0,1.0,1.0,1.0,1.0,0.0,5.32,1.0,0.0,0.0,0.0,0.0,15.67
25%,9206643.0,2013.0,1012016.0,3.0,1.0,45000.0,1.0,8000.0,1.0,1.0,3.0,1.0,0.0,9.99,2.0,11.91,1914.59,1200.57,0.0,260.705
50%,34433270.0,2014.0,1012016.0,6.05,3.0,65000.0,1.0,13000.0,1.0,1.0,6.0,1.0,0.0,12.99,3.0,17.65,4894.999117,3215.32,0.0,382.55
75%,54908140.0,2015.0,1092015.0,10.0,3.0,90000.0,1.0,20000.0,2.0,1.0,6.0,2.0,0.0,16.2,4.0,23.95,10616.81423,8000.0,0.0,572.6
max,68617060.0,2015.0,1122015.0,10.0,6.0,9500000.0,3.0,35000.0,2.0,2.0,14.0,2.0,1.0,28.99,7.0,9999.0,57777.57987,35000.03,33520.27,1445.46


In [9]:
loan_copy_df.toPandas().head()

Unnamed: 0,id,year,issue_d,final_d,emp_length_int,home_ownership,home_ownership_cat,income_category,annual_inc,income_cat,loan_amount,term,term_cat,application_type,application_type_cat,purpose,purpose_cat,interest_payments,interest_payment_cat,loan_condition,loan_condition_cat,interest_rate,grade,grade_cat,dti,total_pymnt,total_rec_prncp,recoveries,installment,region
0,1077501,2011,01/12/2011,1012015,10.0,RENT,1,Low,24000,1,5000,36 months,1,INDIVIDUAL,1,credit_card,1,Low,1,Good Loan,0,10.65,B,2,27.65,5861.071414,5000.0,0.0,162.87,munster
1,1077430,2011,01/12/2011,1042013,0.5,RENT,1,Low,30000,1,2500,60 months,2,INDIVIDUAL,1,car,2,High,2,Bad Loan,1,15.27,C,3,1.0,1008.71,456.46,117.08,59.83,leinster
2,1077175,2011,01/12/2011,1062014,10.0,RENT,1,Low,12252,1,2400,36 months,1,INDIVIDUAL,1,small_business,3,High,2,Good Loan,0,15.96,C,3,8.72,3003.653644,2400.0,0.0,84.33,cannught
3,1076863,2011,01/12/2011,1012015,10.0,RENT,1,Low,49200,1,10000,36 months,1,INDIVIDUAL,1,other,4,High,2,Good Loan,0,13.49,C,3,20.0,12226.30221,10000.0,0.0,339.31,ulster
4,1075358,2011,01/12/2011,1012016,1.0,RENT,1,Low,80000,1,3000,60 months,2,INDIVIDUAL,1,other,4,Low,1,Good Loan,0,12.69,B,2,17.94,3242.17,2233.1,0.0,67.79,ulster


In [10]:
#Renaming Target variable i.e. Loan_Condition_Cat to Default
loan_copy_df = loan_copy_df.withColumnRenamed('loan_condition_cat', 'default')

In [11]:
#Correlation Matrix
plt.figure(figsize=(18, 12))
corr = loan_copy_df.toPandas().corr()
ax = sns.heatmap(corr, cmap="YlGnBu", annot=True)
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);
display(ax.figure)

In [12]:
plt.figure(figsize=(12,6))
ax_yearCat = sns.countplot(x='year',data=loan_copy_df.toPandas())
ax_yearCat.set_title('Loan applicants per Year')
ax_yearCat.set_ylabel('Count')
ax_yearCat.set_xlabel('Year')
display(ax_yearCat.figure)

In [13]:
#Visualizing the categorical variables
#Counting number of defaulters in income categories
plt.figure(figsize=(12,6))
ax_incomeCat = sns.countplot(x='income_category',data=loan_copy_df.toPandas(), hue='default')

ax_incomeCat.set_title('Loan defaults based on income')
ax_incomeCat.set_ylabel('Count')
ax_incomeCat.set_xlabel('Income Category')
display(ax_incomeCat.figure)
#A comparatively larger number of low income people have defaulted their loans

In [14]:
#counting the number of defaulters across different purpose categories
plt.figure(figsize=(20,7))
ax_purposeCat = sns.countplot(x='purpose', data=loan_copy_df.toPandas(), hue='default')
ax_purposeCat.set_xticklabels(ax_purposeCat.get_xticklabels(), rotation=20)
ax_purposeCat.set_title('Purpose for applying Loan')
ax_purposeCat.set_ylabel('Count')
ax_purposeCat.set_xlabel('Purpose')

display(ax_purposeCat.figure)

#A comparatively larger number of purpose category 6 which is debt consolidation people have defaulted their loans

In [15]:
#counting the number of defaulters across different term categories
plt.figure(figsize=(7,4))
ax_termCat = sns.countplot(x='term', data=loan_copy_df.toPandas(), hue='default')
ax_termCat.set_title('Loan Duration')
ax_termCat.set_ylabel('Count')
ax_termCat.set_xlabel('Terms')
display(ax_termCat.figure)

#A comparatively larger number of 36 month term people have defaulted their loans

In [16]:
%sh
#pip install --upgrade pip
python -mpip install seaborn==0.9.0

In [17]:
%sh
pip show seaborn

In [18]:
#Relationship between fairly correlated features from the correlation chart
#loan amount and installments they are 94% correlated
loan_inst = loan_copy_df.toPandas().plot.scatter(x = 'loan_amount', y = 'installment', color = 'DarkBlue')
loan_inst.title.set_text('Loan Amount Vs Installment')

display(loan_inst.figure)

In [19]:
loan_purpose = loan_copy_df.toPandas().plot.scatter(x = 'grade_cat', y = 'interest_rate', color = 'Green')
loan_purpose.title.set_text('Installment Rate Vs Grade')
display(loan_purpose.figure)

In [20]:
loan_grade = sns.regplot(x='grade_cat', y='interest_rate', data=loan_copy_df.toPandas())
display(loan_grade.figure)

In [21]:
#detecting outliers with annual income and defaulters
plt.figure(figsize=(5,5))
sns.set_style("whitegrid")
annual_inc_box = sns.boxplot(x='default', y='annual_inc', data=loan_copy_df.toPandas())
annual_inc_box.set_title('Default based on Annual Income ')

annual_inc_box.set(ylim=(0, 500000))
display(annual_inc_box.figure)

In [22]:
#detecting outliers with defaulters and installment
plt.figure(figsize=(5,5))
sns.set_style("whitegrid")
loan_box = sns.boxplot(x='default', y='installment', data=loan_copy_df.toPandas())
loan_box.set_title('Default based on Monthly Installment ')
#loan_box.set(ylim=(0, 100000))
display(loan_box.figure)

In [23]:
#detecting outliers with purpose and installment
plt.figure(figsize=(6,5))
sns.set_style("whitegrid")
purpose_box = sns.boxplot(x='purpose', y='installment', data=loan_copy_df.toPandas())
purpose_box.set_xticklabels(ax_purposeCat.get_xticklabels(), rotation=35)
purpose_box.set_title('Purpose for applying Loan')
purpose_box.set_ylabel('Installments')
purpose_box.set_xlabel('Purpose')

display(purpose_box.figure)

In [24]:
#Filtering out bad loans to analyze
bad_loans = loan_copy_df.toPandas().loc[loan_copy_df.toPandas().default == 1]

In [25]:
#Filtering out good loans to analyze
good_loans = loan_copy_df.toPandas().loc[loan_copy_df.toPandas().default == 0]

In [26]:
bad_loans.head()

Unnamed: 0,id,year,issue_d,final_d,emp_length_int,home_ownership,home_ownership_cat,income_category,annual_inc,income_cat,loan_amount,term,term_cat,application_type,application_type_cat,purpose,purpose_cat,interest_payments,interest_payment_cat,loan_condition,default,interest_rate,grade,grade_cat,dti,total_pymnt,total_rec_prncp,recoveries,installment,region
1,1077430,2011,01/12/2011,1042013,0.5,RENT,1,Low,30000,1,2500,60 months,2,INDIVIDUAL,1,car,2,High,2,Bad Loan,1,15.27,C,3,1.0,1008.71,456.46,117.08,59.83,leinster
8,1071795,2011,01/12/2011,1042012,4.0,OWN,2,Low,40000,1,5600,60 months,2,INDIVIDUAL,1,small_business,3,High,2,Bad Loan,1,21.28,F,6,5.55,646.02,162.02,189.06,152.39,ulster
9,1071570,2011,01/12/2011,1112012,0.5,RENT,1,Low,15000,1,5375,60 months,2,INDIVIDUAL,1,other,4,Low,1,Bad Loan,1,12.69,B,2,18.08,1476.19,673.48,269.29,121.45,munster
12,1064687,2011,01/12/2011,1072012,0.5,RENT,1,Low,30000,1,9000,36 months,1,INDIVIDUAL,1,debt_consolidation,6,High,2,Bad Loan,1,13.49,C,3,10.08,2270.7,1256.14,444.3,305.38,leinster
14,1069057,2011,01/12/2011,1102013,3.0,RENT,1,Low,100000,1,10000,36 months,1,INDIVIDUAL,1,other,4,Low,1,Bad Loan,1,10.65,B,2,7.06,7471.99,5433.47,645.1,325.74,ulster


In [27]:
#detecting outliers with home purpose and annual_income in bad_loans
plt.figure(figsize=(7,7))
sns.set_style("whitegrid")
home_ann_box = sns.boxplot(x='purpose', y='loan_amount', data=bad_loans)
home_ann_box.set(ylim=(0, 50000))
display(home_ann_box.figure)

In [28]:
#detecting outliers with home purpose and annual_income in good_loans
plt.figure(figsize=(10,7))
sns.set_style("whitegrid")
home_good_box = sns.boxplot(x='purpose', y='annual_inc', data=good_loans)
home_good_box.set(ylim=(0, 500000))
display(home_good_box.figure)

In [29]:
data = bad_loans['home_ownership'].value_counts()
Bad_loans_home_sum = pd.DataFrame(data)
Bad_loans_home_sum.reset_index(level = 0, inplace=True)

In [30]:
# Create a list of colors (from iWantHue)
colors = ["#E13F29", "#D69A80", "#D63B59", "#AE5552", "#CB5C3B", "#EB8076", "#96624E", "#96624F", "#96524E"]

# Create a pie chart
fig = plt.figure(figsize=(4,4))
plt.pie(
    # using data total)arrests
    Bad_loans_home_sum['home_ownership'],
    # with the labels being officer names
    labels=Bad_loans_home_sum['index'],
    # with no shadows
    shadow=True,
    # with colors
    colors=colors,
    # with one slide exploded out
    explode=(0, 0, 0.15, 0, 0),
    # with the start angle at 90%
    startangle=90,
    # with the percent listed as a fraction
    autopct='%1.1f%%',
    )

# View the plot drop above
plt.axis('equal')

# View the plot
plt.tight_layout()
plt.title('Types of bad loan home owners')
display(fig)

In [31]:
data = good_loans['home_ownership'].value_counts()
good_loans_home_sum = pd.DataFrame(data)
good_loans_home_sum.reset_index(level = 0, inplace=True)
good_loans_home_sum

Unnamed: 0,index,home_ownership
0,MORTGAGE,413858
1,RENT,324726
2,OWN,81177
3,OTHER,144
4,NONE,42
5,ANY,3


In [32]:
# Create a list of colors (from iWantHue)
colors = ["#E13F29", "#D69A80", "#D63B59", "#AE5552", "#CB5C3B", "#EB8076", "#96624E", "#96624F", "#96524E"]

# Create a pie chart
fig_good = plt.figure(figsize=(4,4))
plt.pie(
    # using data total)arrests
    good_loans_home_sum['home_ownership'],
    # with the labels being officer names
    labels=good_loans_home_sum['index'],
    # with no shadows
    shadow=True,
    # with colors
    colors=colors,
    # with one slide exploded out
    explode=(0, 0, 0.15, 0, 0, 0),
    # with the start angle at 90%
    startangle=90,
    # with the percent listed as a fraction
    autopct='%1.1f%%',
    )

# View the plot drop above
plt.axis('equal')

# View the plot
plt.tight_layout()
plt.title('Types of good loan home owners')
display(fig_good)

In [33]:
plt.figure(figsize = (6,6))
sns.countplot(x="grade",data=loan_copy_df.toPandas(), palette= "YlGnBu")
plt.xticks(rotation=10)
plt.title("Grade", fontsize=20)
plt.xlabel("Grade", fontsize=10)
plt.ylabel("Number of Loans", fontsize=20)
display(plt.draw())

In [34]:
#Graph3
fig, ax = plt.subplots(figsize=(7,5))
loan_copy_df.toPandas().groupby(['issue_d']).count()['loan_condition'].plot(ax=ax)
display(fig)

In [35]:
f, ax = plt.subplots(figsize=(12,8))

colors = ["#3791D7", "#D72626"]
labels ="Good Loan", "Bad Loan"

plt.suptitle('Loan Condition', fontsize=20)
plt.axis('off')

loan_copy_df.toPandas().loan_condition.value_counts().plot.pie(explode=[0,0.25], autopct='%1.2f%%', shadow=True, colors=colors, 
                                             labels=labels, fontsize=12, startangle=70)
display(f)

In [36]:
fig, ((ax1), (ax2))= plt.subplots(nrows=1, ncols=2, figsize=(14,6))

sns.violinplot(x="grade", y="loan_amount", data=loan_copy_df.toPandas(), palette="Set2", ax=ax1)
sns.boxplot(x="grade", y="total_pymnt", data=loan_copy_df.toPandas(), palette="Set2", ax=ax2)
display(fig)

In [37]:
dummy_grade = pd.get_dummies(loan_copy_df.toPandas().grade)

#Feature Engineering

In [39]:
dummy = loan_copy_df.toPandas()
dummy_loanCondition = pd.get_dummies(dummy['loan_condition'])
dummy_loanCondition = pd.concat([dummy, dummy_loanCondition], axis = 1)

dummy_gradeCat = dummy_loanCondition
dummy_gradeCat = pd.get_dummies(dummy_gradeCat['grade'])
dummy_gradeCat = pd.concat([dummy_loanCondition, dummy_gradeCat], axis = 1)

dummy_homeOwnCat = dummy_gradeCat
dummy_homeOwnCat = pd.get_dummies(dummy_homeOwnCat['home_ownership'])
dummy_homeOwnCat = pd.concat([dummy_gradeCat, dummy_homeOwnCat], axis = 1)

dummy_incomeCat = dummy_homeOwnCat
dummy_incomeCat = pd.get_dummies(dummy_incomeCat['income_category'])
dummy_incomeCat = pd.concat([dummy_homeOwnCat, dummy_incomeCat], axis = 1)

dummy_purposeCat = dummy_incomeCat
dummy_purposeCat = pd.get_dummies(dummy_purposeCat['purpose'])
dummy_purposeCat = pd.concat([dummy_incomeCat, dummy_purposeCat], axis = 1)

dummy_termCat = dummy_purposeCat
dummy_termCat = pd.get_dummies(dummy_termCat['term_cat'])
dummy_termCat = pd.concat([dummy_purposeCat, dummy_termCat], axis = 1)

loan_dummy_df = dummy_termCat

#loan_dummy_df = loan_dummy_df.drop(['Good Loan', 'Bad Loan', 'ANY', '3', 'other', '60 months'], axis=1)

In [40]:
loan_dummy_df_final = loan_dummy_df.drop(['G', 'Low', 'Good Loan', 'Bad Loan', 'ANY', 'other'], axis = 1)
loan_dummy_df_final1 = loan_dummy_df_final.drop(2, axis = 1)

In [41]:
#Rounding the interest rate
loan_dummy_df_final1['interest_rate'] = loan_dummy_df_final1['interest_rate'].round(0)

In [42]:
loan_linear_reg = spark.createDataFrame(loan_dummy_df_final1)

In [43]:
loan_dummy_df_final1.head()

Unnamed: 0,id,year,issue_d,final_d,emp_length_int,home_ownership,home_ownership_cat,income_category,annual_inc,income_cat,loan_amount,term,term_cat,application_type,application_type_cat,purpose,purpose_cat,interest_payments,interest_payment_cat,loan_condition,default,interest_rate,grade,grade_cat,dti,total_pymnt,total_rec_prncp,recoveries,installment,region,A,B,C,D,E,F,MORTGAGE,NONE,OTHER,OWN,RENT,High,Medium,car,credit_card,debt_consolidation,educational,home_improvement,house,major_purchase,medical,moving,renewable_energy,small_business,vacation,wedding,1
0,1077501,2011,01/12/2011,1012015,10.0,RENT,1,Low,24000,1,5000,36 months,1,INDIVIDUAL,1,credit_card,1,Low,1,Good Loan,0,11.0,B,2,27.65,5861.071414,5000.0,0.0,162.87,munster,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
1,1077430,2011,01/12/2011,1042013,0.5,RENT,1,Low,30000,1,2500,60 months,2,INDIVIDUAL,1,car,2,High,2,Bad Loan,1,15.0,C,3,1.0,1008.71,456.46,117.08,59.83,leinster,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1077175,2011,01/12/2011,1062014,10.0,RENT,1,Low,12252,1,2400,36 months,1,INDIVIDUAL,1,small_business,3,High,2,Good Loan,0,16.0,C,3,8.72,3003.653644,2400.0,0.0,84.33,cannught,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
3,1076863,2011,01/12/2011,1012015,10.0,RENT,1,Low,49200,1,10000,36 months,1,INDIVIDUAL,1,other,4,High,2,Good Loan,0,13.0,C,3,20.0,12226.30221,10000.0,0.0,339.31,ulster,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1075358,2011,01/12/2011,1012016,1.0,RENT,1,Low,80000,1,3000,60 months,2,INDIVIDUAL,1,other,4,Low,1,Good Loan,0,13.0,B,2,17.94,3242.17,2233.1,0.0,67.79,ulster,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [44]:
columns_list = ['loan_amount','default','dti','total_pymnt','total_rec_prncp','recoveries','installment','interest_rate','A','B','C','D','E','F','MORTGAGE','NONE','OTHER','OWN','RENT','High','Medium','car','credit_card','debt_consolidation','educational','home_improvement','house','major_purchase','medical','moving','renewable_energy','small_business','vacation','wedding','1']

for col in columns_list:
  loan_linear_reg = loan_linear_reg.withColumn(col, loan_linear_reg[col].cast('Integer'))

#Model Creation

In [46]:
#Splitting the dataset 
training_df, validation_df, testing_df = loan_linear_reg.randomSplit([0.6, 0.3, 0.1], seed=100)

In [47]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator

##Building a classification model using Logistic Regression

In [49]:
dataset_size=float(training_df.select("default").count())
numPositives=training_df.select("default").where('default == 0').count()
per_ones=(float(numPositives)/float(dataset_size))*100
numNegatives=float(dataset_size-numPositives)
print('The number of ones are {}'.format(numPositives))
print('Percentage of ones are {}'.format(per_ones))

In [50]:
BalancingRatio= numNegatives/dataset_size
print('BalancingRatio = {}'.format(BalancingRatio))

In [51]:
training_df=training_df.withColumn("classWeights", fn.when(training_df.default == 0,BalancingRatio).otherwise(1-BalancingRatio))
training_df.select("classWeights").show(5)

In [52]:
cols = ['total_pymnt','loan_amount', 'dti', 'installment', 'total_rec_prncp', 'interest_rate', 'A', 'B', 'C', 'D', 'E', 'F', 'MORTGAGE','NONE','OTHER','OWN','RENT', 'car','credit_card','debt_consolidation','educational','home_improvement','house','major_purchase','medical','moving','renewable_energy','small_business','vacation','wedding','1']

va = VectorAssembler(inputCols=cols, outputCol='features')
sc = StandardScaler(withMean=True, withStd=False, inputCol='features', outputCol='std_features')
lr = LogisticRegression().setLabelCol('default').setFeaturesCol('std_features').setWeightCol('classWeights')

lr_Model = Pipeline(stages=[va,sc, lr]).fit(training_df)
lr_prediction = lr_Model.transform(testing_df)

In [53]:
lr_prediction.select("prediction", "default", "features").show()

In [54]:
#Accuracy of Logistic Regression
evaluator = BinaryClassificationEvaluator(labelCol = 'default')
lr_accuracy = evaluator.evaluate(lr_Model.transform(validation_df))
print("Accuracy of LogisticRegression is = %g"% (lr_accuracy))
print("Test Error of LogisticRegression = %g " % (1.0 - lr_accuracy))

In [55]:
AUC_test = evaluator.evaluate(lr_prediction)
print(AUC_test)

In [56]:
trainingSummary = lr_Model.stages[-1].summary
roc = trainingSummary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve')
a = plt.show()
display(a)

In [57]:
print('Training set areaUnderROC: ' + str(trainingSummary.areaUnderROC))

In [58]:
pr = trainingSummary.pr.toPandas()
plt.plot(pr['recall'],pr['precision'])
plt.ylabel('Precision')
plt.xlabel('Recall')
b = plt.show()
display(b)

In [59]:
tn = lr_prediction[(lr_prediction.default == 0) & (lr_prediction.prediction == 0)].count()
tp = lr_prediction[(lr_prediction.default == 1) & (lr_prediction.prediction == 1)].count()
fn = lr_prediction[(lr_prediction.default == 1) & (lr_prediction.prediction == 0)].count()
fp = lr_prediction[(lr_prediction.default == 0) & (lr_prediction.prediction == 1)].count()
precision = ((float(tp) / (float(tp) + float(fp))) * 100 )
recall = ((float(tp) / (float(tp) + float(fn))) * 100 )
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Precision    - ", precision)
print("Recall       - ", recall)
print("F-1 Score    - ", ((2* ( (precision*recall) / (precision + recall))) ))

In [60]:
array = [[tp,fn],
     [fp,tn]]        
df_cm = pd.DataFrame(array, range(2),
                  range(2))
plt.figure(figsize = (10,7))
sns.set(font_scale=1.2)#for label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 10}, cmap='YlGnBu', fmt='g')
plt.title('Confusion Matrix for Logistic Regression \n ')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
display()

In [61]:
lr_feature = pd.DataFrame(list(zip(training_df.toPandas()[cols], lr_Model.stages[-1].coefficients.toArray())),
            columns = ['column', 'weight']).sort_values('weight')

##Hyper Parameter Tuning

In [63]:
# Create ParamGrid for Cross Validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid_lr = ParamGridBuilder().addGrid(lr.regParam, [0.01, 0.1]).addGrid(lr.elasticNetParam, [0.0, 0.05]).addGrid(lr.maxIter, [1, 3, 10]).build()

In [64]:
from time import *
start_time = time()

# Create 3-fold CrossValidator
cv_lr = CrossValidator(estimator=lr,
                    estimatorParamMaps=paramGrid_lr,
                    evaluator=evaluator, numFolds=3)

# Run cross validations
cvModel = Pipeline(stages=[va, sc, cv_lr]).fit(training_df)
# likely take a fair amount of time
end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

In [65]:
bestModel = cvModel.stages[-1].bestModel
bestModel.extractParamMap()

In [66]:
reg_lr_Model = Pipeline(stages=[va, sc, bestModel]).fit(training_df)
reg_lr_prediction = reg_lr_Model.transform(testing_df)
reg_lr_prediction.select("prediction", "default", "features").show()

In [67]:
#Accuracy of Logistic Regression
evaluator = BinaryClassificationEvaluator(labelCol = 'default')
reg_lr_accuracy = evaluator.evaluate(reg_lr_Model.transform(validation_df))
print("Accuracy of LogisticRegression is = %g"% (reg_lr_accuracy))
print("Test Error of LogisticRegression = %g " % (1.0 - reg_lr_accuracy))

In [68]:
trainingSummary = reg_lr_Model.stages[-1].summary
roc = trainingSummary.roc.toPandas()
plt.plot(roc['FPR'],roc['TPR'])
plt.ylabel('False Positive Rate')
plt.xlabel('True Positive Rate')
plt.title('ROC Curve')
a = plt.show()
display(a)

In [69]:
pr = trainingSummary.pr.toPandas()
plt.plot(pr['recall'],pr['precision'])
plt.ylabel('Precision')
plt.xlabel('Recall')
b = plt.show()
display(b)

In [70]:
lr_best_feature = pd.DataFrame(list(zip(training_df.toPandas()[cols], reg_lr_Model.stages[-1].coefficients.toArray())),
            columns = ['column', 'weight']).sort_values('weight', ascending=False)

In [71]:
lr_best_feature

Unnamed: 0,column,weight
14,OTHER,1.18557
20,educational,1.080815
27,small_business,0.64745
29,wedding,0.500683
30,1,0.440215
7,B,0.398254
13,NONE,0.392648
6,A,0.291728
8,C,0.179512
5,interest_rate,0.177411


In [72]:
tn = reg_lr_prediction[(reg_lr_prediction.default == 0) & (reg_lr_prediction.prediction == 0)].count()
tp = reg_lr_prediction[(reg_lr_prediction.default == 1) & (reg_lr_prediction.prediction == 1)].count()
fn = reg_lr_prediction[(reg_lr_prediction.default == 1) & (reg_lr_prediction.prediction == 0)].count()
fp = reg_lr_prediction[(reg_lr_prediction.default == 0) & (reg_lr_prediction.prediction == 1)].count()
precision = ((float(tp) / (float(tp) + float(fp))) * 100 )
recall = ((float(tp) / (float(tp) + float(fn))) * 100 )
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Precision    - ", precision)
print("Recall       - ", recall)
print("F-1 Score    - ", ((2* ( (precision*recall) / (precision + recall))) ))

In [73]:
array = [[tp,fn],
     [fp,tn]]        
df_cm = pd.DataFrame(array, range(2),
                  range(2))
plt.figure(figsize = (10,7))
sns.set(font_scale=1.2)#for label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 10}, cmap='YlGnBu', fmt='g')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
display()

##Building a classification model using Decision Tree

### Under Sampling the dataset due to highy imbalanced classes

In [76]:
#Undersampling the dataset with good loans to take care of the Imbalanced dataset
good_loans = loan_linear_reg.filter(loan_linear_reg.default == 0)
bad_loans = loan_linear_reg.filter(loan_linear_reg.default == 1)
sampleRatio = bad_loans.count() / loan_linear_reg.count()
good_loansSampleDf = good_loans.sample(False, sampleRatio)
loan_linear_reg1 = bad_loans.unionAll(good_loansSampleDf)

In [77]:
loan_linear_reg1.groupby('default').count().show()

In [78]:
#Splitting the dataset 
training_df1, validation_df1, testing_df1 = loan_linear_reg1.randomSplit([0.6, 0.3, 0.1], seed=100)

In [79]:
vaD = VectorAssembler(inputCols=['total_pymnt','loan_amount','installment','total_rec_prncp','interest_rate','A', 'B', 'C','D','E','F', 'MORTGAGE','NONE','OTHER','OWN','RENT','car','dti','credit_card','debt_consolidation','educational','home_improvement','house','major_purchase','medical','moving','renewable_energy','small_business','vacation','wedding','1'], outputCol='features')
sc = StandardScaler(withMean=True, withStd=False, inputCol='features', outputCol='std_features')
dt = DecisionTreeClassifier(featuresCol='std_features', labelCol='default')

dt_model = Pipeline(stages=[vaD, sc, dt]).fit(training_df1)

In [80]:
evaluator = BinaryClassificationEvaluator(labelCol="default", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
validation_accuracy = evaluator.evaluate(dt_model.transform(validation_df1))
print("Validation Accuracy = %g " % (validation_accuracy))
print("Validation Error = %g " % (1.0 - validation_accuracy))

In [81]:
treeModel = dt_model.stages[-1]
print(treeModel)

In [82]:
print(treeModel.toDebugString)

In [83]:
display(dt_model.stages[-1])

treeNode
"{""index"":15,""featureType"":""continuous"",""prediction"":null,""threshold"":-1.0158764680295747,""categories"":null,""feature"":4,""overflow"":false}"
"{""index"":7,""featureType"":""continuous"",""prediction"":null,""threshold"":-5.015876468029575,""categories"":null,""feature"":4,""overflow"":false}"
"{""index"":5,""featureType"":""continuous"",""prediction"":null,""threshold"":-7.015876468029575,""categories"":null,""feature"":4,""overflow"":false}"
"{""index"":1,""featureType"":""continuous"",""prediction"":null,""threshold"":0.4811554895990584,""categories"":null,""feature"":27,""overflow"":false}"
"{""index"":0,""featureType"":null,""prediction"":0.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":3,""featureType"":""continuous"",""prediction"":null,""threshold"":1633.6584832280032,""categories"":null,""feature"":0,""overflow"":false}"
"{""index"":2,""featureType"":null,""prediction"":1.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":4,""featureType"":null,""prediction"":0.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":6,""featureType"":null,""prediction"":0.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"
"{""index"":13,""featureType"":""continuous"",""prediction"":null,""threshold"":5303.70374587417,""categories"":null,""feature"":3,""overflow"":false}"


In [84]:
dt_prediction = dt_model.transform(testing_df1)
dt_prediction.select("prediction", "default", "features").show()

In [85]:
tn = dt_prediction[(dt_prediction.default == 0) & (dt_prediction.prediction == 0)].count()
tp = dt_prediction[(dt_prediction.default == 1) & (dt_prediction.prediction == 1)].count()
fn = dt_prediction[(dt_prediction.default == 1) & (dt_prediction.prediction == 0)].count()
fp = dt_prediction[(dt_prediction.default == 0) & (dt_prediction.prediction == 1)].count()
precision = ((float(tp) / (float(tp) + float(fp))) * 100 )
recall = ((float(tp) / (float(tp) + float(fn))) * 100 )
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Precision    - ", precision)
print("Recall       - ", recall)
print("F-1 Score    - ", ((2* ( (precision*recall) / (precision + recall))) ))
print("Test_Accuracy- ", evaluator.evaluate(dt_model.transform(testing_df1)))

In [86]:
array = [[tp,fn],
     [fp,tn]]        
df_cm = pd.DataFrame(array, range(2), range(2))
plt.figure(figsize = (10,7))
sns.set(font_scale=1.2)#for label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 10}, cmap='YlGnBu', fmt='g')
plt.title('Confusion Matrix for Decision Tree - Default model using Under Sampling \n ')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
display()

## Hyper Parameter Tuning

In [88]:
# Create ParamGrid for Cross Validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid_dt = (ParamGridBuilder()
                .addGrid(dt.maxDepth, [15, 20])
                .addGrid(dt.maxBins, [30, 40, 50])
             .build())

In [89]:
from time import *
start_time = time()

evaluatorPR = BinaryClassificationEvaluator(labelCol = "default", metricName = "areaUnderROC")
# Create 3-fold CrossValidator
cv_dt = CrossValidator(estimator=dt,
                    estimatorParamMaps=paramGrid_dt,
                    evaluator=evaluatorPR, numFolds=3)

# Run cross validations
cvModel = Pipeline(stages=[vaD, sc, cv_dt]).fit(training_df1)
# likely take a fair amount of time
end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

In [90]:
dt_bestModel = cvModel.stages[-1].bestModel
dt_bestModel.extractParamMap()

In [91]:
tuned_dt_Model = Pipeline(stages=[vaD, sc, dt_bestModel]).fit(training_df1)
tuned_dt_prediction = tuned_dt_Model.transform(testing_df1)
tuned_dt_prediction.select("prediction", "default", "features").show()

In [92]:
evaluator = BinaryClassificationEvaluator(labelCol="default", metricName="areaUnderPR")
validation_accuracy = evaluator.evaluate(tuned_dt_Model.transform(validation_df1))
print("Validation Accuracy = %g " % (validation_accuracy))
print("Validation Error = %g " % (1.0 - validation_accuracy))

In [93]:
treeModel = tuned_dt_Model.stages[-1]
print(treeModel)

In [94]:
print(treeModel.toDebugString)

In [95]:
display(dt_bestModel)

treeNode
"{""index"":8319,""featureType"":""continuous"",""prediction"":null,""threshold"":-1.0158764680295747,""categories"":null,""feature"":4,""overflow"":false}"
"{""index"":2441,""featureType"":""continuous"",""prediction"":null,""threshold"":-5.015876468029575,""categories"":null,""feature"":4,""overflow"":false}"
"{""index"":559,""featureType"":""continuous"",""prediction"":null,""threshold"":-7.015876468029575,""categories"":null,""feature"":4,""overflow"":false}"
"{""index"":543,""featureType"":""continuous"",""prediction"":null,""threshold"":0.4811554895990584,""categories"":null,""feature"":27,""overflow"":false}"
"{""index"":87,""featureType"":""continuous"",""prediction"":null,""threshold"":-9.015876468029575,""categories"":null,""feature"":4,""overflow"":false}"
"{""index"":11,""featureType"":""continuous"",""prediction"":null,""threshold"":-309.5788066422743,""categories"":null,""feature"":2,""overflow"":false}"
"{""index"":9,""featureType"":""continuous"",""prediction"":null,""threshold"":0.5866233400711458,""categories"":null,""feature"":17,""overflow"":false}"
"{""index"":7,""featureType"":""continuous"",""prediction"":null,""threshold"":0.4896630248445616,""categories"":null,""feature"":24,""overflow"":false}"
"{""index"":1,""featureType"":""continuous"",""prediction"":null,""threshold"":-6445.341516771997,""categories"":null,""feature"":0,""overflow"":false}"
"{""index"":0,""featureType"":null,""prediction"":0.0,""threshold"":null,""categories"":null,""feature"":null,""overflow"":false}"


In [96]:
tn = tuned_dt_prediction[(tuned_dt_prediction.default == 0) & (tuned_dt_prediction.prediction == 0)].count()
tp = tuned_dt_prediction[(tuned_dt_prediction.default == 1) & (tuned_dt_prediction.prediction == 1)].count()
fn = tuned_dt_prediction[(tuned_dt_prediction.default == 1) & (tuned_dt_prediction.prediction == 0)].count()
fp = tuned_dt_prediction[(tuned_dt_prediction.default == 0) & (tuned_dt_prediction.prediction == 1)].count()
precision = ((float(tp) / (float(tp) + float(fp))) * 100 )
recall = ((float(tp) / (float(tp) + float(fn))) * 100 )
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Precision    - ", precision)
print("Recall       - ", recall)
print("F-1 Score    - ", ((2* ( (precision*recall) / (precision + recall))) ))
print("Test_Accuracy- ", evaluator.evaluate(tuned_dt_Model.transform(testing_df1)))

In [97]:
array = [[tp,fn],
     [fp,tn]]        
df_cm = pd.DataFrame(array, range(2),
                  range(2))
plt.figure(figsize = (10,7))
sns.set(font_scale=1.2)#for label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 10}, cmap='YlGnBu', fmt='g')
plt.title('Confusion Matrix for Decision Tree - Grid Search model using Under Sampling \n ')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
display()

##Trying Startified Sampling

In [99]:
stratified_data = loan_linear_reg.sampleBy('default', fractions={1: 0.9, 0: 0.40}).cache()

stratified_data.groupby('default').count().show()

In [100]:
#Splitting the dataset 
training_df1, validation_df1, testing_df1 = stratified_data.randomSplit([0.6, 0.3, 0.1], seed=100)

In [101]:
vaD = VectorAssembler(inputCols=['total_pymnt','loan_amount','installment','total_rec_prncp','interest_rate','A', 'B', 'C','D','E','F', 'MORTGAGE','NONE','OTHER','OWN','RENT','car','dti','credit_card','debt_consolidation','educational','home_improvement','house','major_purchase','medical','moving','renewable_energy','small_business','vacation','wedding','1'], outputCol='features')
sc = StandardScaler(withMean=True, withStd=False, inputCol='features', outputCol='std_features')
dt = DecisionTreeClassifier(featuresCol='std_features', labelCol='default')

dt_model = Pipeline(stages=[vaD, sc, dt]).fit(training_df1)

In [102]:
evaluator = BinaryClassificationEvaluator(labelCol="default", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
validation_accuracy = evaluator.evaluate(dt_model.transform(validation_df1))
print("Validation Accuracy = %g " % (validation_accuracy))
print("Validation Error = %g " % (1.0 - validation_accuracy))

In [103]:
dt_prediction = dt_model.transform(testing_df1)
dt_prediction.select("prediction", "default", "features").show()

In [104]:
tn = dt_prediction[(dt_prediction.default == 0) & (dt_prediction.prediction == 0)].count()
tp = dt_prediction[(dt_prediction.default == 1) & (dt_prediction.prediction == 1)].count()
fn = dt_prediction[(dt_prediction.default == 1) & (dt_prediction.prediction == 0)].count()
fp = dt_prediction[(dt_prediction.default == 0) & (dt_prediction.prediction == 1)].count()
precision = ((float(tp) / (float(tp) + float(fp))) * 100 )
recall = ((float(tp) / (float(tp) + float(fn))) * 100 )
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Precision    - ", precision)
print("Recall       - ", recall)
print("F-1 Score    - ", ((2* ( (precision*recall) / (precision + recall))) ))
print("Test_Accuracy- ", evaluator.evaluate(dt_model.transform(testing_df1)))

In [105]:
array = [[tp,fn],
     [fp,tn]]        
df_cm = pd.DataFrame(array, range(2),
                  range(2))
plt.figure(figsize = (10,7))
sns.set(font_scale=1.2)#for label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 10}, cmap='YlGnBu', fmt='g')

plt.title('Confusion Matrix for Decision Tree - Default model using Stratified Sampling \n ')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
display()

##Hyper Parameter Tuning

In [107]:
# Create ParamGrid for Cross Validation
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid_dt = (ParamGridBuilder()
                .addGrid(dt.maxDepth, [10, 20, 30])
                .addGrid(dt.maxBins, [50, 60, 70])
             .build())

In [108]:
from time import *
start_time = time()

evaluatorPR = BinaryClassificationEvaluator(labelCol = "default", metricName = "areaUnderROC")
# Create 3-fold CrossValidator
cv_dt = CrossValidator(estimator=dt,
                    estimatorParamMaps=paramGrid_dt,
                    evaluator=evaluatorPR, numFolds=3)

# Run cross validations
cvModel = Pipeline(stages=[vaD, sc, cv_dt]).fit(training_df1)
# likely take a fair amount of time
end_time = time()
elapsed_time = end_time - start_time
print("Time to train model: %.3f seconds" % elapsed_time)

In [109]:
dt_bestModel = cvModel.stages[-1].bestModel
dt_bestModel.extractParamMap()

In [110]:
tuned_dt_Model = Pipeline(stages=[vaD, sc, dt_bestModel]).fit(training_df1)

In [111]:
tuned_dt_prediction = tuned_dt_Model.transform(testing_df1)
tuned_dt_prediction.select("prediction", "default", "features").show()

In [112]:
evaluator = BinaryClassificationEvaluator(labelCol="default", metricName="areaUnderPR")
validation_accuracy = evaluator.evaluate(tuned_dt_Model.transform(validation_df1))
print("Validation Accuracy = %g " % (validation_accuracy))
print("Validation Error = %g " % (1.0 - validation_accuracy))

In [113]:
treeModel = tuned_dt_Model.stages[-1]
print(treeModel)

In [114]:
print(treeModel.toDebugString)

In [115]:
display(dt_bestModel)

treeNode
"{""index"":21727,""featureType"":""continuous"",""prediction"":null,""threshold"":0.023764425569817504,""categories"":null,""feature"":4,""overflow"":false}"
"{""index"":7051,""featureType"":""continuous"",""prediction"":null,""threshold"":-3.9762355744301825,""categories"":null,""feature"":4,""overflow"":false}"
"{""index"":2153,""featureType"":""continuous"",""prediction"":null,""threshold"":-5.9762355744301825,""categories"":null,""feature"":4,""overflow"":false}"
"{""index"":2123,""featureType"":""continuous"",""prediction"":null,""threshold"":0.4869425291296876,""categories"":null,""feature"":27,""overflow"":false}"
"{""index"":855,""featureType"":""continuous"",""prediction"":null,""threshold"":-146.45990195112154,""categories"":null,""feature"":2,""overflow"":false}"
"{""index"":745,""featureType"":""continuous"",""prediction"":null,""threshold"":-1556.7925096313384,""categories"":null,""feature"":3,""overflow"":false}"
"{""index"":737,""featureType"":""continuous"",""prediction"":null,""threshold"":-2937.867910418963,""categories"":null,""feature"":0,""overflow"":false}"
"{""index"":527,""featureType"":""continuous"",""prediction"":null,""threshold"":0.2728162911983133,""categories"":null,""feature"":18,""overflow"":false}"
"{""index"":61,""featureType"":""continuous"",""prediction"":null,""threshold"":-369.45990195112154,""categories"":null,""feature"":2,""overflow"":false}"
"{""index"":43,""featureType"":""continuous"",""prediction"":null,""threshold"":-4608.792509631338,""categories"":null,""feature"":3,""overflow"":false}"


In [116]:
tn = tuned_dt_prediction[(tuned_dt_prediction.default == 0) & (tuned_dt_prediction.prediction == 0)].count()
tp = tuned_dt_prediction[(tuned_dt_prediction.default == 1) & (tuned_dt_prediction.prediction == 1)].count()
fn = tuned_dt_prediction[(tuned_dt_prediction.default == 1) & (tuned_dt_prediction.prediction == 0)].count()
fp = tuned_dt_prediction[(tuned_dt_prediction.default == 0) & (tuned_dt_prediction.prediction == 1)].count()
precision = ((float(tp) / (float(tp) + float(fp))) * 100 )
recall = ((float(tp) / (float(tp) + float(fn))) * 100 )
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Precision    - ", precision)
print("Recall       - ", recall)
print("F-1 Score    - ", ((2* ( (precision*recall) / (precision + recall))) ))
print("Test_Accuracy- ", evaluator.evaluate(tuned_dt_Model.transform(testing_df1)))

In [117]:
array = [[tp,fn],
     [fp,tn]]        
df_cm = pd.DataFrame(array, range(2),
                  range(2))
plt.figure(figsize = (10,7))
sns.set(font_scale=1.2)#for label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 10}, cmap='YlGnBu', fmt='g')
plt.title('Confusion Matrix for Decision Tree - Grid Search model using Stratified Sampling \n ')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
display()

In [118]:
#Feature Importance of Decision Tree
dt_features = pd.DataFrame(list(zip(tuned_dt_prediction.toPandas()[cols], dt_model.stages[-1].featureImportances.toArray())),
            columns = ['feature', 'importance']).sort_values('importance', ascending = False)

##Building a classification model using Random Forest

In [120]:
rf_assembler = VectorAssembler(inputCols=['loan_amount','dti','total_pymnt','total_rec_prncp','recoveries','installment','interest_rate','A','B','C','D','E','F','MORTGAGE','NONE','OTHER','OWN','RENT','High','Medium','car','credit_card','debt_consolidation','educational','home_improvement','house','major_purchase','medical','moving','renewable_energy','small_business','vacation','wedding','1'], outputCol="features")
rf = classification.RandomForestClassifier(featuresCol = "features", labelCol = "default")

In [121]:
pipe_rf = Pipeline(stages = [rf_assembler,rf]).fit(training_df1)

In [122]:
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='default', metricName='areaUnderROC')

In [123]:
AUC = evaluator.evaluate(pipe_rf.transform(validation_df1))
print(AUC)

In [124]:
rf_prediction = pipe_rf.transform(testing_df1)
rf_prediction.select("prediction", "default", "features").show()

In [125]:
tn = rf_prediction[(rf_prediction.default == 0) & (rf_prediction.prediction == 0)].count()
tp = rf_prediction[(rf_prediction.default == 1) & (rf_prediction.prediction == 1)].count()
fn = rf_prediction[(rf_prediction.default == 1) & (rf_prediction.prediction == 0)].count()
fp = rf_prediction[(rf_prediction.default == 0) & (rf_prediction.prediction == 1)].count()
precision = ((float(tp) / (float(tp) + float(fp))) * 100 )
recall = ((float(tp) / (float(tp) + float(fn))) * 100 )
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Precision    - ", precision)
print("Recall       - ", recall)
print("F-1 Score    - ", ((2* ( (precision*recall) / (precision + recall))) ))
print("Test_Accuracy- ", evaluator.evaluate(pipe_rf.transform(testing_df1)))

In [126]:
array = [[tp,fn],
     [fp,tn]]        
df_cm = pd.DataFrame(array, range(2),
                  range(2))
plt.figure(figsize = (10,7))
sns.set(font_scale=1.2)#for label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 10}, cmap='YlGnBu', fmt='g')
plt.title('Confusion Matrix for Random Forest - Default model using Stratified Sampling \n ')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
display()

##Hyper Parameter Tuning

In [128]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
paramGrid = (ParamGridBuilder()
             .addGrid(rf.numTrees, [25,30,35])
             .addGrid(rf.maxDepth, [6,10,12])
             .build())

In [129]:
rf_evaluator = BinaryClassificationEvaluator(labelCol='default', metricName = 'areaUnderROC')

In [130]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
cv_rf= CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=rf_evaluator, numFolds=3)
    
# Run cross validations
cv_rf_Model = Pipeline(stages = [rf_assembler, cv_rf]).fit(training_df1)

In [131]:
print("The area under ROC for validation set after CV  is {}".format(rf_evaluator.evaluate(cv_rf_Model.transform(validation_df1))))

In [132]:
best_model = cv_rf_Model.stages[-1].bestModel
best_model.extractParamMap()

In [133]:
print("The area under ROC for testing set after CV  is {}".format(rf_evaluator.evaluate(cv_rf_Model.transform(testing_df1))))

In [134]:
rf_model = Pipeline(stages = [rf_assembler, best_model]).fit(training_df1)

In [135]:
rf_prediction = rf_model.transform(testing_df1)
rf_prediction.select("prediction", "default", "features").show()

In [136]:
tn = rf_prediction[(rf_prediction.default == 0) & (rf_prediction.prediction == 0)].count()
tp = rf_prediction[(rf_prediction.default == 1) & (rf_prediction.prediction == 1)].count()
fn = rf_prediction[(rf_prediction.default == 1) & (rf_prediction.prediction == 0)].count()
fp = rf_prediction[(rf_prediction.default == 0) & (rf_prediction.prediction == 1)].count()
precision = ((float(tp) / (float(tp) + float(fp))) * 100 )
recall = ((float(tp) / (float(tp) + float(fn))) * 100 )
print("True Positives:", tp)
print("True Negatives:", tn)
print("False Positives:", fp)
print("False Negatives:", fn)
print("Precision    - ", precision)
print("Recall       - ", recall)
print("F-1 Score    - ", ((2* ( (precision*recall) / (precision + recall))) ))
print("Test_Accuracy- ", evaluator.evaluate(rf_model.transform(testing_df1)))

In [137]:
array = [[tp,fn],
     [fp,tn]]        
df_cm = pd.DataFrame(array, range(2),
                  range(2))
plt.figure(figsize = (10,7))
sns.set(font_scale=1.2)#for label size
sns.heatmap(df_cm, annot=True, annot_kws={"size": 10}, cmap='YlGnBu', fmt='g')
plt.title('Confusion Matrix for Random Forest - Grid Search model using Stratified Sampling \n ')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.gca().invert_xaxis()
plt.gca().invert_yaxis()
display()

In [138]:
rf_features = pd.DataFrame(list(zip(rf_prediction.toPandas()[cols], rf_model.stages[-1].featureImportances.toArray())),
            columns = ['feature', 'importance']).sort_values('importance', ascending = False)

In [139]:
rf_features

Unnamed: 0,feature,importance
4,total_rec_prncp,0.508058
6,A,0.149545
3,installment,0.120103
2,dti,0.052705
7,B,0.032343
0,total_pymnt,0.029339
5,interest_rate,0.026703
8,C,0.013564
1,loan_amount,0.011916
11,F,0.009608


#Inference

In [141]:
plt.figure(figsize=(18,7))
ax = sns.barplot(x="feature", y="importance", data=rf_features.head(10), saturation=.5)
ax.set_title('Feature Importance by Random Forest')
ax.set_ylabel('Feature Importance')
ax.set_xlabel('Features')
display(ax.figure)

In [142]:
plt.figure(figsize=(18,7))
ax = sns.barplot(x="feature", y="importance", data=dt_features.head(6), saturation=.5)
ax.set_title('Feature Importance by Decision Tree')
ax.set_ylabel('Feature Importance')
ax.set_xlabel('Features')
display(ax.figure)

In [143]:
plt.figure(figsize=(18,7))
ax = sns.barplot(x="column", y="weight", data=lr_best_feature.head(10), saturation=.5)
#ax.set_xticklabels(ax.get_xticklabels(), rotation=20)
ax.set_title('Feature Importance by Logistic Regression')
ax.set_ylabel('Feature Weights')
ax.set_xlabel('Features')
display(ax.figure)

In [144]:
name = ['Logistic Regression', 'Decision Tree', 'Random Forest']
Accuracy = [0.79, 0.727, 0.855]
result_df = pd.DataFrame(list(zip(name, Accuracy)), columns =['Model', 'Accuracy'])
result_df.head()

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.79
1,Decision Tree,0.727
2,Random Forest,0.855


#Model Comparison

In [146]:
plt.figure(figsize=(7,5))
ax = sns.barplot(x="Model", y="Accuracy", data=result_df, saturation=.5)
#ax.set_xticklabels(ax.get_xticklabels(), rotation=20)
ax.set_title('Model Comparison')
ax.set_ylabel('Model Accuracy')
ax.set_xlabel('Models Implemented')
display(ax.figure)