## DecisionTreeClassifier

In [None]:
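# One-time setup (uncomment to install). The PyPI distribution that provides `imblearn` is `imbalanced-learn`.
# %pip install imbalanced-learn
# %pip install pydotplus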

In [None]:
# Importing Packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OrdinalEncoder,LabelEncoder,MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import pydotplus

In [None]:
# Load the two raw CSV files: applicant attributes and monthly credit history

APPLICATION_CSV = "application_record.csv"
CREDIT_CSV = "credit_record.csv"

app_df = pd.read_csv(APPLICATION_CSV)
cr_df = pd.read_csv(CREDIT_CSV)

In [None]:
# Preview the first five rows of the application records

app_df.head(5)

In [None]:
# Preview the first five rows of the credit records

cr_df.head(5)

In [None]:
# Shape of the application records as a (rows, columns) tuple

app_df.shape

In [None]:
# Shape of the credit records as a (rows, columns) tuple

cr_df.shape

In [None]:
# Preparing the target variable
# 1. For customers with several monthly records, keep the worst (largest) overdue level.
# 2. Any ID whose worst level is >= 1 (30+ days overdue) is later marked as class 1.
#
# STATUS is read from the CSV as text ("0".."5", "C", "X"). Taking max() on the raw
# strings compares them lexicographically ("X" > "C" > "5"), so a single paid-off or
# no-loan month would mask every overdue month. Convert to numbers BEFORE aggregating:
# "C" (paid off) and "X" (no loan) both count as level 0.

overdue_level = cr_df["STATUS"].replace({"C": "0", "X": "0"}).astype("int")
cr_df = (
    cr_df.assign(STATUS=overdue_level)
    .groupby("ID", as_index=False)  # one row per customer, ID kept as a column
    .max()                          # latest MONTHS_BALANCE and worst overdue level
)

In [None]:
# STATUS legend:
#   0 => 1-29 days overdue;  1 => 30-59 days overdue;  2 => 60-89 days overdue;
#   3 => 90-119 days overdue; 4 => 120-149 days overdue;
#   5 => overdue or bad debts, write-offs for more than 150 days;
#   C => paid off that month; X => no loan for the month.
#
# Collapse to a binary target: 1 if the level is above 0 (30+ days overdue), else 0.
# The result is assigned back instead of calling replace(..., inplace=True) on the
# selected column, because that chained in-place call is not guaranteed to modify
# cr_df (it is a no-op under pandas Copy-on-Write).

cr_df["STATUS"] = cr_df["STATUS"].replace({"C": "0", "X": "0"}).astype("int")
cr_df["STATUS"] = (cr_df["STATUS"] > 0).astype("int")

In [None]:
# Combine applications with their credit status; the inner join keeps only IDs present in both

credit_by_id = cr_df.set_index("ID")
df = app_df.join(credit_by_id, on="ID", how="inner")

In [None]:
# Summary of the merged dataset: column names, non-null counts and dtypes

df.info()

In [None]:
# Count missing values per column in the merged dataset

df.isna().sum()

In [None]:
# Preview the first five rows of the merged dataset

df.head(5)

In [None]:
# Column groups, organised by how each feature is handled further down

binary_features = [
    "CODE_GENDER",
    "FLAG_OWN_CAR",
    "FLAG_OWN_REALTY",
    "FLAG_WORK_PHONE",
    "FLAG_EMAIL",
]
continuous_features = [
    "CNT_CHILDREN",
    "AMT_INCOME_TOTAL",
    "DAYS_BIRTH",
    "DAYS_EMPLOYED",
    "CNT_FAM_MEMBERS",
]
cat_features = [
    "NAME_INCOME_TYPE",
    "NAME_EDUCATION_TYPE",
    "NAME_FAMILY_STATUS",
    "NAME_HOUSING_TYPE",
]

In [None]:
# Columns removed before modelling
# 1. OCCUPATION_TYPE - a lot of missing data (11323 data points)
# 2. FLAG_MOBIL - every data point has the flag set to 1
# 3. MONTHS_BALANCE - dropped together with the two columns above

unused_columns = ["MONTHS_BALANCE", "OCCUPATION_TYPE", "FLAG_MOBIL"]
df = df.drop(columns=unused_columns)

In [None]:
# Ordinal encoding of "NAME_INCOME_TYPE": the ranking of income types below is a
# qualitative call; each category is replaced by its position in this list.

income_type_order = ["Student", "Pensioner", "Working", "Commercial associate", "State servant"]
encoder = OrdinalEncoder(categories=[income_type_order])

# OrdinalEncoder expects a 2-D input, hence the single-column reshape
income_type_column = df["NAME_INCOME_TYPE"].to_numpy().reshape(-1, 1)
df["NAME_INCOME_TYPE"] = encoder.fit_transform(income_type_column)

In [None]:
# Integer-encode the remaining categorical columns without imposing a hand-picked order.
# cat_features[1:] skips "NAME_INCOME_TYPE", the first entry of the list.
# LabelEncoder expects a 1-D input, so each column is passed as a Series rather than
# as an (n, 1) column vector (which makes scikit-learn emit a DataConversionWarning).

encoder2 = LabelEncoder()
for col in cat_features[1:]:
    # the encoder is re-fitted per column, so only the last column's mapping stays on encoder2
    df[col] = encoder2.fit_transform(df[col])

In [None]:
# One-hot encode the binary features with pandas dummies; drop_first keeps a single
# indicator column per feature.

dummy_options = {"columns": binary_features, "drop_first": True}
df = pd.get_dummies(df, **dummy_options)

In [None]:
# Scatter each numeric variable against ID to get a feel for the outliers present

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(14, 6))

# (column to plot, target axes, extra styling); the sixth panel stays empty
panels = [
    ("CNT_CHILDREN", ax[0][0], {"color": "orange"}),
    ("AMT_INCOME_TOTAL", ax[0][1], {"color": "orange"}),
    ("DAYS_BIRTH", ax[0][2], {}),
    ("DAYS_EMPLOYED", ax[1][0], {}),
    ("CNT_FAM_MEMBERS", ax[1][1], {"color": "orange"}),
]
for column, axis, extra in panels:
    sns.scatterplot(x="ID", y=column, data=df, ax=axis, **extra)

In [None]:
# Outlier removal: IQR fences on income, plus hard caps on children / family size

income = df["AMT_INCOME_TOTAL"]
q1, q3 = income.quantile(.25), income.quantile(.75)
IQR = q3 - q1
lower_range, upper_range = q1 - 1.5 * IQR, q3 + 1.5 * IQR

# Keep rows strictly inside the income fences with fewer than 8 children and family members
keep = (
    (income > lower_range)
    & (income < upper_range)
    & (df["CNT_CHILDREN"] < 8)
    & (df["CNT_FAM_MEMBERS"] < 8)
)
df = df[keep]

In [None]:
# Same scatter grid as before, redrawn after the outliers were removed

fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(14, 6))

# (column to plot, target axes, extra styling); the sixth panel stays empty
panels = [
    ("CNT_CHILDREN", ax[0][0], {"color": "orange"}),
    ("AMT_INCOME_TOTAL", ax[0][1], {"color": "orange"}),
    ("DAYS_BIRTH", ax[0][2], {}),
    ("DAYS_EMPLOYED", ax[1][0], {}),
    ("CNT_FAM_MEMBERS", ax[1][1], {"color": "orange"}),
]
for column, axis, extra in panels:
    sns.scatterplot(x="ID", y=column, data=df, ax=axis, **extra)

In [None]:
# Separate the target from the features
# "ID" is only a row identifier: leaving it in X would let the tree split on an
# arbitrary key that carries no information about new customers, so it is removed
# together with the target column.

Y = df["STATUS"]
X = df.drop(columns=["STATUS", "ID"])

In [None]:
# Split the data into training and testing sets, then scale and rebalance the training set
# - random_state fixes the split so a Restart & Run All reproduces the same metrics
# - stratify=Y keeps the class ratio of the target the same in both partitions

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42, stratify=Y
)

# Fit the scaler on the training partition only; the test set is transformed later with it
mms = MinMaxScaler()
X_scaled = pd.DataFrame(mms.fit_transform(X_train), columns=X_train.columns)

# Oversample the training data only, so the test set keeps its original class distribution
sm = SMOTE(random_state=42)
X_train_res, Y_train_res = sm.fit_resample(X_scaled, Y_train)

In [None]:
# Class balance after resampling: first row holds the labels, second row their counts

unique, counts = np.unique(Y_train_res, return_counts=True)
np.vstack((unique, counts))

In [None]:
# Create and fit the DecisionTreeClassifier, then predict on the held-out test set
# random_state makes the fitted tree deterministic across runs.

classifier = DecisionTreeClassifier(random_state=42)
model = classifier.fit(X_train_res, Y_train_res)

# Scale the test set with the scaler fitted on the training data, and keep the column
# names so the model receives the same feature names it was fitted with.
X_test_scaled = pd.DataFrame(
    mms.transform(X_test), columns=X_test.columns, index=X_test.index
)
prediction = model.predict(X_test_scaled)

In [None]:
# Model performance on the test set: accuracy followed by the confusion matrix

test_accuracy = accuracy_score(Y_test, prediction)
print(f"Accuracy Score : {test_accuracy:.5}")

confusion_table = pd.DataFrame(confusion_matrix(Y_test, prediction))
print(confusion_table)

In [None]:
# Per-class precision / recall / F1 summary for the test-set predictions

report_text = classification_report(Y_test, prediction)
print(report_text)

In [None]:
# Feature importance of the fitted tree, plotted in ascending order

# Pair every training column with the importance the model assigned to it
feats = dict(zip(X_train.columns, model.feature_importances_))
importances = pd.DataFrame.from_dict(feats, orient="index").rename(columns={0: "Gini-Importance"})

ranked = importances.sort_values(by="Gini-Importance")
ranked.plot(kind="bar", rot=90)