In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE, BorderlineSMOTE, ADASYN
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.tree import export_graphviz
from sklearn.externals.six import StringIO  
import pydotplus
from sklearn.metrics import classification_report
from google.colab import files
# Load the applicant attributes and the month-by-month credit history.
app_df = pd.read_csv('application_record.csv')
cr_df = pd.read_csv('credit_record.csv')
app_df.head()
cr_df.shape
cr_df['MONTHS_BALANCE'].value_counts()

# Make STATUS numeric *before* collapsing to one row per customer.
# The column mixes the letters 'C' / 'X' with values that are cast to int; taking
# max() over the raw strings ranks any letter above every digit, so a single
# 'C' or 'X' month would hide every numeric status recorded for the same ID.
# 'C' and 'X' are folded into '0', i.e. the bucket the target below maps to 0.
# Plain assignment is used instead of a chained `inplace=True` replace, which
# is not guaranteed to write back to the frame.
cr_df['STATUS'] = cr_df['STATUS'].replace({'C': '0', 'X': '0'}).astype('int')

# One row per ID holding the column-wise maximum (for STATUS: the highest code
# ever recorded). max() does not depend on row order, so no prior sort is needed.
cr_df = cr_df.groupby('ID').max().reset_index()
cr_df['STATUS'].value_counts()

# Binary target: 1 when the highest status code is above 0, otherwise 0.
cr_df['STATUS'] = (cr_df['STATUS'] > 0).astype('int')
cr_df['STATUS'].value_counts()

# Inner join: keep only applicants that also appear in the credit history.
df = app_df.join(cr_df.set_index('ID'), on='ID', how='inner')
df.info()
df.isnull().sum()
df['STATUS'].value_counts()
# Column groups used by the preprocessing steps below.
binary_features = ['CODE_GENDER','FLAG_OWN_CAR','FLAG_OWN_REALTY','FLAG_WORK_PHONE','FLAG_EMAIL']
continous_features = ['CNT_CHILDREN','AMT_INCOME_TOTAL','DAYS_BIRTH','DAYS_EMPLOYED','CNT_FAM_MEMBERS']
# NAME_INCOME_TYPE stays in this list: if it is left unencoded it reaches the
# scaler as text and the notebook fails on a fresh run.
cat_features = ['NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE']

df = df.drop(columns=['MONTHS_BALANCE','OCCUPATION_TYPE','FLAG_MOBIL'])
df['NAME_INCOME_TYPE'].unique()

# Replace each categorical label with an integer code. The encoder is refit per
# column and receives the 1-D column itself, which is the input shape it expects.
# NOTE(review): codes follow sorted label order and carry no domain ranking; if a
# column has a meaningful order, encode it with explicit categories - confirm.
encoder2 = LabelEncoder()
for i in cat_features:
    df[i] = encoder2.fit_transform(df[i])

# Getting dummies using pandas; drop_first keeps a single indicator per binary flag.
df = pd.get_dummies(df, columns=binary_features, drop_first=True)

def _scatter_continuous_by_id(frame):
    """Scatter five continuous columns against ID on a 2x3 grid.

    The sixth grid cell is intentionally left empty. Returns the
    ``(figure, axes)`` pair created by ``plt.subplots``.
    """
    # (column, grid row, grid column, extra scatterplot keyword arguments)
    panels = [
        ('CNT_CHILDREN', 0, 0, {'color': 'orange'}),
        ('AMT_INCOME_TOTAL', 0, 1, {'color': 'orange'}),
        ('DAYS_BIRTH', 0, 2, {}),
        ('DAYS_EMPLOYED', 1, 0, {}),
        ('CNT_FAM_MEMBERS', 1, 1, {'color': 'orange'}),
    ]
    figure, axes = plt.subplots(nrows=2, ncols=3, figsize=(14, 6))
    for column, row, col, extra in panels:
        sns.scatterplot(x='ID', y=column, data=frame, ax=axes[row][col], **extra)
    return figure, axes


# Plotting different Variables
fig, ax = _scatter_continuous_by_id(df)

# Income fence: 1.5 * IQR beyond the first and third quartiles.
q_low, q_hi = df['AMT_INCOME_TOTAL'].quantile([0.25, 0.75])
IQR = q_hi - q_low
lower_range = q_low - (1.5 * IQR)
upper_range = q_hi + (1.5 * IQR)

# Keep rows strictly inside the income fence with fewer than 8 children and
# fewer than 8 family members.
df = df[
    df['AMT_INCOME_TOTAL'].gt(lower_range)
    & df['AMT_INCOME_TOTAL'].lt(upper_range)
    & df['CNT_CHILDREN'].lt(8)
    & df['CNT_FAM_MEMBERS'].lt(8)
]


# Plotting post removal of outliers
fig, ax = _scatter_continuous_by_id(df)


Train Test Split:

1. Separating the target variable from the dataset
2. Creating Train Test Split
3. Transforming the data
4. Using SMOTE to counter imbalance in the data



In [None]:
# Move the target to the last column so features and target can be sliced by position.
new_cols = [col for col in df.columns if col != 'STATUS'] + ['STATUS']
df = df[new_cols]

# Features: every column except the first (presumably the ID key - confirm) and
# the last (the STATUS target).
X = df.iloc[:, 1:-1]
y = df.iloc[:, -1]

# Seeded so a re-run reproduces the same split; stratified so the held-out set
# keeps the class ratio of the (imbalanced) target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training rows only; the test rows are transformed later
# with the same fitted scaler.
mms = MinMaxScaler()
X_scaled = pd.DataFrame(mms.fit_transform(X_train), columns=X_train.columns)

# Oversample the training set only. `fit_resample` is the supported method name;
# the old `fit_sample` alias no longer exists in current imbalanced-learn.
sm = SMOTE(random_state=42)
X_train_res, y_train_res = sm.fit_resample(X_scaled, y_train)

In [None]:
# Label counts in the training split as passed to the resampler.
y_train.value_counts()
# Label counts in the resampled output, printed as one [label, count] row per class.
unique, counts = np.unique(y_train_res, return_counts=True)
print(np.column_stack((unique, counts)))

Defining the DecisionTreeClassifier Model

In [None]:
# Seeded so the fitted tree (and every metric below) is reproducible across runs.
classifier = DecisionTreeClassifier(random_state=42)
model = classifier.fit(X_train_res, y_train_res)

# Scale the held-out rows with the scaler fitted on the training rows, and keep
# the column names so the test matrix has the same layout as the training matrix.
X_test_scaled = pd.DataFrame(
    mms.transform(X_test), columns=X_test.columns, index=X_test.index
)
prediction = model.predict(X_test_scaled)

print('Accuracy Score is {:.5}'.format(accuracy_score(y_test, prediction)))
print(pd.DataFrame(confusion_matrix(y_test,prediction)))
print(classification_report(y_test, prediction))

Feature Importance

In [None]:
# Map every training column name to the importance the fitted tree assigned to it.
feats = dict(zip(X_train.columns, model.feature_importances_))

# One-column frame indexed by feature name, plotted as bars in ascending order.
importances = (
    pd.DataFrame.from_dict(feats, orient='index')
    .rename(columns={0: 'Gini-importance'})
)
importances.sort_values(by='Gini-importance').plot(kind='bar', rot=90)
model.feature_importances_
# From here on `importances` is the raw importance array and `indices` holds the
# positions that sort it in ascending order.
importances = model.feature_importances_
indices = importances.argsort()

Download Decision Tree Image

In [None]:
# With out_file=None, export_graphviz returns the DOT source as a string, so no
# in-memory text buffer is needed.
dot_data = export_graphviz(
    model,
    out_file=None,
    filled=True,
    rounded=True,
    special_characters=True,
    feature_names=list(X.columns),
    class_names=['0', '1'],
)

# Parse the DOT source once, render it to PNG, then hand the file to the browser.
graph = pydotplus.graph_from_dot_data(dot_data)
graph.write_png('Decision_Tree.png')
files.download('Decision_Tree.png')