In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Load the credit dataset from the CSV file in the working directory.
credit_df = pd.read_csv(filepath_or_buffer="credit.csv")

In [None]:
# Preview the first ten rows of the raw data.
credit_df.head(n=10)

In [None]:
# Column names, non-null counts and dtypes of the raw data.
credit_df.info()

In [None]:
# Summary statistics of the columns.
credit_df.describe()

In [None]:
# (number of rows, number of columns)
credit_df.shape

In [None]:
# Number of rows per value of the 'default' column (used as the response below).
credit_df['default'].value_counts()

In [None]:
# For discussion only: list the distinct 'checking_balance' values in sorted order
# to illustrate how the encoding will be done.
checking_levels = credit_df['checking_balance'].unique()
np.sort(checking_levels)

In [None]:
# Many columns are of type object (i.e. strings); these need to be converted to
# integer codes before modelling.
credit_df.info()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder that maps the distinct values of a column to integer codes.
le = LabelEncoder()

In [None]:

# Encode every string-valued column of credit_df as integer codes (in place).
#
# A fresh LabelEncoder is fitted for each column and stored in `label_encoders`
# (column name -> fitted encoder), so every column's mapping stays available after
# the loop, e.g. for `label_encoders[col].inverse_transform(...)`. Refitting one
# shared encoder would keep only the classes of the last encoded column.
label_encoders = {}
for feature in credit_df.columns:  # Loop through all columns in the dataframe
    column = credit_df[feature]
    # Treat both classic object columns and pandas' dedicated string dtype as
    # categorical text; comparing the dtype to 'object' alone misses the latter.
    if pd.api.types.is_object_dtype(column) or pd.api.types.is_string_dtype(column):
        le = LabelEncoder()
        # astype(str) turns any missing value into the literal string 'nan', which
        # is then encoded like any other category.
        credit_df[feature] = le.fit_transform(column.astype(str))
        label_encoders[feature] = le
        print(feature)
        # classes_ is sorted and each class is encoded as its position in classes_,
        # so the mapping can be printed without calling transform again. Plain
        # str/int keep the output readable regardless of the NumPy version.
        print({str(label): code for code, label in enumerate(le.classes_)})
        print()
            

In [None]:
#for feature in credit_df.columns: # Loop through all columns in the dataframe
#    if credit_df[feature].dtype == 'object': # Only apply for columns with categorical strings
#        credit_df[feature] = pd.Categorical(credit_df[feature]).codes # Replace strings with an integer
    

In [None]:
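# Several methods exist for encoding categorical (string) columns:
#
# 1. LabelEncoder (used above)
#
# OR
#
# 2. Manual encoding, as shown below. Its advantage is that you choose the integer
#    assigned to each category, so a meaningful order can be preserved.
# credit_df['checking_balance'] = credit_df['checking_balance'].replace({'< 0 DM':0, '1 - 200 DM':1, '> 200 DM':2, 'unknown':3 })
#
# 3. pd.get_dummies (one-hot encoding) can also be used; it will have a different
#    impact on the model.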


In [None]:
# Re-inspect the column dtypes after the encoding step.
credit_df.info()

In [None]:
# Preview the first ten rows after the encoding step.
credit_df.head(n=10)

In [None]:
# Response variable y: the 'default' column.
y = credit_df['default']

# Feature matrix X: every column except 'default'.
X = credit_df.drop(columns='default')

In [None]:
# Splitting the data into train and test
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.20,
)

In [None]:
# Decision tree classifier with its depth capped at 10 (no particular reason for
# this value) and a fixed seed so the fitted tree is reproducible.
dt_model = DecisionTreeClassifier(random_state=0, max_depth=10)

In [None]:
# Fit the tree on the training split.
dt_model.fit(X_train, y_train)

In [None]:
# Predict the response for the held-out test split.
y_pred = dt_model.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Print the test accuracy, then the confusion matrix
# (rows: actual class, columns: predicted class).
for metric in (accuracy_score, confusion_matrix):
    print(metric(y_test, y_pred))

In [None]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# Accuracy on the training split versus the test split.
for heading, features, target in (
    ('Training score', X_train, y_train),
    ('Test score', X_test, y_test),
):
    print(heading)
    print(dt_model.score(features, target))

In [None]:

# Importance of each feature in building the tree: the (normalized) total
# reduction of the split criterion brought by that feature.
importances = pd.Series(dt_model.feature_importances_, index=X_train.columns, name="Imp")
print(importances.to_frame())

In [None]:
# Display names for the two target classes, used when exporting the tree.
# NOTE(review): assumes encoded class 0 means 'No' and 1 means 'Yes' — confirm
# against the mapping of 'default'.
class_label = ['No', 'Yes']

#### The following is not useful for visualizing large trees

In [None]:
# *****
# ***** This code can be used to print small trees. Large trees are not properly visible *****
# ***** Therefore the code is commented
# *****

#from sklearn.tree import plot_tree
#plt.figure(figsize=(70,35))
#a=plot_tree(dt_model,feature_names=list(X_train), class_names = list(class_label),fontsize=6)

In [None]:
# Visualize tree  using Graphviz website 

from IPython.display import Image  
#import pydotplus as pydot
from sklearn import tree
from os import system

# Write the fitted tree to 'credit_tree_1.dot' in Graphviz DOT format.
# The `with` block closes the file even if the export raises, so the handle is
# never leaked. export_graphviz returns None when `out_file` is supplied, so its
# result is not kept.
with open('credit_tree_1.dot', 'w') as Credit_Tree_File:
    tree.export_graphviz(
        dt_model,
        out_file=Credit_Tree_File,
        feature_names=list(X_train),   # column names of the training features
        class_names=list(class_label),
    )


To view the tree, paste the contents of `credit_tree_1.dot` into http://webgraphviz.com/

# Hyperparameter tuning


In [None]:
#With Hyper Parameters Tuning
from sklearn.model_selection import GridSearchCV

# Candidate tree depths to evaluate: 1 through 19.
params = dict(max_depth=range(1, 20))

# Number of folds for k-fold cross-validation.
n_folds = 5

# Estimator whose hyperparameter is being tuned.
model = DecisionTreeClassifier(random_state=0)

# Grid search over the candidates; training-fold scores are kept as well
# (return_train_score=True) so they are available in cv_results_.
model1 = GridSearchCV(estimator=model, param_grid=params, cv=n_folds, return_train_score=True)
model1.fit(X_train, y_train)

In [None]:
# Capture the per-candidate cross-validation results of the grid search.
scores = model1.cv_results_


In [None]:
# Plot the mean training-fold and mean held-out-fold accuracy of the grid search
# against max_depth.
fig, ax = plt.subplots()
depths = scores["param_max_depth"]
for result_key, curve_label in (
    ("mean_train_score", "training accuracy"),
    ("mean_test_score", "test accuracy"),
):
    ax.plot(depths, scores[result_key], label=curve_label)
ax.set_xlabel("max_depth")
ax.set_ylabel("Accuracy")
ax.legend()
plt.show()

In [None]:
# Report the best candidate and its mean cross-validated score.
for caption, value in (
    ("Best Hyper Parameters:", model1.best_params_),
    ("Best Score:", model1.best_score_),
):
    print(caption, value)

### Tune multiple parameters

In [None]:
# Tune MULTIPLE parameters with Hyper Parameters Tuning

# Hyperparameter combinations to evaluate: split criterion x depth x minimum
# number of samples required to split a node.
params = {
    'criterion': ('gini', 'entropy'),
    'max_depth': range(2, 10),
    'min_samples_split': range(2, 10),
}

# Estimator whose hyperparameters are being tuned.
model = DecisionTreeClassifier(random_state=0)

# 5-fold cross-validated grid search over every combination.
model1 = GridSearchCV(estimator=model, param_grid=params, cv=5)
model1.fit(X_train, y_train)

In [None]:
# Report the best combination and its mean cross-validated score.
for caption, value in (
    ("Best Hyper Parameters:", model1.best_params_),
    ("Best Score:", model1.best_score_),
):
    print(caption, value)

# Ensemble Techniques

In [None]:
# Depth-5 decision tree, constructed and fitted in one chained call
# (fit returns the estimator itself).
dt_model = DecisionTreeClassifier(random_state=0, max_depth=5).fit(X_train, y_train)
dt_model

In [None]:

#print (pd.DataFrame(dt_model.feature_importances_, columns = ["Imp"], index = X_train.columns))


In [None]:
# Predictions of the depth-5 tree on the test split.
y_pred = dt_model.predict(X_test)

In [None]:
# Test accuracy of the single tree, for comparison with the ensembles below.
dt_model.score(X_test , y_test)

## Ensemble learning - Bagging Classifier 

In [None]:
from sklearn.ensemble import BaggingClassifier

# Bagging ensemble of 20 copies of the decision tree `dt_model`.
# The base model is passed positionally on purpose: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was removed
# in 1.4, whereas the first positional parameter is the base model in every
# version, so this form works on both old and new releases.
bgcl = BaggingClassifier(dt_model, n_estimators=20, random_state=10)

bgcl = bgcl.fit(X_train, y_train)


In [None]:
# Predictions and test accuracy of the bagging ensemble.
y_pred = bgcl.predict(X_test)
bgcl.score(X_test, y_test)

## Ensemble  learning - RandomForest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 20 trees, constructed and fitted in one chained call
# (fit returns the estimator itself).
rfcl = RandomForestClassifier(random_state=0, n_estimators=20).fit(X_train, y_train)


In [None]:
# Predictions and test accuracy of the random forest.
test_pred = rfcl.predict(X_test)
rfcl.score(X_test , y_test)

# Hyperparameter tuning of RandomForest

In [None]:
# Base random forest whose hyperparameters are tuned.
rf = RandomForestClassifier(random_state=10)

# Candidate values: 10 forest sizes (60..150) x 3 depths x 2 feature-subset sizes.
param_grid = dict(
    n_estimators=range(60, 160, 10),
    max_depth=[4, 8, 12],
    max_features=[4, 5],
)

# 5-fold cross-validated grid search; n_jobs=-1 runs the fits on all processors.
grid_search = GridSearchCV(rf, param_grid, cv=5, n_jobs=-1)

In [None]:
# Fit the grid search on the training split (fits every candidate in param_grid
# on each of the cross-validation folds).
grid_search.fit(X_train, y_train)

In [None]:
# Print the best mean cross-validated accuracy found by the grid search and the
# hyperparameter combination that achieved it.
print('We can get accuracy of',grid_search.best_score_,'using',grid_search.best_params_)