In [0]:
import pandas as pd
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")
#Please ignore the warnings with version change

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive



Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [0]:
trainfile = r'/gdrive/My Drive/CIS508/ASN 2/Bank Data - TRAIN.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'/gdrive/My Drive/CIS508/ASN 2/Bank Data - TEST.csv'
testData = pd.read_csv(testfile)  #creates a dataframe


print(trainData.shape)
print(testData.shape)



(4521, 17)
(45211, 17)


In [0]:
trainData.head()

In [0]:
testData.head()

In [0]:
#List of Categorical Features
categoricalFeatures = ["job", "marital", 'education', "default", "housing", "loan", 'contact', 'month', "poutcome"]

#Combine Train and test for one Hot Encoding
combined_Data = pd.concat([trainData,testData], keys=[0,1])

#Do one Hot encoding for categorical features
combined_Data = pd.get_dummies(combined_Data,columns=categoricalFeatures)

#Separate Train data and test data
trainData = combined_Data.xs(0)
testData = combined_Data.xs(1)

y_train = trainData["y"]
X_train = trainData.drop(["y"], axis=1) #extracting training data without the target column
y_test = testData["y"]
X_test = testData.drop(["y"], axis=1) #extracting training data without the target column



In [0]:
#Decision Tree Classifier ========================================================================
#CONSTRUCT DEFAULT DECISION TREE AND OBTAIN RESPECTIVE ACCURACY 
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_predict=clf.predict(X_test)
print("accuracy Score (training) for Decision TreE:{0:6f}".format(clf.score(X_test,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))



accuracy Score (training) for Decision TreE:0.884497
Confusion Matrix for Decision Tree
[[37307  2615]
 [ 2607  2682]]


In [0]:
#Hyperparameter tuning done for decision tree classifier
#do random search
print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_leaf' : range(2,200,5),'max_depth': range(10,250,10),'criterion':['gini','entropy']}
clf_random = RandomizedSearchCV(clf,parameters,n_iter=15,cv=5)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)
#Now do grid search
print("GridSearchCV-Decision tree")
clf_grid = GridSearchCV(clf,parameters)
clf_grid.fit(X_train, y_train)
grid_parm1=clf_grid.best_params_
print(grid_parm1)



RandomizedSearchCV-Decision tree
{'min_samples_leaf': 32, 'max_depth': 80, 'criterion': 'entropy'}
GridSearchCV-Decision tree
{'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 72}


In [0]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier
#Construct Decision Trees using the best parameters
clf = DecisionTreeClassifier(**grid_parm)
clfr = DecisionTreeClassifier(**grid_parm1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clfr.fit(X_train,y_train)
clfr_predict = clfr.predict(X_test)




In [0]:
#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (training) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))

print("accuracy Score (training) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clfr_predict))

clf_cv_score = cross_val_score(clfr, X_train, y_train, cv=10, scoring="balanced_accuracy")
print(clf_cv_score)
print('\n')




accuracy Score (training) after hypertuning randomized search for Decision Tree:0.893765
accuracy Score (training) after hypertuning grid search for Decision Tree:0.896596
Confusion Matrix after hypertuning for Decision Tree
[[38694  1228]
 [ 3575  1714]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.93      0.96      0.94     39922
         yes       0.58      0.43      0.49      5289

    accuracy                           0.90     45211
   macro avg       0.75      0.69      0.72     45211
weighted avg       0.89      0.90      0.89     45211

[0.6371934  0.65346154 0.66269231 0.65096154 0.67567308 0.62336538
 0.66980769 0.64009615 0.65346154 0.63048077]




In [0]:
#Normal randomforest==============================================================================
#=================================================================================================
rand_parameters={'min_samples_leaf' : range(1,50,1),'max_depth': range(10,200,10),'max_features':[7,8,9],'n_estimators':[50,500,20]}
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_predict=rfc.predict(X_test)
print("accuracy Score (training) for RandomForest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
#do random search with cross-validation
rfc_random = RandomizedSearchCV(rfc,rand_parameters,n_iter=15,cv=5)
rfc_random.fit(X_train, y_train)
grid_parm_rfc=rfc_random.best_params_
print(grid_parm_rfc)
#create new classifier using the best parameters
rfc= RandomForestClassifier(**grid_parm_rfc)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
print("accuracy Score (training) after hypertuning for Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))
rfc_cv_score = cross_val_score(rfc, X_train, y_train, cv=5)
print(rfc_cv_score)
print('\n')



accuracy Score (training) for RandomForest:0.907500
Confusion Matrix for Random Forest:
[[39105   817]
 [ 3365  1924]]
{'n_estimators': 500, 'min_samples_leaf': 1, 'max_features': 9, 'max_depth': 150}
accuracy Score (training) after hypertuning for Random Forest:0.913074
Confusion Matrix after hypertuning for Random Forest:
[[39086   836]
 [ 3094  2195]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.93      0.98      0.95     39922
         yes       0.72      0.42      0.53      5289

    accuracy                           0.91     45211
   macro avg       0.83      0.70      0.74     45211
weighted avg       0.90      0.91      0.90     45211

[0.9038674  0.8960177  0.8960177  0.89712389 0.89933628]




In [0]:
#GRID SEARCH----------------------------------------
print("GridSearchCV-Random Forest")
rfc_grid = GridSearchCV(rfc,rand_parameters)
rfc_grid.fit(X_train, y_train)
grid_parm_rfc1=rfc_grid.best_params_
print(grid_parm_rfc1)

#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier 
rfcg= RandomForestClassifier(**grid_parm_rfc1)

rfcg.fit(X_train,y_train)
rfcg_predict = rfcg.predict(X_test)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testing) after hypertuning grid search for Random Forest:{0:6f}".format(rfcg.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Random Forest")
print(confusion_matrix(y_test,rfcg_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfcg_predict))

#clf_cv_score = cross_val_score(clf, X_train, y_train, cv=10, scoring="balanced_accuracy")
#print(clf_cv_score)
print('\n')

In [0]:
#Gradient Boosting================================================================================
search_grid={'n_estimators':[1,400,25],'learning_rate':[0.05,.5,.5],'min_samples_leaf' : range(1,200,10),'max_depth': range(20,350,15)}
abc =GradientBoostingClassifier()
abc.fit(X_train, y_train)
abc_predict=abc.predict(X_test)
print("accuracy Score (training) for Boosting:{0:6f}".format(abc.score(X_test,y_test)))
print("Confusion Matrix for boosting:")
print(confusion_matrix(y_test,abc_predict))
abc_random = RandomizedSearchCV(abc,search_grid,n_iter=15)
abc_random.fit(X_train, y_train)
grid_parm_abc=abc_random.best_params_
print(grid_parm_abc)
abc= GradientBoostingClassifier(**grid_parm_abc)
abc.fit(X_train,y_train)
abc_predict = abc.predict(X_test)
print("accuracy Score (training) after hypertuning for Boosting:{0:6f}".format(abc.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Boosting:")
print(confusion_matrix(y_test,abc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,abc_predict))
abc_cv_score = cross_val_score(abc, X_train, y_train, cv=5)
print(abc_cv_score)
print('\n')


accuracy Score (training) for Boosting:0.906350
Confusion Matrix for boosting:
[[38788  1134]
 [ 3100  2189]]
{'n_estimators': 1, 'min_samples_leaf': 61, 'max_depth': 275, 'learning_rate': 0.5}
accuracy Score (training) after hypertuning for Boosting:0.895269
Confusion Matrix after hypertuning for Boosting:
[[38829  1093]
 [ 3642  1647]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.91      0.97      0.94     39922
         yes       0.60      0.31      0.41      5289

    accuracy                           0.90     45211
   macro avg       0.76      0.64      0.68     45211
weighted avg       0.88      0.90      0.88     45211

[0.89834254 0.89269912 0.89712389 0.8960177  0.89933628]


