In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics,preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.ensemble import AdaBoostClassifier
from imblearn.under_sampling import RandomUnderSampler
from pydotplus import graph_from_dot_data
from IPython.display import Image

1. Using the Scikit-Learn library, train a Decision Tree Classifier on the attached PhishingVsBenignURL data set, using all of the features at once. (The dataset is originally from https://www.unb.ca/cic/datasets/url-2016.html )

In [2]:
# Single seed shared by the under-sampler, the train/test split and the models.
seed = 1
# The two URL classes kept for this binary classification task.
target_index = ['benign','phishing']
data = pd.read_csv('DataSetForPhishingVSBenignUrl.csv')
# Keep only benign/phishing rows, drop rows with missing values, renumber the index.
data = data[data.URL_Type_obf_Type.isin(target_index)].dropna().reset_index(drop=True)
rus = RandomUnderSampler(random_state=seed) # random balance sample function
standard = preprocessing.StandardScaler() # standardize data func

# Balance the two classes on the raw (unscaled) features. Every column except
# the last is used as a feature; the last column is the class label.
# NOTE: X now holds unscaled features; only X_train / X_test are standardized.
X,y = rus.fit_resample(data[data.columns[:-1]].values,
                       data[data.columns[-1]].values)

X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,random_state=seed,stratify=y)

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so that no information from the test set leaks into preprocessing.
X_train = standard.fit_transform(X_train)
X_test = standard.transform(X_test)

print("Total training phishing {} samples\nTotal training benign {} samples".format(
                np.sum(y_train == 'phishing'),np.sum(y_train == 'benign')))
print("Total test phishing {} samples\nTotal test benign {} samples".format(
                np.sum(y_test == 'phishing'),np.sum(y_test == 'benign')))

Total training phishing 1896 samples
Total training benign 1896 samples
Total test phishing 813 samples
Total test benign 813 samples


In [3]:
# Compare a single decision tree against AdaBoost built on the same tree
# configuration, for every (criterion, max_depth) combination.
# Results are indexed as [criterion index][depth index].
criterion = ['gini', 'entropy']
depths = [1,3,6,9,12,15,18]
tree_models = [[] for _ in criterion]
adas = [[] for _ in criterion]
accuracies = [[] for _ in criterion]      # single-tree test accuracy, in percent
ada_accuracies = [[] for _ in criterion]  # AdaBoost test accuracy, in percent
for j,c in enumerate(criterion):
    for i,d in enumerate(depths):
        tree = DecisionTreeClassifier(criterion=c,max_depth=d,random_state=seed)
        # The base tree is passed positionally: the keyword was renamed from
        # `base_estimator` to `estimator` in scikit-learn 1.2 (and the old name
        # removed in 1.4), while the first positional slot works on both.
        ada = AdaBoostClassifier(tree,
                                 n_estimators=500,
                                 learning_rate=0.1,random_state=seed)
        tree.fit(X_train, y_train)
        ada.fit(X_train,y_train)
        tree_models[j].append(tree)
        adas[j].append(ada)
        accuracies[j].append(np.round(tree.score(X_test,y_test)*100,2))
        ada_accuracies[j].append(np.round(ada.score(X_test,y_test)*100,2))
        print("="*80)
        print(f"{c} impurity max tree depth {d} accuracy: {accuracies[j][i]}%")
        print(f"Adaboost with tree base depth {d} accuracy: {ada_accuracies[j][i]}%")
print("="*80)

gini impurity max tree depth 1 accuracy: 80.69%
Adaboost with tree base depth 1 accuracy: 96.0%
gini impurity max tree depth 3 accuracy: 90.84%
Adaboost with tree base depth 3 accuracy: 97.48%
gini impurity max tree depth 6 accuracy: 94.59%
Adaboost with tree base depth 6 accuracy: 96.92%
gini impurity max tree depth 9 accuracy: 94.53%
Adaboost with tree base depth 9 accuracy: 96.74%
gini impurity max tree depth 12 accuracy: 94.9%
Adaboost with tree base depth 12 accuracy: 96.43%
gini impurity max tree depth 15 accuracy: 95.08%
Adaboost with tree base depth 15 accuracy: 95.14%
gini impurity max tree depth 18 accuracy: 95.08%
Adaboost with tree base depth 18 accuracy: 95.08%
entropy impurity max tree depth 1 accuracy: 80.69%
Adaboost with tree base depth 1 accuracy: 95.88%
entropy impurity max tree depth 3 accuracy: 90.34%
Adaboost with tree base depth 3 accuracy: 97.36%
entropy impurity max tree depth 6 accuracy: 93.91%
Adaboost with tree base depth 6 accuracy: 97.05%
entropy impurity 

### Apply the Scikit-Learn AdaBoost Classifier to the dataset for classifying phishing vs. benign URLs, using all features at once, and upload your .ipynb file. Use a Decision Tree Classifier as your base classifier. Use decision trees of varying depths (1, 3, 6, 9, 12, 15, 18, for both the gini and entropy criteria) for the base classifier. Compare your results with those you obtained last week when you used the Scikit-Learn Decision Tree Classifier (Week 5 assignment).

* Using Gini impurity with tree depth 1, AdaBoost is substantially better than a single tree: 80.69% accuracy for the tree vs. 96.0% for AdaBoost. Similarly, at depths 3 and 6 AdaBoost is clearly better than the tree alone (90.84% vs. 97.48% at depth 3, and 94.59% vs. 96.92% at depth 6). Depths 6, 9 and 12 give very similar results to one another, with no large difference between them. However, as the tree depth increases, the accuracy of AdaBoost decreases while the accuracy of the single tree moves closer to it. At depths 15 and 18 the two give essentially the same result.
* Using entropy impurity with tree depths 1 and 3, AdaBoost is substantially better than a single tree: 80.69% accuracy for the tree vs. 95.88% for AdaBoost at depth 1, and 90.34% vs. 97.36% at depth 3. For depths 6, 9, 15 and 18, however, the results are very similar, with little or no meaningful difference, especially at depths 15 and 18. At larger depths entropy appears to be slightly better than Gini, but the difference is not significant. In addition, at larger depths (greater than 6) AdaBoost takes much longer to run than the single tree. In conclusion, when the depth is greater than or equal to 6 there is little reason to use AdaBoost, given its computational cost and the very similar accuracy. On the other hand, AdaBoost gives a substantial improvement for tree depths below 6, with the highest accuracy at depth 3.