In [None]:
import graphviz
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import tree
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.tree import DecisionTreeClassifier

## Step 1
Apply the Scikit-Learn AdaBoost Classifier code to the dataset for classifying phishing vs benign URLs, using all features at once, and upload your .ipynb file. Use a Decision Tree Classifier as your base classifier. Use decision trees of varying depths (1, 3, 6, 9, 12, 15, 18, for both the gini and entropy criteria) for the base classifier.

In [None]:
# Load the URL dataset, clean it, encode the target, and build the train/test splits.
# NOTE(review): this is a Colab / Google Drive path; change DATA_PATH when running elsewhere.
DATA_PATH = '/content/drive/MyDrive/2024 Spring/ML/Lecture 5/DataSetForPhishingVSBenignUrl.csv'
TARGET_COL = 'URL_Type_obf_Type'

df = pd.read_csv(DATA_PATH)

# Treat +/-inf as missing, then fill every missing value with the most frequent
# value (mode) of its column. Reassigning instead of using inplace=True keeps
# the cell safe to re-run.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.fillna(df.mode().iloc[0])

# Encode the string class labels as integers.
le = LabelEncoder()
df[TARGET_COL] = le.fit_transform(df[TARGET_COL])

# Take the class names from the encoder so that class_names[i] is the label
# encoded as i. Series.unique() returns labels in order of first appearance,
# which does not have to match the encoder's sorted order.
class_names = le.classes_

df = df.astype('float32')

y = df[TARGET_COL]
X = df.drop(TARGET_COL, axis=1)

# Stratified 80/20 split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)

# Standardized copies of the features; the scaler is fitted on the training
# split only and then applied to the test split.
sc = StandardScaler()
X_train_std = pd.DataFrame(sc.fit_transform(X_train), columns=X_train.columns)
X_test_std = pd.DataFrame(sc.transform(X_test), columns=X_test.columns)

In [None]:
# Column labels of the feature matrix X (the target column has already been dropped).
feature_names = X.columns

In [None]:
# Baseline model: one decision tree with a fixed seed and no depth limit passed,
# fitted on the unscaled training split. fit() returns the estimator itself,
# so construction and fitting can be chained.
tree_classifier = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
tree_classifier

In [None]:
# Sanity check: (number of rows, number of feature columns) of the standardized training split.
X_train_std.shape

(29365, 79)

In [None]:
# AdaBoost with Gini-criterion decision trees of increasing depth as the base
# learner. Collects one (depth, test accuracy) pair per depth.
gini_results = []

depths = [1, 3, 6, 9, 12, 15, 18]

for depth in depths:
    tree_gini = DecisionTreeClassifier(criterion='gini', max_depth=depth, random_state=0)

    # `estimator=` replaces the old `base_estimator=` keyword (deprecated in
    # scikit-learn 1.2, removed in 1.4) and matches the entropy cell.
    # The ensemble is seeded as well: AdaBoost re-seeds each cloned base tree
    # from its own random_state, so seeding only the tree does not make the
    # run reproducible. Accuracies may differ slightly from earlier unseeded runs.
    abc = AdaBoostClassifier(estimator=tree_gini, random_state=0)
    model = abc.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    gini_results.append((depth, accuracy))

print("Gini: ")
for depth, accuracy in gini_results:
    print(f"Depth: {depth}, Accuracy: {accuracy}")



Gini: 
Depth: 1, Accuracy: 0.6473712884772541
Depth: 3, Accuracy: 0.7044402070280578
Depth: 6, Accuracy: 0.901116861890493
Depth: 9, Accuracy: 0.9723508580768183
Depth: 12, Accuracy: 0.9758921274856988
Depth: 15, Accuracy: 0.9773903568509943
Depth: 18, Accuracy: 0.9795695995641515


In [None]:
# AdaBoost with entropy-criterion decision trees of increasing depth as the
# base learner. Collects one (depth, test accuracy) pair per depth.
entropy_results = []

depths = [1, 3, 6, 9, 12, 15, 18]

for depth in depths:
    tree_entropy = DecisionTreeClassifier(criterion='entropy', max_depth=depth, random_state=0)

    # The ensemble is seeded as well: AdaBoost re-seeds each cloned base tree
    # from its own random_state, so seeding only the tree does not make the
    # run reproducible. Accuracies may differ slightly from earlier unseeded runs.
    abc = AdaBoostClassifier(estimator=tree_entropy, random_state=0)
    model = abc.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    entropy_results.append((depth, accuracy))

print("Entropy: ")
for depth, accuracy in entropy_results:
    print(f"Depth: {depth}, Accuracy: {accuracy}")

Entropy: 
Depth: 1, Accuracy: 0.6453282484336693
Depth: 3, Accuracy: 0.6921819667665486
Depth: 6, Accuracy: 0.9137837101607191
Depth: 9, Accuracy: 0.9708526287115228
Depth: 12, Accuracy: 0.9772541541814219
Depth: 15, Accuracy: 0.9777989648597113
Depth: 18, Accuracy: 0.9724870607463906


## Step 2
Compare your results with those you obtained last week when you used the Scikit-Learn Decision Tree Classifier (Week 5 assignment).

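In the Week 5 assignment, the best accuracy for both the Gini and Entropy decision trees was around 78%, at depth 6. AdaBoost improves on this substantially: at depth 6 it already reaches about 90% (Gini) and 91% (Entropy), and it peaks at about 98%, at depth 18 for Gini (0.980) and at depth 15 for Entropy (0.978). For Entropy, test accuracy drops slightly from depth 15 to depth 18 (0.978 → 0.972), which suggests that the deeper base trees may be starting to overfit.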

A regular decision tree is trained once, using recursive partitioning, and the weight of each sample remains unchanged during training. The tree is built on its own, with no earlier rounds whose performance could be taken into account.

AdaBoost employs ensemble learning and trains the decision trees iteratively. Each training round adjusts sample weights based on the performance of the previous classifier, giving more attention to previously misclassified samples, thereby constructing a strong classifier.