In [65]:
import pandas as pd
import numpy as np
from sklearn import tree
import graphviz
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from IPython.display import IFrame

# Load the phishing data set; the file uses ';' as its field separator.
DATA_PATH = './phishing.csv'
df = pd.read_csv(DATA_PATH, sep=';')

# Optional checks used while exploring the data (left disabled):
#   df.dropna(axis=0, how='any', inplace=True)
#   df.describe()
#   df.head(10)
#   df.dtypes

In [5]:
# All column names of the frame as a NumPy array.
# Index.get_values() is deprecated (pandas 0.25) and removed in pandas 1.0;
# to_numpy() returns the same array of names without the warning.
colnames = df.columns.to_numpy()

# explanatory variables (dataframe): label-based column slice, so both the
# first and the last named column are included
X = df.loc[:, 'having_IP_Address':'Statistical_report']

# response variable (series)
Y = df.loc[:, 'Result']

  """Entry point for launching an IPython kernel.


In [58]:
# Experiment with min_samples_leaf to find the best model.
#
# Every candidate tree is fitted on the full data set; its accuracy on that
# same (training) data is printed next to the k-fold cross-validated accuracy.
# The setting with the highest cross-validated accuracy is then refitted and
# exposed as `classifier` / `Y_pred`, so later cells use the selected model
# rather than whatever the last loop iteration happened to leave behind.

# number of folds for cross-validation
k = 10

# depth is capped to keep the tree small enough to read
MAX_DEPTH = 3

# Fixed seed: scikit-learn permutes the features at every split, so equally
# good splits could otherwise be resolved differently from run to run.
SEED = 0

best_leaf = None
best_cv_accuracy = -1.0

# candidates tried, largest first: 2001, 1501, 1001, 501, 1
for leaf in range(2001, -1, -500):
    print("min_samples_leaf: %d" % leaf)
    candidate = tree.DecisionTreeClassifier(max_depth=MAX_DEPTH,
                                            min_samples_leaf=leaf,
                                            random_state=SEED)
    candidate.fit(X, Y)

    # training accuracy = correctly classified / all samples (2x2 matrix)
    cm = confusion_matrix(Y, candidate.predict(X))
    accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[1][1]+cm[0][1]+cm[1][0])
    print("  Accuracy calculated from the training set = %.3f" % (accuracy))

    # cross_val_score clones the estimator for every fold, so the fit above
    # does not leak into the validation estimate
    scores = cross_val_score(estimator=candidate,
                        X=X,
                        y=Y,
                        scoring="accuracy",
                        cv=k)
    cv_accuracy = scores.mean()
    print("  Accuracy calculated using %d-fold cross validation = %.3f" % (k, cv_accuracy))

    # strict '>' keeps the earlier (larger-leaf, simpler) tree on ties
    if cv_accuracy > best_cv_accuracy:
        best_leaf = leaf
        best_cv_accuracy = cv_accuracy

# Refit the selected setting; the following cells read `classifier` and `Y_pred`.
classifier = tree.DecisionTreeClassifier(max_depth=MAX_DEPTH,
                                         min_samples_leaf=best_leaf,
                                         random_state=SEED)
classifier.fit(X, Y)
Y_pred = classifier.predict(X)
print("Selected min_samples_leaf: %d (cross-validated accuracy = %.3f)" % (best_leaf, best_cv_accuracy))

min_samples_leaf: 2001
  Accuracy calculated from the training set = 0.889
  Accuracy calculated using 10-fold cross validation = 0.889
min_samples_leaf: 1501
  Accuracy calculated from the training set = 0.889
  Accuracy calculated using 10-fold cross validation = 0.889
min_samples_leaf: 1001
  Accuracy calculated from the training set = 0.889
  Accuracy calculated using 10-fold cross validation = 0.889
min_samples_leaf: 501
  Accuracy calculated from the training set = 0.889
  Accuracy calculated using 10-fold cross validation = 0.885
min_samples_leaf: 1
  Accuracy calculated from the training set = 0.908
  Accuracy calculated using 10-fold cross validation = 0.904


In [63]:
# Best value for min_samples_leaf was the smallest (1) when max depth of tree was 3 (to ensure a simple/compact tree)
# Validation accuracy was 90.4 %

# Visualize
# Feature names are taken from the training frame itself, so the node labels
# always match the columns the tree was fitted on (no hard-coded column count).
# NOTE(review): export_graphviz assigns class_names in ascending order of the
# label values - confirm that the smaller 'Result' value really means 'legitimate'.
dot_data = tree.export_graphviz(classifier, out_file=None, feature_names=list(X.columns), class_names=['legitimate', 'phishing'])
graph = graphviz.Source(dot_data)
# render() writes the drawing to disk; the PDF it produces is embedded below
graph.render("phishing_tree")
display(IFrame("./phishing_tree.pdf", width=900, height=700))

In [64]:
# Summarise how the fitted tree's predictions (Y_pred) compare with the true
# labels (Y): raw confusion matrix first, then per-class precision/recall/F1.
label_names = ['legitimate', 'phishing']

cm = confusion_matrix(Y, Y_pred)
print("Confusion matrix:\n", cm)

report = classification_report(Y, Y_pred, target_names=label_names)
print(report)

Confusion matrix:
 [[4425  473]
 [ 545 5612]]
              precision    recall  f1-score   support

  legitimate       0.89      0.90      0.90      4898
    phishing       0.92      0.91      0.92      6157

    accuracy                           0.91     11055
   macro avg       0.91      0.91      0.91     11055
weighted avg       0.91      0.91      0.91     11055



# INSTRUCTIONS

    IF website is using https - website a little more likely legitimate
        AND IF <33 % of the < a > tags point to a different domain than the website or are empty - website very likely legitimate
        AND IF >33 % of the < a > tags point to a different domain or are empty
            AND IF <17 % of the links are in < Meta >, < Script > and < Link > tags - website very likely legitimate

    IF website is not using https - website a little more likely phishing
        AND IF <33 % of the < a > tags point to a different domain than the website or are empty - website more likely legitimate
            AND IF Domain Name Part Includes (-) Symbol - website likely legitimate
        AND IF >33 % of the < a > tags point to a different domain or are empty - website very likely phishing
            AND IF website has no traffic or is not listed in the Alexa database - very likely phishing
            AND IF website has low/high traffic - still likely phishing
