In [36]:
import pandas as pd
import numpy as np
from sklearn import tree
import graphviz
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.cross_validation import cross_val_score
from IPython.display import IFrame

# Decision tree demo for Titanic data

# read data (replace with your own path)
df = pd.read_csv('phishing.csv', sep=';')

# drop rows with missing values
df.dropna(axis=0, how='any', inplace=True)
df.head(10)



Unnamed: 0,having_IP_Address,URL_Length,Shortining_Service,having_At_Symbol,double_slash_redirecting,Prefix_Suffix,having_Sub_Domain,SSLfinal_State,Domain_registeration_length,Favicon,...,popUpWindow,Iframe,age_of_domain,DNSRecord,web_traffic,Page_Rank,Google_Index,Links_pointing_to_page,Statistical_report,Result
0,-1,1,1,1,-1,-1,-1,-1,-1,1,...,1,1,-1,-1,-1,-1,1,1,-1,-1
1,1,1,1,1,1,-1,0,1,-1,1,...,1,1,-1,-1,0,-1,1,1,1,-1
2,1,0,1,1,1,-1,-1,-1,-1,1,...,1,1,1,-1,1,-1,1,0,-1,-1
3,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,1,-1,1,-1,1,-1
4,1,0,-1,1,1,-1,1,1,-1,1,...,-1,1,-1,-1,0,-1,1,1,1,1
5,-1,0,-1,1,-1,-1,1,1,-1,1,...,1,1,1,1,1,-1,1,-1,-1,1
6,1,0,-1,1,1,-1,-1,-1,1,1,...,1,1,1,-1,-1,-1,1,0,-1,-1
7,1,0,1,1,1,-1,-1,-1,1,1,...,1,1,-1,-1,0,-1,1,0,1,-1
8,1,0,-1,1,1,-1,1,1,-1,1,...,1,1,1,-1,1,1,1,0,1,1
9,1,1,-1,1,1,-1,-1,1,-1,1,...,1,1,1,-1,0,-1,1,0,1,-1


In [37]:
# save column headings into a list
colnames = df.columns.get_values()
colnames

array(['having_IP_Address', 'URL_Length', 'Shortining_Service',
       'having_At_Symbol', 'double_slash_redirecting', 'Prefix_Suffix',
       'having_Sub_Domain', 'SSLfinal_State',
       'Domain_registeration_length', 'Favicon', 'port', 'HTTPS_token',
       'Request_URL', 'URL_of_Anchor', 'Links_in_tags', 'SFH',
       'Submitting_to_email', 'Abnormal_URL', 'Redirect', 'on_mouseover',
       'RightClick', 'popUpWindow', 'Iframe', 'age_of_domain', 'DNSRecord',
       'web_traffic', 'Page_Rank', 'Google_Index',
       'Links_pointing_to_page', 'Statistical_report', 'Result'], dtype=object)

In [38]:
X = df.loc[:,'having_IP_Address':'Statistical_report']
Y = df.loc[:,'Result']

In [39]:
# decision tree classification
classifier = tree.DecisionTreeClassifier(max_depth=3)
classifier.fit(X,Y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [40]:
# visualize
dot_data = tree.export_graphviz(classifier, out_file=None, feature_names=colnames[:30], class_names=['no','yes'])
graph = graphviz.Source(dot_data) 
graph.render("phishing")

IFrame("phishing.pdf", width=900, height=700)

In [41]:
# predict
Y_pred = classifier.predict(X)

# output confusion matrix
cm = confusion_matrix(Y, Y_pred)
print("Confusion matrix:\n",cm)

accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[1][1]+cm[0][1]+cm[1][0])
print("Accuracy calculated from the training set = %.3f" % (accuracy))

print(classification_report(Y, Y_pred, target_names=['no', 'yes']))

Confusion matrix:
 [[4425  473]
 [ 545 5612]]
Accuracy calculated from the training set = 0.908
             precision    recall  f1-score   support

         no       0.89      0.90      0.90      4898
        yes       0.92      0.91      0.92      6157

avg / total       0.91      0.91      0.91     11055



In [42]:
# cross-validate
# number of folds
k = 10
scores = cross_val_score(estimator=classifier,
                        X=X,
                        y=Y,
                        scoring="accuracy",
                        cv=k)
print("Accuracies from %d individual folds:" % k)
print(scores)
print("Accuracy calculated using %d-fold cross validation = %.3f" % (k, scores.mean()))

Accuracies from 10 individual folds:
[ 0.88607595  0.88698011  0.90506329  0.89783002  0.90777577  0.92224231
  0.90506329  0.90135747  0.91394928  0.91485507]
Accuracy calculated using 10-fold cross validation = 0.904


In [43]:
# Experiment with min_samples_leaf to find the best model.

for leaf in range(201,-1,-20):
    print("min_samples_leaf: %d" % leaf)
    classifier = tree.DecisionTreeClassifier(min_samples_leaf=leaf)
    classifier.fit(X,Y)
    Y_pred = classifier.predict(X)
    cm = confusion_matrix(Y, Y_pred)
    accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[1][1]+cm[0][1]+cm[1][0])
    print("  Accuracy calculated from the training set = %.3f" % (accuracy))
    
    scores = cross_val_score(estimator=classifier,
                        X=X,
                        y=Y,
                        scoring="accuracy",
                        cv=k)
    print("  Accuracy calculated using %d-fold cross validation = %.3f" % (k, scores.mean()))
    
    

min_samples_leaf: 201
  Accuracy calculated from the training set = 0.923
  Accuracy calculated using 10-fold cross validation = 0.918
min_samples_leaf: 181
  Accuracy calculated from the training set = 0.922
  Accuracy calculated using 10-fold cross validation = 0.918
min_samples_leaf: 161
  Accuracy calculated from the training set = 0.925
  Accuracy calculated using 10-fold cross validation = 0.917
min_samples_leaf: 141
  Accuracy calculated from the training set = 0.925
  Accuracy calculated using 10-fold cross validation = 0.918
min_samples_leaf: 121
  Accuracy calculated from the training set = 0.926
  Accuracy calculated using 10-fold cross validation = 0.920
min_samples_leaf: 101
  Accuracy calculated from the training set = 0.929
  Accuracy calculated using 10-fold cross validation = 0.923
min_samples_leaf: 81
  Accuracy calculated from the training set = 0.930
  Accuracy calculated using 10-fold cross validation = 0.924
min_samples_leaf: 61
  Accuracy calculated from the trai