In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score
from sklearn.metrics import precision_recall_fscore_support
import graphviz 
import pydotplus
import pandas as pd
import collections
import numpy as np

In [2]:
file_data = "result/feature_vector_final_et_balanced.csv"
file_features = "result/feature_vector_index_map_final_et_balanced.csv"

In [3]:
output_folder = "result/"

In [6]:
features_df = pd.read_csv(file_features, index_col=0)

In [7]:
features_df.head()

Unnamed: 0,feature
0,insider
1,Insider
2,INSIDER
3,insiders
4,TradingThe


In [8]:
features_df.shape

(2012, 1)

In [9]:
TextFileReader = pd.read_csv(file_data,index_col = 0, chunksize=1000)  # the number of rows per chunk

dfList = []
for df in TextFileReader:
    dfList.append(df)

dataset = pd.concat(dfList,sort=False)

In [11]:
dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [12]:
features = features_df['feature'].tolist()
features.pop()
classes = ['non-infected', 'infected']

In [13]:
X = dataset.iloc[:,:-1].values
y = dataset.iloc[:,-1].values

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=35, test_size = .3, shuffle = True)
print (dataset.head())

     0    1    2    3    4    5    6    7    8    9  ...  2002  2003  2004  \
0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
1  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
2  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
3  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   
4  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...   0.0   0.0   0.0   

   2005  2006  2007  2008  2009  2010  2011  
0   0.0   0.0   0.0   0.0   0.0   0.0   1.0  
1   0.0   0.0   0.0   0.0   0.0   0.0   1.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   1.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   1.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   1.0  

[5 rows x 2012 columns]


In [15]:
clf = DecisionTreeClassifier(max_depth=35,min_samples_leaf=2,min_samples_split=2, class_weight={0:.9999, 1:.0001})
clf.fit(X_train,y_train)

DecisionTreeClassifier(class_weight={0: 0.9999, 1: 0.0001}, max_depth=35,
                       min_samples_leaf=2)

In [16]:
print("showing prediction results (first 10) [1=infected, 0 = non infected]:")
y_pred = clf.predict(X_test)
print(y_pred[:5])
a = X_test[:1]

showing prediction results (first 10) [1=infected, 0 = non infected]:
[0. 0. 0. 0. 0.]


In [17]:
print("a:")
print(a)

a:
[[0. 0. 0. ... 0. 0. 0.]]


In [18]:
y_pred

array([0., 0., 0., ..., 0., 0., 0.])

In [19]:
# shows the end point of the tree traverse by a sample
print("Returns the index of the leaf that each sample is predicted as:")
index_of_leaf = clf.apply(a)
print(index_of_leaf)

Returns the index of the leaf that each sample is predicted as:
[8]


In [20]:
#decision path shows the nodes of the tree that were traverse by the sample.
print("decision path:")
d_path = clf.decision_path(a)
print(d_path)

decision path:
  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	1
  (0, 6)	1
  (0, 7)	1
  (0, 8)	1


In [21]:
print("nodes in the decision path:")
n_d_path = np.unique(np.sort( d_path.indices))
print(n_d_path)

nodes in the decision path:
[0 1 2 3 4 5 6 7 8]


In [22]:
print("probability of each class:")
print(clf.predict_proba(a))

probability of each class:
[[1. 0.]]


In [23]:
print("Feature importances:")
feature_importances = clf.feature_importances_
print(feature_importances)

Feature importances:
[0.87141096 0.07545266 0.01912889 ... 0.         0.         0.        ]


In [24]:
#accuracy -number of instance correctly classified
acsc = accuracy_score(y_test, y_pred) 
print("accuracy (percentage  of instance classified correctly):")
print(acsc)

accuracy (percentage  of instance classified correctly):
0.9967438494934877


In [25]:
print("confusion matrix:")
cm = confusion_matrix(y_test, y_pred)
df_cm = pd.DataFrame([[cm[1][1], cm[0][0],cm[0][1], cm[1][0]]], 
                        index=[0],
                        columns=['True Positives','True Negatives', 'False Positives', 'False Negatives'])
print(df_cm)

confusion matrix:
   True Positives  True Negatives  False Positives  False Negatives
0             421            2334                2                7


In [26]:
#precision, recall, fscore, support
precision, recall, fscore, support = precision_recall_fscore_support(y_test, y_pred,average='binary')

In [27]:
df_metrics = pd.DataFrame([[acsc, precision, recall, fscore]], 
                        index=[0],
                        columns=['accuracy','precision', 'recall', 'fscore'])

In [28]:
print(df_metrics)

   accuracy  precision    recall    fscore
0  0.996744   0.995272  0.983645  0.989424


In [29]:
dot_data = tree.export_graphviz(clf, out_file=None, 
                         feature_names=features,  
                         class_names=classes,  
                         filled=True, rounded=True,  
                         special_characters=True)

In [30]:
graph = graphviz.Source(dot_data)
graph.render("wannacry")

'wannacry.pdf'

In [31]:
# Draw graph
graph = pydotplus.graph_from_dot_data(dot_data)  

In [32]:
# Show graph
graph.write_pdf("wannacry_et.pdf")

True

In [33]:
color = 'red'
edges = collections.defaultdict(list)

In [34]:
for edge in graph.get_edge_list():
    edges[edge.get_source()].append(int(edge.get_destination()))

In [35]:
for edge in edges:
    edges[edge].sort()  
    for i in range(2):
        dest = graph.get_node(str(edges[edge][i]))[0]
        if edges[edge][i] in n_d_path:
            dest.set_fillcolor(color)
graph.write_png('sample_et.png')

True