In [1]:
#Get plaintext secrets

from configparser import ConfigParser
parser = ConfigParser()
_ = parser.read('secrets.cfg')
neo4j_useraccount = parser.get('my_data', 'neo4j_useraccount')
neo4j_password = parser.get('my_data', 'neo4j_password')


In [13]:
import pandas as pd
import configparser
import os

#sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, matthews_corrcoef
from sklearn.metrics import roc_curve,RocCurveDisplay,confusion_matrix

#Smote modules
from imblearn.over_sampling import SMOTE,BorderlineSMOTE


#import visualization libraries
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import graphviz


#set settings for panda dataframes
pd.set_option('display.width', 0)
pd.set_option('display.max_colwidth', 0)
pd.set_option('display.max_rows', 50)

In [14]:
from graphdatascience import GraphDataScience

host = "bolt://172.17.0.2:7687"
user = neo4j_useraccount
password= neo4j_password

gds = GraphDataScience(host, auth=(user, password))

In [15]:
%%time
#Get all the processes
all_processes= gds.run_cypher("""
MATCH (p:Process)
RETURN p.CommandLine_length as CommandLine_length,p.network_degree as Network_degree,p.direct_children as Direct_children,p.ProcessGuid as ProcessGuid,p.numbers_of_executed_commands as Numbers_of_executed_commands,p.numbers_of_files_created as Numbers_of_files_created,p.malicious as Malicious
""") 

all_processes.head()

CPU times: user 42.6 s, sys: 4.36 s, total: 46.9 s
Wall time: 55.5 s


Unnamed: 0,CommandLine_length,Network_degree,Direct_children,ProcessGuid,Numbers_of_executed_commands,Numbers_of_files_created,Malicious
0,59.0,0.0,5.0,28f0b276-992e-4b7b-9b4f-82394971205f,0.0,0.0,0
1,55.0,0.0,0.0,e931d104-d525-407f-8d9e-89000a32be31,0.0,0.0,0
2,29.0,0.0,0.0,67b2d286-8932-40ca-836a-04deb6707f0b,0.0,0.0,0
3,49.0,0.0,2.0,faf7dd7a-af5b-47ad-a81e-81681dd7f57b,0.0,9.0,0
4,41.0,0.0,0.0,54988315-be0e-40a1-a89a-aa8c54cbc688,0.0,0.0,0


In [16]:
df=all_processes.fillna(0)

In [17]:
#import libraries for running randomforest
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree


In [154]:
#Get the variables into x and y for readability
X=df[['CommandLine_length', 'Network_degree', 'Direct_children', 'Numbers_of_executed_commands', 'Numbers_of_files_created']]
y=df['Malicious']

In [155]:
from collections import Counter
# summarize class distribution
counter = Counter(y)
print(counter)

Counter({0: 1132007, 1: 13})


In [156]:
%%time
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)



CPU times: user 73.3 ms, sys: 3.87 ms, total: 77.2 ms
Wall time: 75.3 ms


In [157]:
%%time
w = 10 # The weight for the positive class

model=RandomForestClassifier(max_depth=4, random_state=0,class_weight={0: 1, 1: w})
model.fit(X_train, y_train)




CPU times: user 9.51 s, sys: 7 ms, total: 9.51 s
Wall time: 9.57 s


In [158]:
%%time
y_pred = model.predict(X_test)

CPU times: user 974 ms, sys: 1 ms, total: 975 ms
Wall time: 980 ms


In [159]:
%%time
# evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1score = 2*(precision * recall)/(precision + recall)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("f1score: ", f1score)

Accuracy:  0.9999970554112707
Precision:  1.0
Recall:  0.8
f1score:  0.888888888888889
CPU times: user 132 ms, sys: 980 µs, total: 133 ms
Wall time: 133 ms


In [167]:
import graphviz

from sklearn.tree import export_graphviz

fn=['CommandLine_length', 'Network_degree', 'Direct_children', 'Numbers_of_executed_commands', 'Numbers_of_files_created']
cn=['malign', 'malicious']

dot_data = export_graphviz(model.estimators_[0], out_file=None, 
                           feature_names=fn, 
                           class_names=cn, 
                           filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph.render('parent_child_malware_tree')

'parent_child_malware_tree.pdf'

In [168]:
#clf.plot_tree(model, num_trees=model.best_iteration)

In [169]:
#########try the same using smote 

In [179]:
#Get the variables into x and y for readability
X=df[['CommandLine_length', 'Network_degree', 'Direct_children', 'Numbers_of_executed_commands', 'Numbers_of_files_created']]
y=df['Malicious']

In [180]:
%%time
# split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=8)


CPU times: user 81.3 ms, sys: 3 ms, total: 84.3 ms
Wall time: 84.5 ms


In [181]:
#BorderlineSMOTE
#since we have very few malicious events
smote_border = BorderlineSMOTE(random_state = 42,k_neighbors=3)
X_train_smote, y_train_smote = smote_border.fit_resample(X_train, y_train)

In [182]:
counter = Counter(y_train_smote)
print(counter)

Counter({0: 792406, 1: 792406})


In [183]:

w = 1 # The weight for the positive class

model=RandomForestClassifier(max_depth=4, random_state=2,class_weight={0:1, 1: w})
model.fit(X_train_smote, y_train_smote)


In [184]:
y_pred = model.predict(X_test)

In [185]:
# evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1score = 2*(precision * recall)/(precision + recall)

print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("f1score: ", f1score)

Accuracy:  1.0
Precision:  1.0
Recall:  1.0
f1score:  1.0


In [177]:
#import visualization libraries
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [178]:
import graphviz

from sklearn.tree import export_graphviz

fn=['CommandLine_length', 'Network_degree', 'Direct_children', 'Numbers_of_executed_commands', 'Numbers_of_files_created']
cn=['malign', 'malicious']

dot_data = export_graphviz(model.estimators_[0], out_file=None, 
                           feature_names=fn, 
                           class_names=cn, 
                           filled=True, rounded=True, special_characters=True)
graph = graphviz.Source(dot_data)
graph.render('smote_parent_child_malware_tree')

'smote_parent_child_malware_tree.pdf'