In [1]:
# Import libraries

import warnings
warnings.filterwarnings('ignore')
from collections import Counter
import csv
import pandas as pd
import numpy as np
from sklearn import metrics, linear_model, tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
import time

In [2]:
# Locations of the training and testing dataset files
DATASET_PATH = "C:\\Users\\Marek\\PycharmProjects\\DP\\venv\\KDDTrain+.txt"
DATASET_PATH_TEST = "C:\\Users\\Marek\\PycharmProjects\\DP\\venv\\KDDTest+.txt"

# Column names applied to both files, in file order. The two trailing columns
# ("label" and "difficulty") are excluded from the feature set by later cells.
flow_data_headers = [
    "duration", "protocol", "service", "flag", "src-bytes", "dst-bytes", "land",
    "wrong-fragment", "urgent", "hot", "num-failed-logins", "logged-in",
    "num-compromised", "root-shell", "su-attempted", "num-root",
    "num-file-creations", "num-shells", "num-access-files", "num-outbound-cmds",
    "is-host-login", "is-guest-login", "count", "srv-count", "serror-rate",
    "srv-serror-rate", "rerror-rate", "srv-rerror-rate", "same-srv-rate",
    "diff-srv-rate", "srv-diff-host-rate", "dst-host-count",
    "dst-host-srv-count", "dst-host-same-srv-rate", "dst-host-diff-srv-rate",
    "dst-host-same-src-port-rate", "dst-host-srv-diff-host-rate",
    "dst-host-serror-rate", "dst-host-srv-serror-rate", "dst-host-rerror-rate",
    "dst-host-srv-rerror-rate", "label", "difficulty",
]

# Load both files with the same column names (training file first, then test)
flow_data, flow_data_test = (
    pd.read_csv(csv_path, names=flow_data_headers)
    for csv_path in (DATASET_PATH, DATASET_PATH_TEST)
)

In [3]:
# Keep only Probe attacks labeled - label other entries as "other".
# The replacement is applied to the "label" column only: a DataFrame-wide
# replace() scans every column and would silently rewrite any feature value
# that happens to equal one of these names.
non_probe_labels = ['rootkit', 'ftp_write', 'buffer_overflow', 'loadmodule', 'spy', 'land', 'warezclient', 'phf', 'normal', 'pod', 'back', 'neptune', 'perl', 'imap', 'warezmaster', 'multihop', 'teardrop', 'smurf', 'guess_passwd']
flow_data = flow_data.assign(label=flow_data["label"].replace(non_probe_labels, 'other'))

# Sanity check: only the probe attack names and "other" should remain
print (set(flow_data["label"]))

{'satan', 'portsweep', 'other', 'ipsweep', 'nmap'}


In [4]:
# Train data information: row count, column count and column names
n_observations, n_columns = flow_data.shape
print ("Number of observations ::", n_observations)
print ("Number of columns ::", n_columns)
print ("Headers ::", flow_data.columns.values)

Number of observations :: 125973
Number of columns :: 43
Headers :: ['duration' 'protocol' 'service' 'flag' 'src-bytes' 'dst-bytes' 'land'
 'wrong-fragment' 'urgent' 'hot' 'num-failed-logins' 'logged-in'
 'num-compromised' 'root-shell' 'su-attempted' 'num-root'
 'num-file-creations' 'num-shells' 'num-access-files' 'num-outbound-cmds'
 'is-host-login' 'is-guest-login' 'count' 'srv-count' 'serror-rate'
 'srv-serror-rate' 'rerror-rate' 'srv-rerror-rate' 'same-srv-rate'
 'diff-srv-rate' 'srv-diff-host-rate' 'dst-host-count'
 'dst-host-srv-count' 'dst-host-same-srv-rate' 'dst-host-diff-srv-rate'
 'dst-host-same-src-port-rate' 'dst-host-srv-diff-host-rate'
 'dst-host-serror-rate' 'dst-host-srv-serror-rate' 'dst-host-rerror-rate'
 'dst-host-srv-rerror-rate' 'label' 'difficulty']


In [5]:
# Random forest classifier, logistic regression, decision tree
# Training and validation using only the training dataset

# Fixed seed so the split and the tree-based models produce the same numbers
# on every Restart & Run All.
RANDOM_STATE = 42

# Features are all columns except the two trailing ones ("label",
# "difficulty"); the target is "label". 80% of the rows are used for training
# and 20% for validation.
train_x, test_x, train_y, test_y = train_test_split(flow_data[flow_data_headers[:-2]],
                                                    flow_data[flow_data_headers[-2]],
                                                    train_size=0.8,
                                                    test_size=0.2,
                                                    random_state=RANDOM_STATE)

# One-hot encoding of categorical data, use StandardScaler for other data.
# handle_unknown='ignore' encodes a category that is absent from the training
# split as all zeros instead of raising an error at predict time.
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(handle_unknown='ignore'),
                                        ["protocol", "service", "flag"])],
                                      remainder=StandardScaler())

# The three pipelines share one preprocessing object; each fit() refits it on
# the same training data.
rfc = make_pipeline(columnTransformer,
                    RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
# NOTE(review): max_iter=20 is low for lbfgs and warnings are suppressed in the
# import cell, so a non-convergence warning would not be visible - confirm.
mul_lr = make_pipeline(columnTransformer,
                       linear_model.LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=20))
dt = make_pipeline(columnTransformer,
                   tree.DecisionTreeClassifier(random_state=RANDOM_STATE))

# Train each model and report the CPU time (time.process_time) spent in fit()
for short_name, model in (('RFC', rfc), ('LR', mul_lr), ('DT', dt)):
    start = time.process_time()
    model.fit(train_x, train_y)
    print('Time to train ' + short_name + ': ' + str(time.process_time() - start) + 's')

# Row/column order of the confusion matrix and order of the per-class F1 scores
trafficLabels = ['satan', 'ipsweep', 'portsweep', 'nmap', 'other']

# Print out the results. Predictions are computed once per model and reused
# for the confusion matrix, accuracy and F1 score.
for display_name, model in (('Random forest', rfc),
                            ('Logistic regression', mul_lr),
                            ('Decision tree', dt)):
    predicted_y = model.predict(test_x)
    print(metrics.confusion_matrix(test_y, predicted_y, labels=trafficLabels))
    print(display_name + " Test Accuracy :: ", metrics.accuracy_score(test_y, predicted_y))
    print(display_name + " F1 Score :: ", metrics.f1_score(test_y, predicted_y, average=None, labels=trafficLabels))


Time to train RFC: 16.140625s
Time to train LR: 12.21875s
Time to train DT: 4.234375s
[[  700     1     0     0     7]
 [    0   719     0     2     0]
 [    0     0   552     0     0]
 [    0     3     0   298     1]
 [    0     0     1     0 22911]]
Random forest Test Accuracy ::  0.9994046437785274
Random forest F1 Score ::  [0.99431818 0.99584488 0.99909502 0.99003322 0.99980363]
[[  599     1     3     1   104]
 [    0   700     0     7    14]
 [    9     1   524     0    18]
 [    4     9     0   278    11]
 [   25    27     1     4 22855]]
Logistic regression Test Accuracy ::  0.9905139908712046
Logistic regression F1 Score ::  [0.89070632 0.95956134 0.97037037 0.93918919 0.99555691]
[[  701     0     4     0     3]
 [    0   720     0     1     0]
 [    1     0   551     0     0]
 [    0     3     0   299     0]
 [    3     0     0     0 22909]]
Decision tree Test Accuracy ::  0.9994046437785274
Decision tree F1 Score ::  [0.99221515 0.99722992 0.99548329 0.99335548 0.99986906]

In [6]:
# We can see that all the models have achieved very high evaluation metrics. 
# We will use the testing dataset later to see how much overfitting comes into play.

# With feature selection, training and validation using only the training dataset

# Fixed seed so the split and the tree-based models produce the same numbers
# on every Restart & Run All.
RANDOM_STATE = 42

# The last entry ("label") is the target; everything before it is a feature.
selected_features = ['dst-host-same-src-port-rate','dst-host-count','dst-host-rerror-rate', 'rerror-rate', 'dst-host-srv-diff-host-rate', 'service', 'dst-host-srv-count', 'srv-diff-host-rate', 'dst-host-same-srv-rate', 'count', 'flag', 'dst-host-diff-srv-rate', 'protocol', 'label']
flow_data_selected = flow_data[selected_features]

train_x, test_x, train_y, test_y = train_test_split(flow_data_selected[selected_features[:-1]],
                                                    flow_data_selected[selected_features[-1]],
                                                    train_size=0.8,
                                                    test_size=0.2,
                                                    random_state=RANDOM_STATE)

# One-hot encoding of categorical data, use StandardScaler for other data.
# handle_unknown='ignore' encodes a category that is absent from the data the
# encoder was fitted on as all zeros instead of raising an error at predict
# time.
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(handle_unknown='ignore'),
                                        ["protocol", "service", "flag"])],
                                      remainder=StandardScaler())

# The three pipelines share one preprocessing object; each fit() refits it on
# the same training data.
rfc = make_pipeline(columnTransformer,
                    RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
# NOTE(review): max_iter=20 is low for lbfgs and warnings are suppressed in the
# import cell, so a non-convergence warning would not be visible - confirm.
mul_lr = make_pipeline(columnTransformer,
                       linear_model.LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=20))
dt = make_pipeline(columnTransformer,
                   tree.DecisionTreeClassifier(random_state=RANDOM_STATE))

# Train each model and report the CPU time (time.process_time) spent in fit()
for short_name, model in (('RFC', rfc), ('LR', mul_lr), ('DT', dt)):
    start = time.process_time()
    model.fit(train_x, train_y)
    print('Time to train ' + short_name + ': ' + str(time.process_time() - start) + 's')

# Row/column order of the confusion matrix and order of the per-class F1 scores
trafficLabels = ['satan', 'ipsweep', 'portsweep', 'nmap', 'other']

# Predictions are computed once per model and reused for all three metrics.
for display_name, model in (('Random forest', rfc),
                            ('Logistic regression', mul_lr),
                            ('Decision tree', dt)):
    predicted_y = model.predict(test_x)
    print(metrics.confusion_matrix(test_y, predicted_y, labels=trafficLabels))
    print(display_name + " Test Accuracy :: ", metrics.accuracy_score(test_y, predicted_y))
    print(display_name + " F1 Score :: ", metrics.f1_score(test_y, predicted_y, average=None, labels=trafficLabels))

Time to train RFC: 15.21875s
Time to train LR: 1.203125s
Time to train DT: 0.953125s
[[  757     0     0     0     8]
 [    1   734     0     5     6]
 [    0     0   595     0     4]
 [    0     8     0   304     4]
 [    2     1     0     0 22766]]
Random forest Test Accuracy ::  0.9984520738241714
Random forest F1 Score ::  [0.99278689 0.98589657 0.99664992 0.9728     0.99945124]
[[  605     0     0     0   160]
 [    0   718     0    11    17]
 [    2     0   582     0    15]
 [    0     7     1   271    37]
 [   28    31     4     7 22699]]
Logistic regression Test Accuracy ::  0.9872990672752531
Logistic regression F1 Score ::  [0.86428571 0.95605859 0.98145025 0.89586777 0.9934569 ]
[[  755     0     0     0    10]
 [    1   735     0     5     5]
 [    2     0   595     0     2]
 [    0    10     0   302     4]
 [    6     1     2     1 22759]]
Decision tree Test Accuracy ::  0.9980551696765231
Decision tree F1 Score ::  [0.98757358 0.98525469 0.99498328 0.96794872 0.99931941]


In [7]:
# We can see that reducing the number of features shortened training (markedly for logistic regression and the decision tree, only slightly for the random forest), at the expense of losing some accuracy.

# Test data information: row count, column count and column names
n_test_observations, n_test_columns = flow_data_test.shape
print ("Number of observations ::", n_test_observations)
print ("Number of columns ::", n_test_columns)
print ("Headers ::", flow_data_test.columns.values)

Number of observations :: 22544
Number of columns :: 43
Headers :: ['duration' 'protocol' 'service' 'flag' 'src-bytes' 'dst-bytes' 'land'
 'wrong-fragment' 'urgent' 'hot' 'num-failed-logins' 'logged-in'
 'num-compromised' 'root-shell' 'su-attempted' 'num-root'
 'num-file-creations' 'num-shells' 'num-access-files' 'num-outbound-cmds'
 'is-host-login' 'is-guest-login' 'count' 'srv-count' 'serror-rate'
 'srv-serror-rate' 'rerror-rate' 'srv-rerror-rate' 'same-srv-rate'
 'diff-srv-rate' 'srv-diff-host-rate' 'dst-host-count'
 'dst-host-srv-count' 'dst-host-same-srv-rate' 'dst-host-diff-srv-rate'
 'dst-host-same-src-port-rate' 'dst-host-srv-diff-host-rate'
 'dst-host-serror-rate' 'dst-host-srv-serror-rate' 'dst-host-rerror-rate'
 'dst-host-srv-rerror-rate' 'label' 'difficulty']


In [8]:
# In the testing dataset, some attack types previously unseen appear.

# With feature selection, training on training dataset and validation using only the test dataset

# The last entry ("label") is the target; everything before it is a feature.
selected_features = ['dst-host-same-src-port-rate','dst-host-count','dst-host-rerror-rate', 'rerror-rate', 'dst-host-srv-diff-host-rate', 'service', 'dst-host-srv-count', 'srv-diff-host-rate', 'dst-host-same-srv-rate', 'count', 'flag', 'dst-host-diff-srv-rate', 'protocol', 'label']
flow_data = flow_data[selected_features]
flow_data_test = flow_data_test[selected_features]

# Label groups. All replacements below are applied to the "label" column only:
# a DataFrame-wide replace() would also rewrite any value in the string-valued
# feature columns ("service", "flag", "protocol") that equals one of these names.
probe_labels = ['satan', 'portsweep', 'ipsweep', 'nmap']
test_only_probe_labels = ['saint', 'mscan']
non_probe_labels = ['rootkit', 'ftp_write', 'buffer_overflow', 'loadmodule', 'spy', 'land', 'warezclient', 'phf', 'normal', 'pod', 'back', 'neptune', 'perl', 'imap', 'warezmaster', 'multihop', 'teardrop', 'smurf', 'guess_passwd']
test_only_non_probe_labels = ['sendmail', 'snmpgetattack', 'named', 'ps', 'apache2', 'snmpguess', 'processtable', 'worm', 'xsnoop', 'udpstorm', 'sqlattack', 'httptunnel', 'xlock', 'mailbomb', 'xterm']

# Create one unifying label "probe" in the training data
flow_data = flow_data.assign(label=flow_data["label"].replace(probe_labels, 'probe'))

# Test data: classify non-Probe and test-exclusive traffic as "other", then
# create one unifying label "probe" (including the test-only probe attacks)
flow_data_test = flow_data_test.assign(
    label=flow_data_test["label"]
    .replace(non_probe_labels + test_only_non_probe_labels, 'other')
    .replace(probe_labels + test_only_probe_labels, 'probe'))

# Sanity check: both sets should contain only the two unified labels
print(set(flow_data["label"]))
print(set(flow_data_test["label"]))

{'other', 'probe'}
{'other', 'probe'}


In [9]:
# Train on the whole training frame and evaluate on the separately loaded
# test frame. The last entry of selected_features is the target column.
train_x = flow_data[selected_features[:-1]]
train_y = flow_data[selected_features[-1]]
test_x = flow_data_test[selected_features[:-1]]
test_y = flow_data_test[selected_features[-1]]

# Fixed seed so the tree-based models give the same results on every run.
RANDOM_STATE = 42

# One-hot encoding of categorical data, use StandardScaler for other data.
# Defined here so the cell does not depend on a transformer left over from an
# earlier cell. handle_unknown='ignore' matters because the evaluation data
# comes from a different file: a category the encoder was not fitted on is
# encoded as all zeros instead of raising an error at predict time.
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(handle_unknown='ignore'),
                                        ["protocol", "service", "flag"])],
                                      remainder=StandardScaler())

# The three pipelines share one preprocessing object; each fit() refits it on
# the same training data.
rfc = make_pipeline(columnTransformer,
                    RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
# NOTE(review): max_iter=20 is low for lbfgs and warnings are suppressed in the
# import cell, so a non-convergence warning would not be visible - confirm.
mul_lr = make_pipeline(columnTransformer,
                       linear_model.LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=20))
dt = make_pipeline(columnTransformer,
                   tree.DecisionTreeClassifier(random_state=RANDOM_STATE))

# Train each model and report the CPU time (time.process_time) spent in fit()
for short_name, model in (('RFC', rfc), ('LR', mul_lr), ('DT', dt)):
    start = time.process_time()
    model.fit(train_x, train_y)
    print('Time to train ' + short_name + ': ' + str(time.process_time() - start) + 's')

# Confusion-matrix order: row/column 0 is "probe", row/column 1 is "other"
trafficLabels = ['probe', 'other']

# Predictions are computed once per model and reused for all three metrics.
for display_name, model in (('Random forest', rfc),
                            ('Logistic regression', mul_lr),
                            ('Decision tree', dt)):
    predicted_y = model.predict(test_x)
    print(metrics.confusion_matrix(test_y, predicted_y, labels=trafficLabels))
    print(display_name + " Test Accuracy :: ", metrics.accuracy_score(test_y, predicted_y))
    print(display_name + " F1 Score :: ", metrics.f1_score(test_y, predicted_y, average=None, labels=trafficLabels))

Time to train RFC: 20.015625s
Time to train LR: 0.75s
Time to train DT: 1.25s
[[ 1306  1115]
 [  230 19893]]
Random forest Test Accuracy ::  0.9403388928317956
Random forest F1 Score ::  [0.66009603 0.9672996 ]
[[ 1341  1080]
 [  614 19509]]
Logistic regression Test Accuracy ::  0.9248580553584103
Logistic regression F1 Score ::  [0.61288848 0.95839065]
[[ 1339  1082]
 [  543 19580]]
Decision tree Test Accuracy ::  0.9279187366926899
Decision tree F1 Score ::  [0.6223565  0.96015692]


In [15]:
# We can see that the proportion of undetected attacks rose significantly when using the testing dataset. 
# We will now remove previously unseen types of attacks, in order to test that these are the records that cause a lower accuracy.

# With feature selection, training on training dataset and validation using only the test dataset

# Reload the test file so the rows and labels changed by earlier cells are restored
flow_data_test = pd.read_csv(DATASET_PATH_TEST, names=flow_data_headers)
# The last entry ("label") is the target; everything before it is a feature.
selected_features = ['dst-host-same-src-port-rate','dst-host-count','dst-host-rerror-rate', 'rerror-rate', 'dst-host-srv-diff-host-rate', 'service', 'dst-host-srv-count', 'srv-diff-host-rate', 'dst-host-same-srv-rate', 'count', 'flag', 'dst-host-diff-srv-rate', 'protocol', 'label']
flow_data = flow_data[selected_features]
flow_data_test = flow_data_test[selected_features]

# Label groups. All replacements below are applied to the "label" column only:
# a DataFrame-wide replace() would also rewrite any value in the string-valued
# feature columns ("service", "flag", "protocol") that equals one of these names.
probe_labels = ['satan', 'portsweep', 'ipsweep', 'nmap']
unseen_probe_labels = ['saint', 'mscan']
non_probe_labels = ['rootkit', 'ftp_write', 'buffer_overflow', 'loadmodule', 'spy', 'land', 'warezclient', 'phf', 'normal', 'pod', 'back', 'neptune', 'perl', 'imap', 'warezmaster', 'multihop', 'teardrop', 'smurf', 'guess_passwd']
test_only_non_probe_labels = ['sendmail', 'snmpgetattack', 'named', 'ps', 'apache2', 'snmpguess', 'processtable', 'worm', 'xsnoop', 'udpstorm', 'sqlattack', 'httptunnel', 'xlock', 'mailbomb', 'xterm']

# Remove previously unseen attacks from the testing dataset. A boolean mask
# yields a new frame instead of mutating the existing one in place.
flow_data_test = flow_data_test[~flow_data_test["label"].isin(unseen_probe_labels)]

# Create one unifying label "probe" in the training data
flow_data = flow_data.assign(label=flow_data["label"].replace(probe_labels, 'probe'))

# Test data: classify non-Probe and test-exclusive traffic as "other", then
# create one unifying label "probe"
flow_data_test = flow_data_test.assign(
    label=flow_data_test["label"]
    .replace(non_probe_labels + test_only_non_probe_labels, 'other')
    .replace(probe_labels, 'probe'))

# Sanity check: both sets should contain only the two unified labels
print(set(flow_data["label"]))
print(set(flow_data_test["label"]))

{'other', 'probe'}
{'other', 'probe'}


In [17]:
# Train on the whole training frame and evaluate on the test frame from which
# the previously unseen probe attacks were removed. The last entry of
# selected_features is the target column.
train_x = flow_data[selected_features[:-1]]
train_y = flow_data[selected_features[-1]]
test_x = flow_data_test[selected_features[:-1]]
test_y = flow_data_test[selected_features[-1]]

# Fixed seed so the tree-based models give the same results on every run.
RANDOM_STATE = 42

# One-hot encoding of categorical data, use StandardScaler for other data.
# Defined here so the cell does not depend on a transformer left over from an
# earlier cell. handle_unknown='ignore' matters because the evaluation data
# comes from a different file: a category the encoder was not fitted on is
# encoded as all zeros instead of raising an error at predict time.
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(handle_unknown='ignore'),
                                        ["protocol", "service", "flag"])],
                                      remainder=StandardScaler())

# The three pipelines share one preprocessing object; each fit() refits it on
# the same training data.
rfc = make_pipeline(columnTransformer,
                    RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
# NOTE(review): max_iter=20 is low for lbfgs and warnings are suppressed in the
# import cell, so a non-convergence warning would not be visible - confirm.
mul_lr = make_pipeline(columnTransformer,
                       linear_model.LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=20))
dt = make_pipeline(columnTransformer,
                   tree.DecisionTreeClassifier(random_state=RANDOM_STATE))

# Train each model and report the CPU time (time.process_time) spent in fit()
for short_name, model in (('RFC', rfc), ('LR', mul_lr), ('DT', dt)):
    start = time.process_time()
    model.fit(train_x, train_y)
    print('Time to train ' + short_name + ': ' + str(time.process_time() - start) + 's')

# Confusion-matrix order: row/column 0 is "probe", row/column 1 is "other"
trafficLabels = ['probe', 'other']

# Predictions are computed once per model and reused for all three metrics.
for display_name, model in (('Random forest', rfc),
                            ('Logistic regression', mul_lr),
                            ('Decision tree', dt)):
    predicted_y = model.predict(test_x)
    print(metrics.confusion_matrix(test_y, predicted_y, labels=trafficLabels))
    print(display_name + " Test Accuracy :: ", metrics.accuracy_score(test_y, predicted_y))
    print(display_name + " F1 Score :: ", metrics.f1_score(test_y, predicted_y, average=None, labels=trafficLabels))

Time to train RFC: 17.328125s
Time to train LR: 0.59375s
Time to train DT: 1.109375s
[[  982   124]
 [  223 19900]]
Random forest Test Accuracy ::  0.9836544349710302
Random forest F1 Score ::  [0.84984855 0.99135676]
[[ 1011    95]
 [  614 19509]]
Logistic regression Test Accuracy ::  0.9666022893212115
Logistic regression F1 Score ::  [0.74038814 0.9821532 ]
[[  987   119]
 [  616 19507]]
Decision tree Test Accuracy ::  0.9653775495784069
Decision tree F1 Score ::  [0.72868217 0.98150897]


In [26]:
# We can see that including only previously seen attacks vastly decreased the proportion of false negatives (top right of each confusion matrix).
# We will investigate further on how to improve accuracy with unseen attacks.

# Finally, we trained our models on the training dataset and classified the new record output by our SDN testbed.
# File holding the record captured from the SDN testbed
CUSTOM_DATASET_PATH = "C:\\Users\\Marek\\PycharmProjects\\DP\\venv\\CustomTest.txt"

# Columns used for this experiment; the last entry ("label") is the target.
selected_features = ['protocol','service','dst-host-count','dst-host-srv-count','dst-host-same-srv-rate','dst-host-same-src-port-rate','label']

# Re-read the training file (earlier cells narrowed and relabelled flow_data)
# and keep only the selected columns.
flow_data = pd.read_csv(DATASET_PATH, names=flow_data_headers)[selected_features]

# Load the record retrieved from the SDN testbed; the file is read with the
# selected column names applied in order.
flow_data_test = pd.read_csv(CUSTOM_DATASET_PATH, names=selected_features)
print(flow_data_test.head(n=5))

  protocol service  dst-host-count  dst-host-srv-count  \
0      tcp    http               9                   9   

   dst-host-same-srv-rate  dst-host-same-src-port-rate  label  
0                     1.0                         0.11  other  


In [28]:
# Keep only Probe attacks labeled - label other entries as "other".
# The replacement is applied to the "label" column only: a DataFrame-wide
# replace() would also rewrite any value in the string-valued feature columns
# ("protocol", "service") that equals one of these names.
non_probe_labels = ['rootkit', 'ftp_write', 'buffer_overflow', 'loadmodule', 'spy', 'land', 'warezclient', 'phf', 'normal', 'pod', 'back', 'neptune', 'perl', 'imap', 'warezmaster', 'multihop', 'teardrop', 'smurf', 'guess_passwd']
flow_data = flow_data.assign(label=flow_data["label"].replace(non_probe_labels, 'other'))

# Train on the whole training frame; the custom record is what gets classified.
# The last entry of selected_features is the target column.
train_x = flow_data[selected_features[:-1]]
train_y = flow_data[selected_features[-1]]
test_x = flow_data_test[selected_features[:-1]]
test_y = flow_data_test[selected_features[-1]]

# Fixed seed so the tree-based models give the same results on every run.
RANDOM_STATE = 42

# One-hot encoding of categorical data, use StandardScaler for other data.
# handle_unknown='ignore' encodes a protocol/service value that does not occur
# in the training data as all zeros instead of raising an error at predict
# time.
columnTransformer = ColumnTransformer([('encoder', OneHotEncoder(handle_unknown='ignore'),
                                        ["protocol", "service"])],
                                      remainder=StandardScaler())

# The three pipelines share one preprocessing object; each fit() refits it on
# the same training data.
rfc = make_pipeline(columnTransformer,
                    RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE))
# NOTE(review): max_iter=20 is low for lbfgs and warnings are suppressed in the
# import cell, so a non-convergence warning would not be visible - confirm.
mul_lr = make_pipeline(columnTransformer,
                       linear_model.LogisticRegression(multi_class='ovr', solver='lbfgs', max_iter=20))
dt = make_pipeline(columnTransformer,
                   tree.DecisionTreeClassifier(random_state=RANDOM_STATE))

# Train each model and report the CPU time (time.process_time) spent in fit()
for short_name, model in (('RFC', rfc), ('LR', mul_lr), ('DT', dt)):
    start = time.process_time()
    model.fit(train_x, train_y)
    print('Time to train ' + short_name + ': ' + str(time.process_time() - start) + 's')

# Kept for parity with the earlier cells; not used by the predictions below.
trafficLabels = ['satan', 'ipsweep', 'portsweep', 'nmap', 'other']

# Print the predicted label(s) as plain text. Concatenating a str with the
# array returned by predict() only works for object-dtype arrays and prints a
# list repr, so the predictions are joined into a string explicitly.
for display_name, model in (('Random Forest', rfc),
                            ('Logistic regression', mul_lr),
                            ('Decision tree', dt)):
    print('Prediction by ' + display_name + ': ' + ', '.join(map(str, model.predict(test_x))))

Time to train RFC: 24.140625s
Time to train LR: 1.265625s
Time to train DT: 0.78125s
['Prediction by Random Forest: other']
['Prediction by Logistic regression: other']
['Prediction by Decision tree: other']


In [None]:
# All classifiers correctly classified the record as non-malicious.