In [1]:
from __future__ import print_function

import os
import subprocess

import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [2]:
def get_data():
    if os.path.exists("slow_loris_3.csv"):
        print("-- slow_loris.csv found locally")
    df = pd.read_csv("slow_loris_3.csv", index_col=0)
    return df

In [3]:
data = get_data()

-- slow_loris.csv found locally


In [4]:
print("* df.head()", data.head(), sep="\n", end="\n\n")
print("* df.tail()", data.tail(), sep="\n", end="\n\n")

* df.head()
            Flow Duration Flow Bytes/s  Flow Packets/s   Flow IAT Mean  \
 Protocol                                                                
6                 5385455  178.4436041     1.485482657    7.693507e+05   
6                     102  117647.0588     19607.84314    1.020000e+02   
17                    148  1162162.162     27027.02703    4.933333e+01   
6                 5231148  2.293951538     0.764650513    1.743716e+06   
6               115441263  90.31432721     0.398471039    2.565361e+06   

            Flow IAT Std   Flow IAT Max   Flow IAT Min  Fwd IAT Total  \
 Protocol                                                               
6           1.848212e+06        4959769             50         425686   
6           0.000000e+00            102            102            102   
17          4.954123e+01            100              1              1   
6           2.946470e+06        5145650             91        5231148   
6           4.328674e+06       

In [5]:
print("* label types:", data[" Label"].unique(), sep="\n")
print(data.keys())

* label types:
['BENIGN' 'DoS slowloris']
Index([u' Flow Duration', u'Flow Bytes/s', u' Flow Packets/s',
       u' Flow IAT Mean', u' Flow IAT Std', u' Flow IAT Max', u' Flow IAT Min',
       u'Fwd IAT Total', u' Fwd IAT Mean', u' Fwd IAT Std', u' Fwd IAT Max',
       u' Fwd IAT Min', u'Bwd IAT Total', u' Bwd IAT Mean', u' Bwd IAT Std',
       u' Bwd IAT Max', u' Bwd IAT Min', u'Fwd Packets/s', u' Bwd Packets/s',
       u'Init_Win_bytes_forward', u' Init_Win_bytes_backward',
       u' act_data_pkt_fwd', u' min_seg_size_forward', u' Label'],
      dtype='object')


In [6]:
def encode_target(df, target_column):
    
    df_mod = df.copy()
    targets = df_mod[target_column].unique()
    map_to_int = {name: n for n, name in enumerate(targets)}
    df_mod["Target"] = df_mod[target_column].replace(map_to_int)

    return (df_mod, targets)

In [7]:
df2, targets = encode_target(data, " Label")

print("* df2.head()", df2[["Target", " Label"]].head(), sep="\n", end="\n\n")
print("* df2.tail()", df2[["Target", " Label"]].tail(), sep="\n", end="\n\n")


print("* targets", targets, sep="\n", end="\n\n")


* df2.head()
           Target   Label
 Protocol                
6               0  BENIGN
6               0  BENIGN
17              0  BENIGN
6               0  BENIGN
6               0  BENIGN

* df2.tail()
           Target          Label
 Protocol                       
6               1  DoS slowloris
6               1  DoS slowloris
6               1  DoS slowloris
6               1  DoS slowloris
6               1  DoS slowloris

* targets
['BENIGN' 'DoS slowloris']



In [8]:
features = list(df2.columns[:-2])
print("* features:", features, sep="\n")

* features:
[' Flow Duration', 'Flow Bytes/s', ' Flow Packets/s', ' Flow IAT Mean', ' Flow IAT Std', ' Flow IAT Max', ' Flow IAT Min', 'Fwd IAT Total', ' Fwd IAT Mean', ' Fwd IAT Std', ' Fwd IAT Max', ' Fwd IAT Min', 'Bwd IAT Total', ' Bwd IAT Mean', ' Bwd IAT Std', ' Bwd IAT Max', ' Bwd IAT Min', 'Fwd Packets/s', ' Bwd Packets/s', 'Init_Win_bytes_forward', ' Init_Win_bytes_backward', ' act_data_pkt_fwd', ' min_seg_size_forward']


In [9]:
y = df2["Target"]
X = df2[features]


In [10]:
X

Unnamed: 0_level_0,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,...,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd Packets/s,Bwd Packets/s,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward
Protocol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,5385455,178.4436041,1.485482657,7.693507e+05,1.848212e+06,4959769,50,425686,1.418953e+05,7.637429e+04,...,1.766952e+06,2.800394e+06,5000116,103549,0.742741,0.742741,8192,60,3,20
6,102,117647.0588,19607.84314,1.020000e+02,0.000000e+00,102,102,102,1.020000e+02,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,19607.843140,0.000000,254,-1,1,20
17,148,1162162.162,27027.02703,4.933333e+01,4.954123e+01,100,1,1,1.000000e+00,0.000000e+00,...,4.700000e+01,0.000000e+00,47,47,13513.513510,13513.513510,-1,-1,1,32
6,5231148,2.293951538,0.764650513,1.743716e+06,2.946470e+06,5145650,91,5231148,2.615574e+06,3.578068e+06,...,0.000000e+00,0.000000e+00,0,0,0.573488,0.191163,8192,29200,2,20
6,115441263,90.31432721,0.398471039,2.565361e+06,4.328674e+06,10000000,9,115000000,4.808513e+06,4.991519e+06,...,5.770222e+06,4.938970e+06,10100000,48,0.216560,0.181911,29200,377,5,32
6,4,7750000,500000,4.000000e+00,0.000000e+00,4,4,4,4.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,500000.000000,0.000000,406,-1,0,32
6,2481084,845.5981337,5.642694887,1.908526e+05,5.484545e+05,1963917,27,517167,8.619450e+04,1.908697e+05,...,4.134557e+05,8.000146e+05,2000232,491,2.821347,2.821347,29200,8,3,32
6,4,7750000,500000,4.000000e+00,0.000000e+00,4,4,4,4.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,500000.000000,0.000000,245,-1,0,32
6,5241434,2.289449796,0.763149932,1.747145e+06,3.009075e+06,5221707,121,5241434,2.620717e+06,3.678355e+06,...,0.000000e+00,0.000000e+00,0,0,0.572362,0.190787,29200,29200,2,20
17,954,212788.26,2096.436059,9.540000e+02,0.000000e+00,954,954,0,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,1048.218029,1048.218029,-1,-1,0,20


In [11]:
X.dtypes
col = X.columns[X.dtypes.eq(object)]
X[col] = X[col].apply(pd.to_numeric, errors='coerce', axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


In [12]:
X.dtypes

 Flow Duration                int64
Flow Bytes/s                float64
 Flow Packets/s             float64
 Flow IAT Mean              float64
 Flow IAT Std               float64
 Flow IAT Max                 int64
 Flow IAT Min                 int64
Fwd IAT Total                 int64
 Fwd IAT Mean               float64
 Fwd IAT Std                float64
 Fwd IAT Max                  int64
 Fwd IAT Min                  int64
Bwd IAT Total                 int64
 Bwd IAT Mean               float64
 Bwd IAT Std                float64
 Bwd IAT Max                  int64
 Bwd IAT Min                  int64
Fwd Packets/s               float64
 Bwd Packets/s              float64
Init_Win_bytes_forward        int64
 Init_Win_bytes_backward      int64
 act_data_pkt_fwd             int64
 min_seg_size_forward         int64
dtype: object

In [13]:
X

Unnamed: 0_level_0,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,...,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd Packets/s,Bwd Packets/s,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward
Protocol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,5385455,1.784436e+02,1.485483,7.693507e+05,1.848212e+06,4959769,50,425686,1.418953e+05,7.637429e+04,...,1.766952e+06,2.800394e+06,5000116,103549,0.742741,0.742741,8192,60,3,20
6,102,1.176471e+05,19607.843140,1.020000e+02,0.000000e+00,102,102,102,1.020000e+02,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,19607.843140,0.000000,254,-1,1,20
17,148,1.162162e+06,27027.027030,4.933333e+01,4.954123e+01,100,1,1,1.000000e+00,0.000000e+00,...,4.700000e+01,0.000000e+00,47,47,13513.513510,13513.513510,-1,-1,1,32
6,5231148,2.293952e+00,0.764651,1.743716e+06,2.946470e+06,5145650,91,5231148,2.615574e+06,3.578068e+06,...,0.000000e+00,0.000000e+00,0,0,0.573488,0.191163,8192,29200,2,20
6,115441263,9.031433e+01,0.398471,2.565361e+06,4.328674e+06,10000000,9,115000000,4.808513e+06,4.991519e+06,...,5.770222e+06,4.938970e+06,10100000,48,0.216560,0.181911,29200,377,5,32
6,4,7.750000e+06,500000.000000,4.000000e+00,0.000000e+00,4,4,4,4.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,500000.000000,0.000000,406,-1,0,32
6,2481084,8.455981e+02,5.642695,1.908526e+05,5.484545e+05,1963917,27,517167,8.619450e+04,1.908697e+05,...,4.134557e+05,8.000146e+05,2000232,491,2.821347,2.821347,29200,8,3,32
6,4,7.750000e+06,500000.000000,4.000000e+00,0.000000e+00,4,4,4,4.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,500000.000000,0.000000,245,-1,0,32
6,5241434,2.289450e+00,0.763150,1.747145e+06,3.009075e+06,5221707,121,5241434,2.620717e+06,3.678355e+06,...,0.000000e+00,0.000000e+00,0,0,0.572362,0.190787,29200,29200,2,20
17,954,2.127883e+05,2096.436059,9.540000e+02,0.000000e+00,954,954,0,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,1048.218029,1048.218029,-1,-1,0,20


In [14]:
#X = X.as_matrix().astype(np.float)
#y = y.as_matrix().astype(np.float)


In [15]:
X

Unnamed: 0_level_0,Flow Duration,Flow Bytes/s,Flow Packets/s,Flow IAT Mean,Flow IAT Std,Flow IAT Max,Flow IAT Min,Fwd IAT Total,Fwd IAT Mean,Fwd IAT Std,...,Bwd IAT Mean,Bwd IAT Std,Bwd IAT Max,Bwd IAT Min,Fwd Packets/s,Bwd Packets/s,Init_Win_bytes_forward,Init_Win_bytes_backward,act_data_pkt_fwd,min_seg_size_forward
Protocol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,5385455,1.784436e+02,1.485483,7.693507e+05,1.848212e+06,4959769,50,425686,1.418953e+05,7.637429e+04,...,1.766952e+06,2.800394e+06,5000116,103549,0.742741,0.742741,8192,60,3,20
6,102,1.176471e+05,19607.843140,1.020000e+02,0.000000e+00,102,102,102,1.020000e+02,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,19607.843140,0.000000,254,-1,1,20
17,148,1.162162e+06,27027.027030,4.933333e+01,4.954123e+01,100,1,1,1.000000e+00,0.000000e+00,...,4.700000e+01,0.000000e+00,47,47,13513.513510,13513.513510,-1,-1,1,32
6,5231148,2.293952e+00,0.764651,1.743716e+06,2.946470e+06,5145650,91,5231148,2.615574e+06,3.578068e+06,...,0.000000e+00,0.000000e+00,0,0,0.573488,0.191163,8192,29200,2,20
6,115441263,9.031433e+01,0.398471,2.565361e+06,4.328674e+06,10000000,9,115000000,4.808513e+06,4.991519e+06,...,5.770222e+06,4.938970e+06,10100000,48,0.216560,0.181911,29200,377,5,32
6,4,7.750000e+06,500000.000000,4.000000e+00,0.000000e+00,4,4,4,4.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,500000.000000,0.000000,406,-1,0,32
6,2481084,8.455981e+02,5.642695,1.908526e+05,5.484545e+05,1963917,27,517167,8.619450e+04,1.908697e+05,...,4.134557e+05,8.000146e+05,2000232,491,2.821347,2.821347,29200,8,3,32
6,4,7.750000e+06,500000.000000,4.000000e+00,0.000000e+00,4,4,4,4.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,500000.000000,0.000000,245,-1,0,32
6,5241434,2.289450e+00,0.763150,1.747145e+06,3.009075e+06,5221707,121,5241434,2.620717e+06,3.678355e+06,...,0.000000e+00,0.000000e+00,0,0,0.572362,0.190787,29200,29200,2,20
17,954,2.127883e+05,2096.436059,9.540000e+02,0.000000e+00,954,954,0,0.000000e+00,0.000000e+00,...,0.000000e+00,0.000000e+00,0,0,1048.218029,1048.218029,-1,-1,0,20


In [16]:
np.where(np.isnan(X))
np.where(np.isnan(y))

(array([], dtype=int64),)

In [17]:
X = np.nan_to_num(X)
y = np.nan_to_num(y)
np.where(np.isnan(X))

(array([], dtype=int64), array([], dtype=int64))

In [18]:
dt = DecisionTreeClassifier(min_samples_split=20, random_state=99)

In [19]:
dt.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=20, min_weight_fraction_leaf=0.0,
            presort=False, random_state=99, splitter='best')

In [20]:
def visualize_tree(tree, feature_names):
    """Create tree png using graphviz.

    Args
    ----
    tree -- scikit-learn DecsisionTree.
    feature_names -- list of feature names.
    """
    with open("dt.dot", 'w') as f:
        export_graphviz(tree, out_file=f,
                        feature_names=feature_names)

    command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
    try:
        subprocess.check_call(command)
    except:
        exit("Could not run dot, ie graphviz, to "
             "produce visualization")


In [21]:
visualize_tree(dt, features)

In [22]:
def get_code(tree, feature_names, target_names,
             spacer_base="    "):
    """Produce psuedo-code for decision tree.

    Args
    ----
    tree -- scikit-leant DescisionTree.
    feature_names -- list of feature names.
    target_names -- list of target (class) names.
    spacer_base -- used for spacing code (default: "    ").

    Notes
    -----
    based on http://stackoverflow.com/a/30104792.
    """
    left      = tree.tree_.children_left
    right     = tree.tree_.children_right
    threshold = tree.tree_.threshold
    features  = [feature_names[i] for i in tree.tree_.feature]
    value = tree.tree_.value

    def recurse(left, right, threshold, features, node, depth):
        spacer = spacer_base * depth
        if (threshold[node] != -2):
            print(spacer + "if ( " + features[node] + " <= " + \
                  str(threshold[node]) + " ) {")
            if left[node] != -1:
                    recurse(left, right, threshold, features,
                            left[node], depth+1)
            print(spacer + "}\n" + spacer +"else {")
            if right[node] != -1:
                    recurse(left, right, threshold, features,
                            right[node], depth+1)
            print(spacer + "}")
        else:
            target = value[node]
            for i, v in zip(np.nonzero(target)[1],
                            target[np.nonzero(target)]):
                target_name = target_names[i]
                target_count = int(v)
                print(spacer + "return " + str(target_name) + \
                      " ( " + str(target_count) + " examples )")

    recurse(left, right, threshold, features, 0, 0)


In [23]:
get_code(dt, features, targets)

if (  Flow Packets/s <= 0.2101586014032364 ) {
    if (  Flow IAT Min <= 254.5 ) {
        if (  Fwd IAT Std <= 16450000.0 ) {
            if (  Flow IAT Max <= 10350000.0 ) {
                return BENIGN ( 7 examples )
            }
            else {
                return DoS slowloris ( 834 examples )
            }
        }
        else {
            if (  Flow Packets/s <= 0.0531567707657814 ) {
                return DoS slowloris ( 1 examples )
            }
            else {
                return BENIGN ( 68 examples )
            }
        }
    }
    else {
        return BENIGN ( 227 examples )
    }
}
else {
    if (  Flow IAT Min <= 984937.5 ) {
        if (  Bwd IAT Mean <= 25000000.0 ) {
            if (  Bwd Packets/s <= 14545.646484375 ) {
                if (  Bwd Packets/s <= 201.56625366210938 ) {
                    if (  Init_Win_bytes_backward <= 27924.0 ) {
                        if ( Init_Win_bytes_forward <= 29080.0 ) {
                            return 

In [24]:
dt_output = dt.predict(X)


In [26]:
df_dtout = pd.DataFrame(y, columns=['True'])
df_dtout['Pred'] = dt_output

In [27]:
df_dtout

Unnamed: 0,True,Pred
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
5,0,0
6,0,0
7,0,0
8,0,0
9,0,0


In [28]:
from sklearn.metrics import accuracy_score

In [30]:
accuracy_score(y, dt_output)*100

99.95951007186962