In [56]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier

# Load the tagged flow log; fields are separated by runs of whitespace.
tcp_complete = pd.read_csv(
    "../data/captura_prueba_MULTIPLESVPN_20180205_12h/tagger_results/tcp_complete_tagged_ok",
    sep=r"\s+",
    low_memory=False,
)

print("-------------> Original Dataset -- Features: " + str(tcp_complete.shape[1]) + " Traffic Flows: " + str(tcp_complete.shape[0]))
print(tcp_complete.head())

# Flag every flow whose 's_ip:15' value is one of the listed VPN addresses.
# This has to happen before the column filter below, which removes 's_ip:15'.
vpn_ip = ['138.100.156.254', '138.100.156.244', '138.100.156.253']
tcp_complete['is_vpn'] = tcp_complete['s_ip:15'].isin(vpn_ip)

# Columns to discard, given as 1-based positions.
# NOTE(review): these look like the ':N' suffixes of the column names -- confirm.
bad_features = [
    1, 2, 12, 15, 16, 18, 29, 30, 38, 39, 40, 41, 43, 50, 51,
    57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
    74, 75, 80, 81, 82, 84, 86, 87, 88, 89, 90, 91, 92, 97, 98,
    102, 103, 104, 105,
    107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
    120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131,
]
# Shift to the 0-based positions that positional column lookup expects.
bad_features = [position - 1 for position in bad_features]
tcp_complete = tcp_complete.drop(tcp_complete.columns[bad_features], axis="columns")


print("-------------> Filtered Dataset -- Features: " + str(tcp_complete.shape[1]) + " Traffic Flows: " + str(tcp_complete.shape[0]))
print(tcp_complete.head())

# Summary statistics of the columns that survived the filter.
print(tcp_complete.describe())


-------------> Original Dataset -- Features: 132 Traffic Flows: 147944
        #c_ip:1  c_port:2  c_pkts_all:3  c_rst_cnt:4  c_ack_cnt:5  \
0  172.16.1.172     41342            12            0           11   
1  172.16.1.175     49845            42            0           41   
2  172.16.1.175     52472             7            0            6   
3  172.16.1.175     48903            16            0           15   
4  172.16.1.175     40561            30            0           29   

   c_ack_cnt_p:6  c_bytes_uniq:7  c_pkts_data:8  c_bytes_all:9  \
0              6            2409              4           2409   
1             38             637              2            637   
2              3             299              1            299   
3             13             335              1            335   
4             25             790              3            790   

   c_pkts_retx:10   ...     c_appdataT:123  s_appdataT:124  c_appdataB:125  \
0               0   ...            463.

In [57]:
# Create training and test data.
# A dedicated, seeded generator makes the ~75/25 split reproducible: with the
# unseeded global generator every kernel run produced a different partition,
# so none of the metrics computed further down could be reproduced.
SPLIT_SEED = 100
split_rng = np.random.RandomState(SPLIT_SEED)

# One uniform draw per flow; a flow goes to the training set when its draw is <= 0.75.
tcp_complete['is_train'] = split_rng.uniform(0, 1, len(tcp_complete)) <= .75
train = tcp_complete[tcp_complete['is_train']]
test = tcp_complete[~tcp_complete['is_train']]

# The two subsets must partition the dataset exactly.
assert len(train) + len(test) == len(tcp_complete)

# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 110970
Number of observations in the test data: 36974


In [58]:
# Create a list of the feature column's names.
# The label and the two bookkeeping flags are excluded by name rather than by
# position: slicing off "the last three columns" silently picks the wrong
# columns when the split cell has not run yet or the column order changes.
non_feature_columns = ['tag:132', 'is_vpn', 'is_train']
features = tcp_complete.columns.drop(non_feature_columns, errors='ignore')

# View features
print(features)
len(features)

Index(['c_cwin_ini:78'], dtype='object')


1

In [59]:
import subprocess
import warnings
# StringIO used to come from sklearn.externals.six, which current scikit-learn
# no longer ships; the standard-library class keeps the name available.
from io import StringIO

from IPython.display import Image
import pydotplus
from sklearn.tree import DecisionTreeClassifier, export_graphviz

# Fit a shallow decision tree on the training rows, using 'tag:132' as the label.
clf = DecisionTreeClassifier(criterion="gini", random_state=100,
                             max_depth=3, min_samples_leaf=5)
clf.fit(train[features], train['tag:132'])

# Write the tree in Graphviz format. The file is closed before `dot` is
# started so that `dot` never reads a partially written file.
with open("dt.dot", 'w') as f:
    export_graphviz(clf, out_file=f,
                    feature_names=list(features))

# Render the .dot file to PNG with the Graphviz command-line tool.
command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
try:
    subprocess.check_call(command)
except (OSError, subprocess.CalledProcessError) as error:
    # OSError: the `dot` executable could not be started (e.g. not installed);
    # CalledProcessError: `dot` ran but returned a non-zero exit status.
    # Only the picture is lost, so warn instead of calling exit(), which in a
    # notebook asks the kernel to shut down.
    warnings.warn("Could not run dot, ie graphviz, to "
                  "produce visualization: " + str(error))



In [60]:
# Predict a tag for every held-out test flow (rows the classifier was not fitted on).
preds = clf.predict(test.loc[:, features])
print(preds)

[1 1 1 ... 1 1 1]


In [61]:
# Per-class predicted probabilities for the first 10 test flows.
clf.predict_proba(test.loc[:, features])[:10]

array([[9.89800899e-01, 6.67951188e-04, 9.53114965e-03],
       [9.89800899e-01, 6.67951188e-04, 9.53114965e-03],
       [9.89800899e-01, 6.67951188e-04, 9.53114965e-03],
       [9.89800899e-01, 6.67951188e-04, 9.53114965e-03],
       [9.89800899e-01, 6.67951188e-04, 9.53114965e-03],
       [9.89800899e-01, 6.67951188e-04, 9.53114965e-03],
       [9.89800899e-01, 6.67951188e-04, 9.53114965e-03],
       [9.89800899e-01, 6.67951188e-04, 9.53114965e-03],
       [9.93902439e-01, 2.40693196e-03, 3.69062901e-03],
       [3.61298780e-01, 6.32852677e-01, 5.84854291e-03]])

In [62]:
# Confusion matrix: true tags on the rows, predicted tags on the columns.
pd.crosstab(
    index=test['tag:132'],
    columns=preds,
    rownames=['Actual Tags'],
    colnames=['Predicted tags'],
)

Predicted tags,1,2
Actual Tags,Unnamed: 1_level_1,Unnamed: 2_level_1
1,31687,1153
2,314,2964
3,848,8


In [63]:
from sklearn.metrics import accuracy_score

# Fraction of test flows whose predicted tag equals the 'tag:132' label.
accuracy = accuracy_score(y_true=test['tag:132'], y_pred=preds)

print(f'Mean accuracy score: {accuracy:.9}')

Mean accuracy score: 0.937172067


In [64]:
# Pair each feature column with its importance score, most important first.
sorted(
    zip(train[features].columns, clf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

[('c_cwin_ini:78', 1.0)]

In [65]:
# Compare accuracy on the VPN flows of the test set with accuracy on the rest.
from sklearn.metrics import accuracy_score


def _subset_accuracy(y_true, y_pred):
    """Return the accuracy of y_pred against y_true, or NaN for an empty subset."""
    if len(y_true) == 0:
        return float('nan')
    return accuracy_score(y_true, y_pred)


# Plain boolean array, computed once. `preds` is a NumPy array aligned with the
# rows of `test` by position, so it is masked positionally rather than by index.
vpn_mask = np.asarray(test['is_vpn'], dtype=bool)
test_vpn, test_novpn = test[vpn_mask], test[~vpn_mask]

accuracy_vpn = _subset_accuracy(test_vpn['tag:132'], preds[vpn_mask])
accuracy_novpn = _subset_accuracy(test_novpn['tag:132'], preds[~vpn_mask])

# Report the subset sizes too: a score computed over a handful of flows reads
# very differently from one computed over thousands.
print('VPN flows in test:', len(test_vpn), '| non-VPN flows in test:', len(test_novpn))
print(f'Mean accuracy VPN score: {accuracy_vpn:.10}')
print(f'Mean accuracy no VPN score: {accuracy_novpn:.10}')

Mean accuracy VPN score: 0.0
Mean accuracy no VPN score: 0.9372227632
