In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Load dataset as a data frame
tcp_complete = pd.read_csv("../data/captura_prueba_MULTIPLESVPN_20180206_BP_12h/tagger_results/tcp_complete_tagged_ok", delimiter=r"\s+",low_memory=False)
tcp_complete2 = pd.read_csv("../data/captura_prueba_MULTIPLESVPN_20180207_BP_12h/tagger_results/tcp_complete_tagged_ok", delimiter=r"\s+",low_memory=False)

print("-------------> Original Dataset -- Features: " + str(len(tcp_complete.columns))+ " Traffic Flows: " + str(len(tcp_complete.index)))
print(tcp_complete.head())

# Mark vpn flows
vpn_ip = ['138.100.156.254','138.100.156.244','138.100.156.253']
tcp_complete['is_vpn'] = tcp_complete['s_ip:15'].isin(vpn_ip)
tcp_complete2['is_vpn'] = tcp_complete2['s_ip:15'].isin(vpn_ip)

# Filter bad features
bad_features = [1, 2, 12, 15, 16, 18, 29, 30, 38, 39, 40, 41, 43, 50, 51, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 74, 75, 80, 81, 82, 84, 86, 87, 88, 89, 90, 91, 92, 97, 98, 102, 103, 104, 105, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131]
bad_features = [index - 1 for index in bad_features]

tcp_complete = tcp_complete.drop(tcp_complete.columns[bad_features], axis =1)
tcp_complete2 = tcp_complete2.drop(tcp_complete2.columns[bad_features], axis =1)

print("-------------> Filtered Dataset -- Features: " + str(len(tcp_complete.columns))+ " Traffic Flows: " + str(len(tcp_complete.index)))
print(tcp_complete.head())

# Get basic statistics
print(tcp_complete.describe())


-------------> Original Dataset -- Features: 132 Traffic Flows: 629886
      #c_ip:1  c_port:2  c_pkts_all:3  c_rst_cnt:4  c_ack_cnt:5  \
0  1.1.22.130      2464             7            0            6   
1   1.1.11.66      1744             7            0            6   
2    1.1.90.2      6784             7            0            6   
3    1.1.90.2     12544             5            0            4   
4   1.1.2.210      1114             7            0            6   

   c_ack_cnt_p:6  c_bytes_uniq:7  c_pkts_data:8  c_bytes_all:9  \
0              4              18              1             18   
1              4              18              1             18   
2              4              18              1             18   
3              2             156              1            156   
4              4              18              1             18   

   c_pkts_retx:10   ...     c_appdataT:123  s_appdataT:124  c_appdataB:125  \
0               0   ...                0.0         

In [2]:

# Create training and test data
if 'tcp_complete2' not in globals(): # divide the dataset; 75% training 
    tcp_complete['is_train'] = np.random.uniform(0, 1, len(tcp_complete)) <= .75
    train, test = tcp_complete[tcp_complete['is_train']==True], tcp_complete[tcp_complete['is_train']==False]
else: # use different captures for training and testing
    train =  tcp_complete
    test = tcp_complete2
    
# Show the number of observations for the test and training dataframes
print('Number of observations in the training data:', len(train))
print('Number of observations in the test data:',len(test))

Number of observations in the training data: 629886
Number of observations in the test data: 629665


In [3]:
# Create a list of the feature column's names
features = tcp_complete.columns[:-3] # remove the tag from the features used to train

# View features
print(features)
len(features)

Index(['c_pkts_all:3', 'c_rst_cnt:4', 'c_ack_cnt:5', 'c_ack_cnt_p:6',
       'c_bytes_uniq:7', 'c_pkts_data:8', 'c_bytes_all:9', 'c_pkts_retx:10',
       'c_bytes_retx:11', 'c_syn_cnt:13', 'c_fin_cnt:14', 's_pkts_all:17',
       's_ack_cnt:19', 's_ack_cnt_p:20', 's_bytes_uniq:21', 's_pkts_data:22',
       's_bytes_all:23', 's_pkts_retx:24', 's_bytes_retx:25', 's_pkts_ooo:26',
       's_syn_cnt:27', 's_fin_cnt:28', 'durat:31', 'c_first:32', 's_first:33',
       'c_last:34', 's_last:35', 'c_first_ack:36', 's_first_ack:37',
       'con_t:42', 'http_t:44', 'c_rtt_avg:45', 'c_rtt_min:46', 'c_rtt_max:47',
       'c_rtt_std:48', 'c_rtt_cnt:49', 's_rtt_avg:52', 's_rtt_min:53',
       's_rtt_max:54', 's_rtt_std:55', 's_rtt_cnt:56', 'c_mss_max:71',
       'c_mss_min:72', 'c_win_max:73', 'c_cwin_max:76', 'c_cwin_min:77',
       'c_cwin_ini:78', 'c_pkts_rto:79', 'c_pkts_unk:83', 'c_pkts_unrto:85',
       's_mss:93', 's_mss_max:94', 's_mss_min:95', 's_win_max:96',
       's_cwin_max:99', 's_cwin_mi

57

In [4]:
# Train The Random Forest Classifier
clf = RandomForestClassifier(n_jobs=4, n_estimators=100,random_state=0)
# Train the Classifier to take the training features and learn how they relate
# to the training the tags
clf.fit(train[features], train['tag:132'])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=4,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [5]:
# Apply the Classifier to the test data 
preds = clf.predict(test[features])
print(preds)

[0 0 0 ... 3 1 3]


In [7]:
# Create confusion matrix
pd.crosstab(test['tag:132'], preds, rownames=['Actual Tags'], colnames=['Predicted tags'])

Predicted tags,0,1,2,3
Actual Tags,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,485378,0,0,0
1,0,127204,39,2
2,0,139,13334,1
3,0,78,0,3490


In [44]:
from sklearn.metrics import *

print("\t Mean accuracy score: %1.6f" % accuracy_score(test['tag:132'], preds))
print("\t Mean Precision score: %1.6f" % precision_score(test['tag:132'], preds,average='micro'))
print("\t Mean Recall score: %1.3f" % recall_score(test['tag:132'], preds,average='micro'))
print("\t Mean F1 score: %1.3f\n" % f1_score(test['tag:132'], preds,average='micro'))

	 Mean accuracy score: 0.999589
	 Mean Precision score: 0.999589
	 Mean Recall score: 1.000
	 Mean F1 score: 1.000



In [22]:
# View a list of the features and their importance scores
sorted(list(zip(train[features], clf.feature_importances_)), key=lambda x: x[1], reverse=True)

[('s_mss:93', 0.1350088622911473),
 ('c_first_ack:36', 0.11909735635912892),
 ('s_first_ack:37', 0.1069175161429251),
 ('c_rtt_avg:45', 0.06758305832585204),
 ('durat:31', 0.06705810244358582),
 ('c_rtt_min:46', 0.06139909916285534),
 ('c_rtt_max:47', 0.06119859385942124),
 ('c_cwin_ini:78', 0.04312226244819201),
 ('s_first:33', 0.03612061855421266),
 ('c_first:32', 0.03578948738638146),
 ('s_last:35', 0.032235362445835886),
 ('c_last:34', 0.023421602980978534),
 ('c_win_max:73', 0.022109484007108337),
 ('c_cwin_max:76', 0.022035518493914562),
 ('s_rtt_avg:52', 0.020382315318621776),
 ('s_win_max:96', 0.020211844614923223),
 ('s_rtt_min:53', 0.018780927651492526),
 ('c_rtt_std:48', 0.018430519655632117),
 ('c_mss_max:71', 0.008829967843158865),
 ('s_mss_max:94', 0.007625501589693427),
 ('s_mss_min:95', 0.006946708632976874),
 ('s_cwin_min:100', 0.005722156134398368),
 ('con_t:42', 0.00568562906248507),
 ('c_pkts_retx:10', 0.005029775389765031),
 ('c_bytes_uniq:7', 0.004688849897739658)

In [43]:
# Compute the evaluation metrics for VPN and NO-VPN 
import warnings


test_vpn, test_novpn = tcp_complete2[tcp_complete2['is_vpn']==True], tcp_complete2[tcp_complete2['is_vpn']==False]


print("\t Evaluation Metrics  - VPN - Flows: " + str(len(test_vpn['tag:132'])) )
print("\t Mean Accuracy score: %1.6f" % accuracy_score(test_vpn['tag:132'], preds[tcp_complete2['is_vpn']==True]))
print("\t Mean Precision score: %1.6f" % precision_score(test_vpn['tag:132'], preds[tcp_complete2['is_vpn']==True],average='micro'))
print("\t Mean Recall score: %1.3f" % recall_score(test_vpn['tag:132'], preds[tcp_complete2['is_vpn']==True],average='micro'))
print("\t Mean F1 score: %1.3f\n" % f1_score(test_vpn['tag:132'], preds[tcp_complete2['is_vpn']==True],average='micro'))
      
print("\t Evaluation Metrics  - NO VPN - Flows: " + str(len(test_novpn['tag:132'])) )

print("\t Mean Accuracy score: %1.6f" % accuracy_score(test_novpn['tag:132'], preds[tcp_complete2['is_vpn']==False]))
print("\t Mean Precision score: %1.6f" % precision_score(test_novpn['tag:132'], preds[tcp_complete2['is_vpn']==False],average='micro'))
print("\t Mean Recall score: %1.3f" % recall_score(test_novpn['tag:132'], preds[tcp_complete2['is_vpn']==False],average='micro'))
print("\t Mean F1 score: %1.3f\n" % f1_score(test_novpn['tag:132'], preds[tcp_complete2['is_vpn']==False],average='micro'))

	 Evaluation Metrics  - VPN - Flows: 22
	 Mean Accuracy score: 0.954545
	 Mean Precision score: 0.954545
	 Mean Recall score: 0.955
	 Mean F1 score: 0.955

	 Evaluation Metrics  - NO VPN - Flows: 629643
	 Mean Accuracy score: 0.999590
	 Mean Precision score: 0.999590
	 Mean Recall score: 1.000
	 Mean F1 score: 1.000



In [41]:
# Create confusion matrix for VPN only

pd.crosstab(test_vpn['tag:132'], preds[tcp_complete2['is_vpn']==True], rownames=['Actual Tags'], colnames=['Predicted tags'])


Predicted tags,3
Actual Tags,Unnamed: 1_level_1
2,1
3,21


In [42]:
# Create confusion matrix for NO VPN only

pd.crosstab(test_novpn['tag:132'], preds[tcp_complete2['is_vpn']==False], rownames=['Actual Tags'], colnames=['Predicted tags'])

Predicted tags,0,1,2,3
Actual Tags,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,485378,0,0,0
1,0,127204,39,2
2,0,139,13334,0
3,0,78,0,3469
