forked from fanoping/NMLab-final
-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifierB.py
109 lines (88 loc) · 4.46 KB
/
classifierB.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score
import argparse
import json
import os
def _k_fold_cross_validation(k_fold, train_x, label):
    """Yield ``(train, valid, train_label, valid_label)`` once per fold.

    The samples are cut into ``k_fold`` contiguous chunks with
    ``np.array_split`` (no shuffling); each chunk serves as the validation
    set once while the remaining chunks are concatenated into the training
    set.
    """
    split = np.array_split(train_x, k_fold)
    split_label = np.array_split(label, k_fold)
    for val_idx in range(k_fold):
        train = np.concatenate(
            [part for idx, part in enumerate(split) if idx != val_idx])
        train_label = np.concatenate(
            [part for idx, part in enumerate(split_label) if idx != val_idx])
        yield train, split[val_idx], train_label, split_label[val_idx]


def _build_classifier(arch):
    """Return the unfitted classifier selected by ``arch`` (case-insensitive)."""
    name = arch.lower()
    if name == 'knn':
        return KNeighborsClassifier(n_neighbors=3)
    if name == 'tree':
        return DecisionTreeClassifier()
    if name == 'forest':
        return RandomForestClassifier(n_estimators=20, random_state=2)
    raise NotImplementedError(arch)


def main(args):
    """
    Scenario B:
    Feature extractor: CfsSubsetEval+BestFirst (SE+BF)
    Classifier: K nearest neighboring (K = 3), decision tree classifier, random forest
    Validation: k-folded cross validation

    Args:
        args: namespace providing ``train_csv``, ``test_csv`` and ``config``
            (file paths), ``k`` (number of folds) and ``arch`` (one of
            'knn', 'tree', 'forest'; case-insensitive).

    Returns:
        dict mapping the first enabled "info" column value of every test row
        to a dict of the remaining enabled "info" columns plus a 'Result'
        key holding the predicted label name.

    Raises:
        ValueError: if the config disables "Label", or if ``k`` is smaller
            than 2 or larger than the number of training samples.
        NotImplementedError: if ``arch`` is not a supported classifier.
        KeyError: if the training CSV contains a label name that is not in
            the known label table.

    Side effects:
        Prints the average validation accuracy and writes the result dict to
        ``output/scenarioB/<test csv name without extension>.json``.
    """
    csv_file = pd.read_csv(args.train_csv)
    test_csv_file = pd.read_csv(args.test_csv)
    # Close the config file deterministically instead of leaking the handle.
    with open(args.config) as config_file:
        config = json.load(config_file)['Scenario-B']

    # Fail loudly here: training is impossible without the label column.
    if not config["Label"]:
        raise ValueError("No label specified!")
    label = csv_file["Label"]

    selected = [attr for attr, usage in config["attribute"].items() if usage]

    # labels
    labels = {'BROWSING': 0, 'AUDIO': 1, 'CHAT': 2, 'MAIL': 3, 'P2P': 4,
              'FILE-TRANSFER': 5, 'VOIP': 6, 'VIDEO': 7}
    label_names = {value: key for key, value in labels.items()}

    # One row per sample, one column per selected feature.
    train_x = np.array([csv_file[attr] for attr in selected]).T
    train_label = np.array([labels[item] for item in label]).T
    test_x = np.array([test_csv_file[attr] for attr in selected]).T

    neigh = _build_classifier(args.arch)

    # Every fold needs a non-empty training part and a non-empty validation part.
    if args.k < 2 or args.k > len(train_x):
        raise ValueError(
            "k must be between 2 and the number of training samples "
            "({}), got {}".format(len(train_x), args.k))

    total = 0
    folds = _k_fold_cross_validation(args.k, train_x, train_label)
    for train, valid, fold_label, valid_label in folds:
        neigh.fit(train, fold_label)
        total += neigh.score(valid, valid_label)

    print("{} folded validation average accuracy: {:.6f}".format(args.k, total / args.k))

    # NOTE(review): the model used below is the one fitted on the LAST fold
    # only (it is never refitted on the full training set). Kept as-is so
    # predictions do not change; consider refitting on all data.
    predict = neigh.predict(test_x)

    # The first enabled "info" column is the record key; the rest are payload.
    info_columns = [attr for attr, usage in config["info"].items() if usage]
    info_names = info_columns[1:]
    rows = zip(*(test_csv_file[attr] for attr in info_columns))

    output_data = {}
    for row, predicted in zip(rows, predict):
        flow_id, *values = row
        record = dict(zip(info_names, values))
        record['Result'] = label_names[int(predicted)]
        output_data[flow_id] = record

    # Make sure the target directory exists before writing the report.
    os.makedirs("output/scenarioB", exist_ok=True)
    stem = os.path.splitext(os.path.basename(args.test_csv))[0]
    with open("output/scenarioB/{}.json".format(stem), "w") as f:
        json.dump(output_data, f, indent=4, sort_keys=False)

    return output_data
if __name__ == '__main__':
    # Command-line entry point: declare the options in one table, register
    # them on the parser, then hand the parsed namespace to main().
    cli = argparse.ArgumentParser('Classifier for Scenario B')
    option_table = (
        ('-k', {
            'default': 20,
            'type': int,
            'help': 'k folded cross validation',
        }),
        ('--train-csv', {
            'default': 'CSV/Scenario-B/TimeBasedFeatures-10s-Layer2.csv',
            'help': 'input information from csv file',
        }),
        ('--test-csv', {
            'default': 'realtime0.pcap_Flow.csv',
            'help': 'input information from csv file for testing',
        }),
        ('--config', {
            'default': 'config.json',
            'help': 'specify the selected feature',
        }),
        ('--arch', {
            'default': 'knn',
            'type': str,
            'help': 'classification method [knn, tree, forest]',
        }),
    )
    for flag, settings in option_table:
        cli.add_argument(flag, **settings)
    main(cli.parse_args())