# パケット異常検知

## データの準備

* kddcupのデータをインポートし，それぞれの特徴に対して特徴名を付与する

In [2]:
>>> import pandas
>>> 
>>> col_names = ["duration","protocol_type","service","flag","src_bytes",
...    "dst_bytes","land","wrong_fragment","urgent","hot","num_failed_logins",
...    "logged_in","num_compromised","root_shell","su_attempted","num_root",
...    "num_file_creations","num_shells","num_access_files","num_outbound_cmds",
...    "is_host_login","is_guest_login","count","srv_count","serror_rate",
...    "srv_serror_rate","rerror_rate","srv_rerror_rate","same_srv_rate",
...    "diff_srv_rate","srv_diff_host_rate","dst_host_count","dst_host_srv_count",
...    "dst_host_same_srv_rate","dst_host_diff_srv_rate","dst_host_same_src_port_rate",
...    "dst_host_srv_diff_host_rate","dst_host_serror_rate","dst_host_srv_serror_rate",
...    "dst_host_rerror_rate","dst_host_srv_rerror_rate","label"]
>>> 
>>> kdd_data_10percent = pandas.read_csv("kddcup99/kddcup.data_10_percent", header=None, names = col_names)

* kdd_data_10percentのprotocol_typeを参照

In [3]:
print(kdd_data_10percent.protocol_type)

0         tcp
1         tcp
2         tcp
3         tcp
4         tcp
5         tcp
6         tcp
7         tcp
8         tcp
9         tcp
10        tcp
11        tcp
12        tcp
13        tcp
14        tcp
15        tcp
16        tcp
17        tcp
18        tcp
19        tcp
20        tcp
21        tcp
22        tcp
23        tcp
24        tcp
25        tcp
26        tcp
27        tcp
28        tcp
29        tcp
         ... 
493991    tcp
493992    tcp
493993    tcp
493994    tcp
493995    tcp
493996    tcp
493997    tcp
493998    tcp
493999    tcp
494000    tcp
494001    tcp
494002    tcp
494003    tcp
494004    tcp
494005    tcp
494006    tcp
494007    tcp
494008    tcp
494009    tcp
494010    tcp
494011    tcp
494012    tcp
494013    tcp
494014    tcp
494015    tcp
494016    tcp
494017    tcp
494018    tcp
494019    tcp
494020    tcp
Name: protocol_type, Length: 494021, dtype: object


### データの整形

フィーチャの要素のうち文字列のものにダミー変数を割り振り，数値化する．**(例) http,https=>0,1 など**  
文字列で構成されるフィーチャはprotocol_type，service_list，flag_listの３種であるのが目視で確認できたので，これらに対して適用する．　　

* それぞれのフィーチャの集合を求める => to_set_elems関数
* フィーチャの集合に番号を振る => add_to_index関数
 - この番号がダミー変数となる

In [53]:
protocol_type = kdd_data_10percent['protocol_type']
service = kdd_data_10percent['service']
flag = kdd_data_10percent['flag']

protocol_type_set = service_set = flag_set = []

def to_set_elems(data):
    elems_list = []
    for elem in data:
        elems_list.append(elem)

    return set(elems_list)

def count_set_elems(data):
    return len(to_set_elems(data))

def add_to_index(data):
    temp = []
    for (idx, elem) in enumerate(data):
        temp.append((elem, idx))
    
    return dict(temp)
        
protocol_type_count, service_count, flag_count \
        = map(count_set_elems, (protocol_type, service, flag))
# print(protocol_type_counts, service_counts, flag_counts) # => (3, 66, 11)
protocol_type_set, service_set, flag_set\
        = map(to_set_elems, (protocol_type, service, flag))
# print(protocol_type_set) # => set(['udp', 'icmp', 'tcp'])

print(add_to_index(service_set))



{'domain': 0, 'http_443': 46, 'Z39_50': 3, 'smtp': 4, 'urp_i': 2, 'private': 6, 'echo': 7, 'shell': 34, 'red_i': 9, 'eco_i': 10, 'sunrpc': 11, 'ftp_data': 12, 'urh_i': 13, 'pm_dump': 14, 'pop_3': 15, 'pop_2': 16, 'systat': 17, 'ftp': 18, 'uucp': 19, 'whois': 20, 'netbios_dgm': 21, 'efs': 22, 'remote_job': 23, 'daytime': 25, 'ntp_u': 26, 'finger': 27, 'ldap': 28, 'netbios_ns': 29, 'kshell': 30, 'iso_tsap': 31, 'ecr_i': 32, 'nntp': 33, 'printer': 8, 'domain_u': 35, 'uucp_path': 36, 'courier': 37, 'exec': 38, 'time': 61, 'netstat': 40, 'telnet': 41, 'gopher': 5, 'rje': 42, 'sql_net': 24, 'link': 44, 'auth': 45, 'netbios_ssn': 1, 'csnet_ns': 47, 'X11': 48, 'IRC': 49, 'tftp_u': 50, 'login': 63, 'supdup': 52, 'name': 53, 'nnsp': 54, 'mtp': 55, 'http': 56, 'bgp': 57, 'ctf': 58, 'hostnames': 43, 'klogin': 59, 'vmnet': 60, 'tim_i': 39, 'discard': 62, 'imap4': 51, 'other': 64, 'ssh': 65}
