In [None]:
from utils import pcap2csv_by_dpkt
import os

# filename = "dataset/benign_small.pcapng"
# pcap2csv_by_dpkt(filename, pcapng=True)

# dir_name = "attack_set"
dir_name = "cic-ids"
for filename in os.listdir(dir_name):
    if filename.endswith("pcap"):
        print(filename)
        pcap2csv_by_dpkt(os.path.join(dir_name, filename))
    elif filename.endswith("pcapng"):
        print(filename)
        pcap2csv_by_dpkt(os.path.join(dir_name, filename), pcapng=True)

In [3]:
pcap2csv_by_dpkt("cic-ids-benign/benign.pcapng", pcapng=True)

In [3]:
import os
filenames = list(filter(lambda x: x.endswith(".pcap") or x.endswith(".pcapng"), os.listdir("dataset_lite")))
for filename in filenames:
    file_path = os.path.join("dataset_lite", filename)
    pcapng = file_path.endswith(".pcapng")
    pcap2csv_by_dpkt(file_path, pcapng=pcapng)

dataset_lite/osscan.pcap
dataset_lite/SQL_Injection.pcap
dataset_lite/BruteForce-Web.pcap
dataset_lite/ssldosA10only.pcap
dataset_lite/BruteForce-XSS.pcap
dataset_lite/infiltration.pcap


ValueError: invalid tcpdump header

test pcap parsing functions

In [18]:
import dpkt
import socket
import struct
import os
import pandas as pd

type_identity_mp = {
    "TYPE_TCP_SYN": 1,
    "TYPE_TCP_FIN": 40,
    "TYPE_TCP_RST": 1,
    "TYPE_TCP_ACK": 1000,
    "TYPE_TCP": 1000,
    "TYPE_UDP": 3,
    "TYPE_ICMP": 10,
    "TYPE_IGMP": 9,
    "TYPE_UNKNOWN": 10,
}

In [45]:
def pcap2csv(filename: str, save_path: str = None, pcapng: bool = False) -> None:
    if not pcapng:
        fpcap = dpkt.pcap.Reader(open(filename, "rb"))
        suffix = ".pcap"
    else:
        fpcap = dpkt.pcapng.Reader(open(filename, "rb"))
        suffix = ".pcapng"
    all_fields = []
    for ts, buf in fpcap:
        eth = dpkt.ethernet.Ethernet(buf)
        if eth.type != dpkt.ethernet.ETH_TYPE_IP:
            continue
        ip = eth.data
        src_port = ""
        dst_port = ""
        protocol = ip.p
        proto_code = type_identity_mp["TYPE_UNKNOWN"]
        if ip.p == dpkt.ip.IP_PROTO_TCP:
            tcp = ip.data
            src_port = tcp.sport
            dst_port = tcp.dport
            # check tcp flags
            if tcp.flags & dpkt.tcp.TH_SYN:
                proto_code = type_identity_mp["TYPE_TCP_SYN"]
            elif tcp.flags & dpkt.tcp.TH_FIN:
                proto_code = type_identity_mp["TYPE_TCP_FIN"]
            elif tcp.flags & dpkt.tcp.TH_RST:
                proto_code = type_identity_mp["TYPE_TCP_RST"]
            else:
                proto_code = type_identity_mp["TYPE_TCP"]
        elif ip.p == dpkt.ip.IP_PROTO_UDP:
            udp = ip.data
            if not isinstance(udp, dpkt.udp.UDP):
                print("udp is not instance of dpkt.udp.UDP")
                print(f"protocol: {ip.p}")
                continue
            src_port = udp.sport
            dst_port = udp.dport
            proto_code = type_identity_mp["TYPE_UDP"]
        else:
            proto_code = type_identity_mp["TYPE_UNKNOWN"]

        src_addr = struct.unpack("!I", ip.src)[0]
        dst_addr = struct.unpack("!I", ip.dst)[0]
        src_ip = socket.inet_ntoa(ip.src)
        dst_ip = socket.inet_ntoa(ip.dst)
        pkt_length = len(buf)
        all_fields.append([src_addr, dst_addr, src_ip, dst_ip, src_port, dst_port, protocol, proto_code, pkt_length, ts])
    if save_path is None:
        try:
            os.makedirs(os.path.dirname(filename), exist_ok=True)
        except:
            pass
    sp = save_path if save_path is not None else filename.replace(suffix, ".csv")
    pd.DataFrame(all_fields).to_csv(sp, sep=","
                , header=["src_addr", "dst_addr", "src_ip", "dst_ip", "src_port", "dst_port", 
                        "protocol", "proto_code", "pkt_length", "timestamp"], index=False)

In [41]:
from scapy.all import IP, TCP, UDP, PcapReader, PcapNgReader

def pcap2csv_by_scapy(filename: str, save_path: str = None, pcapng: bool = False):
    if not pcapng:
        fpcap = PcapReader(filename)
        suffix = ".pcap"
    else:
        fpcap = PcapNgReader(filename)
        suffix = ".pcapng"
    all_fields = []
    for pkt in fpcap:
        if pkt.haslayer(IP):
            ip = pkt[IP]
            src_ip = ip.src
            dst_ip = ip.dst
            src_port = ""
            dst_port = ""
            protocol = ip.proto
            proto_code = type_identity_mp["TYPE_UNKNOWN"]
            if ip.proto == 6:
                tcp = pkt[TCP]
                src_port = tcp.sport
                dst_port = tcp.dport
                # check tcp flags
                if tcp.flags & 0x02:
                    proto_code = type_identity_mp["TYPE_TCP_SYN"]
                elif tcp.flags & 0x01:
                    proto_code = type_identity_mp["TYPE_TCP_FIN"]
                elif tcp.flags & 0x04:
                    proto_code = type_identity_mp["TYPE_TCP_RST"]
                else:
                    proto_code = type_identity_mp["TYPE_TCP"]
            elif ip.proto == 17:
                if not pkt.haslayer(UDP):
                    print(f"invalid udp packet: {pkt.show()}")
                    continue
                udp = pkt[UDP]
                src_port = udp.sport
                dst_port = udp.dport
                proto_code = type_identity_mp["TYPE_UDP"]
                # print(f"udp sport: {udp.sport} -> dport: {udp.dport}")
            else:
                proto_code = type_identity_mp["TYPE_UNKNOWN"]
            pkt_length = pkt.len
        ts = pkt.time
        all_fields.append([src_ip, dst_ip, src_port, dst_port, protocol, proto_code, pkt_length, ts])
    if save_path is None:
        try:
            os.makedirs(os.path.dirname(filename), exist_ok=True)
        except:
            pass
    sp = save_path if save_path is not None else filename.replace(suffix, ".csv")
    pd.DataFrame(all_fields).to_csv(sp, sep=","
                , header=["src_ip", "dst_ip", "src_port", "dst_port", 
                        "protocol",  "proto_code", "pkt_length", "timestamp"], index=False)

In [None]:
# from utils import pcap2csv
import os
import pandas as pd

pcap_files = os.listdir("dataset")
for pcap_file in pcap_files:
    filename = os.path.join("dataset", pcap_file)
    print(f"{filename} is being processed...")
    pcap2csv(filename)

In [1]:
import os
import pandas as pd
from utils import pcap2csv

filename = os.path.join("train_set", "benign_test.pcapng")
# filename = os.path.join("train_set", "benign.pcapng")
# filename = os.path.join("dataset", "cut20200610.pcapng")
pcap2csv(filename, pcapng=True)
# pcap2csv_by_scapy(filename, pcapng=True)

udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance 

In [6]:
df = pd.read_csv("dataset/osscan.csv")
group_protocol = df.groupby("protocol")
print(group_protocol.size())

protocol
1       25
6     2306
17       6
dtype: int64


parse pcap by tshark

In [4]:
import subprocess
import os

input_file = os.path.join("dataset", "osscan.pcap")
output_file = input_file.replace(".pcap", ".csv")
fields = "-e frame.time_epoch -e frame.len -e eth.src -e eth.dst -e ip.src -e ip.dst -e ip.proto -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport -e ipv6.src -e ipv6.dst"
cmd = f"tshark -r {input_file}  -T fields {fields} -E header=y -E separator=, > {output_file}"
print(cmd)
subprocess.call(cmd, shell=True)

tshark -r dataset/osscan.pcap  -T fields -e frame.time_epoch -e frame.len -e eth.src -e eth.dst -e ip.src -e ip.dst -e ip.proto -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport -e ipv6.src -e ipv6.dst -E header=y -E separator=, > dataset/osscan.csv


Running as user "root" and group "root". This could be dangerous.


0

preprocess data

In [None]:
import os
import pandas as pd
from utils import pcap2csv

filenames = os.listdir("dataset_lite")
for filename in filenames:
    file_path = os.path.join("dataset_lite", filename)
    print(f"{file_path} is being processed...")
    pcap2csv(file_path)

In [1]:
import os
from utils import pcap2csv_by_tshark

filename = os.path.join("dataset", "cut20200610.pcap")
# pcap2csv(filename)
pcap2csv_by_tshark(filename)

tshark -r dataset/cut20200610.pcap  -T fields -e frame.time_epoch -e frame.len -e eth.src -e eth.dst -e ip.src -e ip.dst -e ip.proto -e tcp.srcport -e tcp.dstport -e udp.srcport -e udp.dstport -e ipv6.src -e ipv6.dst -E header=y -E separator=, > dataset/cut20200610.csv


Running as user "root" and group "root". This could be dangerous.


In [3]:
import os
import pandas as pd

csv_files = list(filter(lambda x: x.endswith(".csv"), os.listdir("dataset_lite")))
for csv_file in csv_files:
    df = pd.read_csv(os.path.join("dataset_lite", csv_file))
    group_protocol = df.groupby(["src_ip"])
    print(f"{csv_file}")
    print(group_protocol.size())

osscan.csv
src_ip
10.0.0.1    1207
10.0.0.4    1130
dtype: int64
ssldosA10only.csv
src_ip
10.0.0.1    50288
dtype: int64
BruteForce-Web.csv
src_ip
172.31.69.28      7388
18.218.115.60    10864
dtype: int64
SQL_Injection.csv
src_ip
172.31.69.28      75
18.218.115.60    103
dtype: int64
BruteForce-XSS.csv
src_ip
172.31.69.28     3972
18.218.115.60    7727
dtype: int64


In [4]:
csv_files = list(filter(lambda x: x.endswith(".csv"), os.listdir("dataset_lite")))
for csv_file in csv_files:
    df = pd.read_csv(os.path.join("dataset_lite", csv_file))
    group_protocol = df.groupby(["src_addr", "dst_addr", "src_port", "dst_port", "protocol"])
    print(f"{csv_file}")
    print(group_protocol.size())

osscan.csv
src_addr   dst_addr    src_port  dst_port  protocol
167772161  167772164   33900.0   22.0      6            5
                       38485.0   22.0      6           10
                       38486.0   22.0      6           10
                       38487.0   22.0      6           10
                       38488.0   22.0      6           10
                                                       ..
167772164  167772161   64680.0   41237.0   6            1
                       65000.0   41237.0   6            1
                       65129.0   41237.0   6            1
                       65389.0   41237.0   6            1
           3758096635  5353.0    5353.0    17           1
Length: 2044, dtype: int64
ssldosA10only.csv
src_addr   dst_addr   src_port  dst_port  protocol
167772161  167772162  34508     443       6               4
                      36386     443       6               4
                      40410     443       6               4
                      4