In [6]:
type_identity_mp = {
    "TYPE_TCP_SYN": 1,
    "TYPE_TCP_FIN": 40,
    "TYPE_TCP_RST": 1,
    "TYPE_TCP_ACK": 1000,
    "TYPE_TCP": 1000,
    "TYPE_UDP": 3,
    "TYPE_ICMP": 10,
    "TYPE_IGMP": 9,
    "TYPE_UNKNOWN": 10,
}

In [4]:
import dpkt
import pandas as pd
import socket
import struct
import os

def pcap2csv_by_dpkt(filename: str, save_path: str = None, pcapng: bool = False) -> None:
    if not pcapng:
        fpcap = dpkt.pcap.Reader(open(filename, "rb"))
        suffix = ".pcap"
    else:
        fpcap = dpkt.pcapng.Reader(open(filename, "rb"))
        suffix = ".pcapng"
    all_fields = []
    for ts, buf in fpcap:
        eth = dpkt.ethernet.Ethernet(buf)
        if eth.type != dpkt.ethernet.ETH_TYPE_IP:
            continue
        ip = eth.data
        src_port = ""
        dst_port = ""
        tos = ip.tos
        id = ip.id
        ttl = ip.ttl
        chksum = ip.sum
        flags = ip._flags_offset
        payload = ""
        tcp_window = ""
        tcp_dataoffset = ""
        udp_length = ""
        protocol = ip.p
        proto_code = type_identity_mp["TYPE_UNKNOWN"]
        if ip.p == dpkt.ip.IP_PROTO_TCP:
            tcp = ip.data
            payload = tcp.data
            src_port = tcp.sport
            dst_port = tcp.dport
            tcp_window = tcp.win
            tcp_dataoffset = tcp.off
            # check tcp flags
            if tcp.flags & dpkt.tcp.TH_SYN:
                proto_code = type_identity_mp["TYPE_TCP_SYN"]
            elif tcp.flags & dpkt.tcp.TH_FIN:
                proto_code = type_identity_mp["TYPE_TCP_FIN"]
            elif tcp.flags & dpkt.tcp.TH_RST:
                proto_code = type_identity_mp["TYPE_TCP_RST"]
            else:
                proto_code = type_identity_mp["TYPE_TCP"]
        elif ip.p == dpkt.ip.IP_PROTO_UDP:
            udp = ip.data
            if not isinstance(udp, dpkt.udp.UDP):
                print("udp is not instance of dpkt.udp.UDP")
                print(f"protocol: {ip.p}")
                continue
            payload = udp.data
            src_port = udp.sport
            dst_port = udp.dport
            udp_length = udp.ulen
            proto_code = type_identity_mp["TYPE_UDP"]
        else:
            proto_code = type_identity_mp["TYPE_UNKNOWN"]

        src_addr = struct.unpack("!I", ip.src)[0]
        dst_addr = struct.unpack("!I", ip.dst)[0]
        src_ip = socket.inet_ntoa(ip.src)
        dst_ip = socket.inet_ntoa(ip.dst)
        pkt_length = len(buf)
        all_fields.append([src_addr, dst_addr, src_ip, dst_ip, src_port, dst_port, 
                           protocol, proto_code, pkt_length, ts,
                           tos, id, ttl, chksum, flags, tcp_window, tcp_dataoffset, udp_length,
                           payload])
    if save_path is None:
        try:
            os.makedirs(os.path.dirname(filename), exist_ok=True)
        except:
            pass
    sp = save_path if save_path is not None else filename.replace(suffix, ".csv")
    pd.DataFrame(all_fields).to_csv(sp, sep=","
                , header=["src_addr", "dst_addr", "src_ip", "dst_ip", "src_port", "dst_port", 
                        "protocol", "proto_code", "pkt_length", "timestamp",
                        "tos", "id", "ttl", "chksum", "flags", "tcp_window", "tcp_dataoffset", "udp_length",
                        "payload"], index=False)

In [13]:
import os
from utils import pcap2csv_by_dpkt

filename = os.path.join("train_set", "benign5.pcapng")
pcap2csv_by_dpkt(filename, pcapng=True)
# pcap2csv_by_scapy(filename, pcapng=True)

udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17


In [16]:
import dpkt

filename = os.path.join("dataset_lite", "SQL_Injection.pcap")
fpcap = dpkt.pcap.Reader(open(filename, "rb"))
for ts, buf in fpcap:
    eth = dpkt.ethernet.Ethernet(buf)
    if eth.type != dpkt.ethernet.ETH_TYPE_IP:
        continue
    ip = eth.data
    if ip.p == dpkt.ip.IP_PROTO_TCP:
        tcp = ip.data
        print(tcp.data)

b''
b''
b''
b'GET /DVWA/vulnerabilities/sqli/?id=3+%27&Submit=Submit HTTP/1.1\r\nHost: 18.218.83.150\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-US,en;q=0.5\r\nAccept-Encoding: gzip, deflate\r\nReferer: http://18.218.83.150/DVWA/vulnerabilities/sqli/\r\nCookie: security=low; PHPSESSID=1ahkodmftscc0arub5fbqie6d0\r\nConnection: keep-alive\r\nUpgrade-Insecure-Requests: 1\r\n\r\n'
b''
b"HTTP/1.1 200 OK\r\nDate: Thu, 22 Feb 2018 20:16:30 GMT\r\nServer: Apache/2.4.18 (Ubuntu)\r\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\r\nCache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0\r\nPragma: no-cache\r\nVary: Accept-Encoding\r\nContent-Encoding: gzip\r\nContent-Length: 140\r\nKeep-Alive: timeout=5, max=100\r\nConnection: Keep-Alive\r\nContent-Type: text/html; charset=UTF-8\r\n\r\n\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03-\x8e1\x0e\x021\

In [17]:
from scapy.all import IP, TCP, UDP, PcapReader, PcapNgReader

filename = os.path.join("dataset_lite", "SQL_Injection.pcap")
fpcap = PcapReader(filename)
for pkt in fpcap:
        if pkt.haslayer(IP):
            if pkt.haslayer(TCP):
                tcp = pkt[TCP]
                print(bytes(tcp.payload))

b''
b''
b''
b'GET /DVWA/vulnerabilities/sqli/?id=3+%27&Submit=Submit HTTP/1.1\r\nHost: 18.218.83.150\r\nUser-Agent: Mozilla/5.0 (Windows NT 10.0; WOW64; rv:58.0) Gecko/20100101 Firefox/58.0\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\nAccept-Language: en-US,en;q=0.5\r\nAccept-Encoding: gzip, deflate\r\nReferer: http://18.218.83.150/DVWA/vulnerabilities/sqli/\r\nCookie: security=low; PHPSESSID=1ahkodmftscc0arub5fbqie6d0\r\nConnection: keep-alive\r\nUpgrade-Insecure-Requests: 1\r\n\r\n'
b''
b"HTTP/1.1 200 OK\r\nDate: Thu, 22 Feb 2018 20:16:30 GMT\r\nServer: Apache/2.4.18 (Ubuntu)\r\nExpires: Thu, 19 Nov 1981 08:52:00 GMT\r\nCache-Control: no-store, no-cache, must-revalidate, post-check=0, pre-check=0\r\nPragma: no-cache\r\nVary: Accept-Encoding\r\nContent-Encoding: gzip\r\nContent-Length: 140\r\nKeep-Alive: timeout=5, max=100\r\nConnection: Keep-Alive\r\nContent-Type: text/html; charset=UTF-8\r\n\r\n\x1f\x8b\x08\x00\x00\x00\x00\x00\x00\x03-\x8e1\x0e\x021\

In [2]:
from utils import pcap2csv_by_dpkt
import os

filename = "Friday-WorkingHours.pcapng"
pcap2csv_by_dpkt(filename, pcapng=True)

udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance of dpkt.udp.UDP
protocol: 17
udp is not instance 

In [1]:
import pandas as pd

csv_file = "Friday-WorkingHours.csv"
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,src_addr,dst_addr,src_ip,dst_ip,src_port,dst_port,protocol,proto_code,pkt_length,timestamp,tos,id,ttl,chksum,flags,payload
0,3232238130,3232238083,192.168.10.50,192.168.10.3,56108.0,3268.0,6,1000,469,1499429000.0,0,50432,64,57002,16384,b'\x00\x00\x01\x8f`\x82\x01\x8b\x06\t*\x86H\x8...
1,3232238130,3232238083,192.168.10.50,192.168.10.3,56108.0,3268.0,6,1000,469,1499429000.0,0,50432,64,57002,16384,b'\x00\x00\x01\x8f`\x82\x01\x8b\x06\t*\x86H\x8...
2,3232238083,3232238130,192.168.10.3,192.168.10.50,3268.0,56108.0,6,1000,138,1499429000.0,0,24566,128,1280,16384,b'\x00\x00\x00D`B\x06\t*\x86H\x86\xf7\x12\x01\...
3,3232238083,3232238130,192.168.10.3,192.168.10.50,3268.0,56108.0,6,1000,138,1499429000.0,0,24566,128,1280,16384,b'\x00\x00\x00D`B\x06\t*\x86H\x86\xf7\x12\x01\...
4,3232238130,3232238083,192.168.10.50,192.168.10.3,56108.0,3268.0,6,1000,66,1499429000.0,0,50433,64,57404,16384,b''


In [2]:
print(df.shape)

(9915484, 16)


In [7]:
import os
labled_df = pd.read_csv(os.path.join("data", "Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv"))
label_dict = {}
for i, row in labled_df.iterrows():
    flow_id = f"{row[' Source IP']}-{row[' Destination IP']}-{row[' Source Port']}-{row[' Destination Port']}-{row[' Protocol']}"
    label_dict[row["Flow ID"]] = row[" Label"]

import json
with open(os.path.join("data", "Friday.json"), "w") as f:
    json.dump(label_dict, f)

In [8]:
# df_small = df.sample(frac=0.001, random_state=42)
df_small = df[:10000]

for i, row in df_small.iterrows():
    flow_id = f"{row['src_ip']}-{row['dst_ip']}-{row['src_port']}-{row['dst_port']}-{row['protocol']}"
    if flow_id in label_dict:
        df_small.loc[i, "label"] = label_dict[flow_id]
    else:
        df_small.loc[i, "label"] = "UNKNOWN"

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_small.loc[i, "label"] = "UNKNOWN"


In [10]:
df_small.to_csv("Friday-WorkingHours-small.csv", index=False)