In [None]:
#!pip install scapy
#!pip install pandas
#!pip install matplotlib
#!pip install networkx
#!pip install polars
#!pip install pyarrow

In [None]:
from scapy.all import * # Packet manipulation
import polars as pl # Pandas - Create and Manipulate DataFrames
from datetime import datetime # Datetime - Convert Epoch to Datetime
from datetime import timezone # Timezone - Build UTC-aware datetimes
import ipaddress # IPAddress - Check for multicast and broadcast addresses
import os # OS - File existence checks and removal (imported explicitly rather than relying on the scapy star import)
import time # Measure time it takes to run
import csv # CSV - Write to CSV
import pyarrow.parquet as pq # PyArrow - Write to Parquet
import pyarrow.csv as pv # PyArrow - Read CSV
import re # Regex for name generation of files

#import networkx as nx # NetworkX - Create and Manipulate Graphs


In [None]:
# Capture file to process; both output file names are derived from it.
pcap_name = "The Ultimate PCAP v20221220.pcapng"

# Strip every character that is neither a word character nor whitespace
# (this also removes the dot before the extension), then join the remaining
# space-separated pieces with underscores.
_output_stem = "_".join(re.sub(r'[^\w\s]', '', pcap_name).split(" "))

csv_file_name = _output_stem + ".csv"
parquet_file_name = _output_stem + ".parquet"

In [None]:
# Protocol keyword -> IP protocol number.
# Source: https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml
# NOTE(review): some numbers are absent from this table (e.g. 13, 53) and 84 is
# listed under two names ('ttp' and 'iptm') - confirm against the IANA registry.
protocol_to_numbers = {'hopopt': 0, 'icmp': 1, 'igmp': 2, 'ggp': 3, 'ipv4': 4, 'st': 5, 'tcp': 6, 'cbt': 7,
    'egp': 8, 'igp': 9, 'bbn-rcc-mon': 10, 'nvp-ii': 11, 'pup': 12, 'emcon': 14, 'xnet': 15, 'chaos': 16, 'udp': 17, 'mux': 18,
    'dcn-meas': 19, 'hmp': 20, 'prm': 21, 'xns-idp': 22, 'trunk-1': 23, 'trunk-2': 24, 'leaf-1': 25, 'leaf-2': 26, 'rdp': 27,
    'irtp': 28, 'iso-tp4': 29, 'netblt': 30, 'mfe-nsp': 31, 'merit-inp': 32, 'dccp': 33, '3pc': 34, 'idpr': 35, 'xtp': 36,
    'ddp': 37, 'idpr-cmtp': 38, 'tp++': 39, 'il': 40, 'ipv6': 41, 'sdrp': 42, 'ipv6-route': 43, 'ipv6-frag': 44, 'idrp': 45,
    'rsvp': 46, 'gre': 47, 'dsr': 48, 'bna': 49, 'esp': 50, 'ah': 51, 'i-nlsp': 52, 'narp': 54, 'mobile': 55, 'tlsp': 56,
    'skip': 57, 'ipv6-icmp': 58, 'ipv6-nonxt': 59, 'ipv6-opts': 60, 'cftp': 62, 'sat-expak': 64, 'kryptolan': 65, 'rvd': 66,
    'ippc': 67, 'sat-mon': 69, 'visa': 70, 'ipcv': 71, 'cpnx': 72, 'cphb': 73, 'wsn': 74, 'pvp': 75, 'br-sat-mon': 76,
    'sun-nd': 77, 'wb-mon': 78, 'wb-expak': 79, 'iso-ip': 80, 'vmtp': 81, 'secure-vmtp': 82, 'vines': 83, 'ttp': 84,
    'iptm': 84, 'nsfnet-igp': 85, 'dgp': 86, 'tcf': 87, 'eigrp': 88, 'ospfigp': 89, 'sprite-rpc': 90,
    'larp': 91, 'mtp': 92, 'ax.25': 93, 'ipip': 94, 'scc-sp': 96, 'etherip': 97, 'encap': 98, 'gmtp': 100,
    'ifmp': 101, 'pnni': 102, 'pim': 103, 'aris': 104, 'scps': 105, 'qnx': 106, 'a/n': 107, 'ipcomp': 108,
    'snp': 109, 'compaq-peer': 110, 'ipx-in-ip': 111, 'vrrp': 112, 'pgm': 113, 'l2tp': 115, 'ddx': 116, 'iatp': 117,
    'stp': 118, 'srp': 119, 'uti': 120, 'smp': 121, 'ptp': 123, 'fire': 125, 'crtp': 126, 'crudp': 127, 'sscopmce': 128,
    'iplt': 129, 'sps': 130, 'pipe': 131, 'sctp': 132, 'fc': 133, 'rsvp-e2e-ignore': 134, 'udplite': 136, 'mpls-in-ip': 137,
      'manet': 138, 'hip': 139, 'shim6': 140, 'wesp': 141, 'rohc': 142, 'ethernet': 143, 'aggfrag': 144, 'rsvp-e2e': 145}

# Reverse index (protocol number -> keyword), built once so that the per-packet
# lookup is a single dict access instead of a scan over the whole table.
# Iterating in reverse makes the FIRST keyword listed for a number win, which is
# what a front-to-back scan of protocol_to_numbers would return (84 -> 'ttp').
_numbers_to_protocol = {
    number: name for name, number in reversed(list(protocol_to_numbers.items()))
}

def get_protocol_name(protocol_number):
    """Return the keyword for an IP protocol number.

    Args:
        protocol_number: Protocol number as carried in the IP header
            (for example 6 or 17).

    Returns:
        The matching key of ``protocol_to_numbers`` (the first one listed when
        several keys share a number), or ``"Unknown"`` when the number is not
        in the table.
    """
    return _numbers_to_protocol.get(protocol_number, "Unknown")

In [None]:
class PCAPToDataFrame:
    """Convert a packet capture into a Parquet file with one row per IP packet.

    Packets are streamed from the capture, the fields listed in ``headers`` are
    written to an intermediate CSV file, and that CSV is then converted to a
    snappy-compressed Parquet file and removed.

    Args:
        capture_file: Path of the capture to read. Defaults to the
            module-level ``pcap_name``.
        csv_file: Path of the intermediate CSV file. Defaults to the
            module-level ``csv_file_name``.
        parquet_file: Path of the Parquet file to produce. Defaults to the
            module-level ``parquet_file_name``.
    """

    # Destination addresses that identify a broadcast frame/packet.
    BROADCAST_MAC = "ff:ff:ff:ff:ff:ff"
    LIMITED_BROADCAST_IP = ipaddress.ip_address("255.255.255.255")

    def __init__(self, capture_file=None, csv_file=None, parquet_file=None):
        self.capture_file = pcap_name if capture_file is None else capture_file
        self.parquet_file = parquet_file_name if parquet_file is None else parquet_file
        self.csv_file_name = csv_file_name if csv_file is None else csv_file

        # Column order of the intermediate CSV (and therefore of the Parquet file).
        self.headers = ["time","src_ip","src_mac","dst_ip","dst_mac","protocol","payload_size","multicast","private_to_private","dst_broadcast","src_port","dst_port"]

        # The CSV writer only exists while read_pcap_to_dataframe() is running.
        # The CSV file itself (header included) is created there as well, so the
        # same instance can be run more than once.
        self.writer = None

    # Check if ip is multicast and private>private and broadcast
    def check_multicast_and_private(self, packet, data):
        """Add the ``multicast``, ``private_to_private`` and ``dst_broadcast`` flags to ``data``.

        Args:
            packet: Packet carrying an IP layer (an Ether layer is optional).
            data: Row dictionary, updated in place.

        ``dst_broadcast`` is True when the destination IP is the limited
        broadcast address 255.255.255.255, or when the frame is addressed to
        the Ethernet broadcast MAC. The MAC check also covers subnet-directed
        broadcasts, which cannot be recognised from the IP address alone
        because the netmask is not known here.
        """
        src_ip = ipaddress.ip_address(packet[IP].src)
        dst_ip = ipaddress.ip_address(packet[IP].dst)

        data["multicast"] = src_ip.is_multicast or dst_ip.is_multicast
        data["private_to_private"] = src_ip.is_private and dst_ip.is_private

        dst_mac_is_broadcast = (
            packet.haslayer(Ether)
            and str(packet[Ether].dst).lower() == self.BROADCAST_MAC
        )
        data["dst_broadcast"] = bool(
            dst_ip == self.LIMITED_BROADCAST_IP or dst_mac_is_broadcast
        )

    # Extract the port numbers
    def extract_port_numbers(self, packet, data):
        """Add ``src_port`` / ``dst_port`` to ``data`` for TCP or UDP packets.

        TCP is checked before UDP. Packets with neither layer leave ``data``
        untouched, which results in empty port cells in the CSV row.
        """
        for transport_layer in (TCP, UDP):
            if packet.haslayer(transport_layer):
                segment = packet[transport_layer]
                data["src_port"] = int(segment.sport)
                data["dst_port"] = int(segment.dport)
                return

    # Extract the conversations from the packet
    def conversations_extract(self, packet):
        """Write one CSV row for ``packet``; packets without IP and Ether layers are skipped.

        Intended as the per-packet callback of read_pcap_to_dataframe(), which
        sets ``self.writer`` before any packet is delivered.
        """
        # Check if the packet has the IP and Ethernet layers
        if not (packet.haslayer(IP) and packet.haslayer(Ether)):
            return

        ip_layer = packet[IP]
        ether_layer = packet[Ether]

        # Convert the (whole-second) epoch timestamp to a readable UTC time.
        # A timezone-aware conversion is used because datetime.utcfromtimestamp
        # is deprecated; the formatted string is the same.
        timestamp = datetime.fromtimestamp(
            int(packet.time), tz=timezone.utc
        ).strftime('%Y-%m-%d %H:%M:%S')

        # Extract the desired data from the packet
        data = {
            "time": timestamp,
            "src_ip": ip_layer.src,
            "src_mac": ether_layer.src,
            "dst_ip": ip_layer.dst,
            "dst_mac": ether_layer.dst,
            "protocol": get_protocol_name(int(ip_layer.proto)),
            "payload_size": len(ip_layer.payload)
        }

        # Add the address flags and, when available, the port numbers
        self.check_multicast_and_private(packet, data)
        self.extract_port_numbers(packet, data)

        # Write the row to the csv file
        self.writer.writerow(data)

    def read_pcap_to_dataframe(self):
        """Stream the capture into a CSV file, convert it to Parquet and print the elapsed time.

        Any existing CSV or Parquet file at the configured paths is replaced.
        The intermediate CSV is removed once the Parquet file has been written.
        """
        # Start time to measure how long the whole conversion takes
        start_time = time.time()

        # Mode 'w' discards a stale CSV from an earlier run; newline='' is what
        # the csv module requires so that no extra blank lines are produced on
        # platforms that translate line endings.
        with open(self.csv_file_name, 'w', newline='') as f:
            self.writer = csv.DictWriter(f, fieldnames=self.headers)
            self.writer.writeheader()
            try:
                # store=0: packets are handed to the callback and not kept in memory
                sniff(offline=self.capture_file, prn=self.conversations_extract, store=0)
            finally:
                # The writer must not outlive the file it writes to
                self.writer = None

        # Remove any previous parquet file
        if os.path.exists(self.parquet_file):
            os.remove(self.parquet_file)

        # Convert the CSV file to a Parquet file
        table = pv.read_csv(self.csv_file_name)
        pq.write_table(table, self.parquet_file, compression='snappy')

        # Remove the CSV file
        if os.path.exists(self.csv_file_name):
            os.remove(self.csv_file_name)

        # File size differences using test 12mb pcap file:
        # 12M  | pcapng
        # 2.6M | csv
        # 100K | parquet

        # Record time taken to process the PCAP file
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f'Elapsed time: {elapsed_time} seconds')

In [None]:
# Build the converter; called without arguments it works on the capture and
# output file names configured above (pcap_name / csv_file_name / parquet_file_name).
pcap_to_df = PCAPToDataFrame()

# Parse the capture and write the result to the Parquet file. The CSV is only an
# intermediate file and is removed again; the elapsed time is printed at the end.
pcap_to_df.read_pcap_to_dataframe()

In [None]:
# Load the Parquet file written by the conversion step into a Polars DataFrame
df = pl.read_parquet(parquet_file_name)
# Preview the first five rows
df.head(5)