In [None]:
#!pip install scapy
#!pip install pandas
#!pip install networkx
#!pip install polars
#!pip install aspose-diagram-python 
#!pip install matplotlib
#!pip install spacy
#!python -m spacy download en_core_web_sm

In [None]:
from scapy.all import * # Packet manipulation
import polars as pl # Polars - Create and Manipulate DataFrames
from datetime import datetime # Datetime - Convert Epoch to Datetime
import ipaddress # IPAddress - Check for multicast and broadcast addresses
import time # Measure time it takes to run
import csv # CSV - Write to CSV
import re # Regex for name generation of files
import queue # Queue - Used for threading
from threading import Thread # Threading - Run multiple functions at once
import networkx as nx # NetworkX - Create and Manipulate Graphs
import os # OS - Check if file exists
import matplotlib.pyplot as plt # Matplotlib - Plot Graphs
import numpy as np # Numpy - Create Arrays
import spacy # Spacy - NLP
#import aspose.diagram
#from aspose.diagram import *

In [None]:
# Capture file to analyse; both output file names below are derived from it.
pcap_name = "The Ultimate PCAP v20221220.pcapng"

# Strip everything that is neither a word character nor whitespace (this also
# removes the dot, so the extension is fused into the stem), then replace
# spaces with underscores.
_output_stem = re.sub(r'[^\w\s]', '', pcap_name).replace(" ", "_")

csv_file_name = _output_stem + ".csv"          # intermediate CSV
parquet_file_name = _output_stem + ".parquet"  # final Parquet output

In [None]:
# Protocol keyword -> protocol number, as used in the IP header "proto" field.
# https://www.iana.org/assignments/protocol-numbers/protocol-numbers.xhtml
protocol_to_numbers = {
    'hopopt': 0, 'icmp': 1, 'igmp': 2, 'ggp': 3, 'ipv4': 4, 'st': 5, 'tcp': 6, 'cbt': 7,
    'egp': 8, 'igp': 9, 'bbn-rcc-mon': 10, 'nvp-ii': 11, 'pup': 12, 'emcon': 14, 'xnet': 15, 'chaos': 16, 'udp': 17, 'mux': 18,
    'dcn-meas': 19, 'hmp': 20, 'prm': 21, 'xns-idp': 22, 'trunk-1': 23, 'trunk-2': 24, 'leaf-1': 25, 'leaf-2': 26, 'rdp': 27,
    'irtp': 28, 'iso-tp4': 29, 'netblt': 30, 'mfe-nsp': 31, 'merit-inp': 32, 'dccp': 33, '3pc': 34, 'idpr': 35, 'xtp': 36,
    'ddp': 37, 'idpr-cmtp': 38, 'tp++': 39, 'il': 40, 'ipv6': 41, 'sdrp': 42, 'ipv6-route': 43, 'ipv6-frag': 44, 'idrp': 45,
    'rsvp': 46, 'gre': 47, 'dsr': 48, 'bna': 49, 'esp': 50, 'ah': 51, 'i-nlsp': 52, 'narp': 54, 'mobile': 55, 'tlsp': 56,
    'skip': 57, 'ipv6-icmp': 58, 'ipv6-nonxt': 59, 'ipv6-opts': 60, 'cftp': 62, 'sat-expak': 64, 'kryptolan': 65, 'rvd': 66,
    'ippc': 67, 'sat-mon': 69, 'visa': 70, 'ipcv': 71, 'cpnx': 72, 'cphb': 73, 'wsn': 74, 'pvp': 75, 'br-sat-mon': 76,
    'sun-nd': 77, 'wb-mon': 78, 'wb-expak': 79, 'iso-ip': 80, 'vmtp': 81, 'secure-vmtp': 82, 'vines': 83, 'ttp': 84,
    'iptm': 84, 'nsfnet-igp': 85, 'dgp': 86, 'tcf': 87, 'eigrp': 88, 'ospfigp': 89, 'sprite-rpc': 90,
    'larp': 91, 'mtp': 92, 'ax.25': 93, 'ipip': 94, 'scc-sp': 96, 'etherip': 97, 'encap': 98, 'gmtp': 100,
    'ifmp': 101, 'pnni': 102, 'pim': 103, 'aris': 104, 'scps': 105, 'qnx': 106, 'a/n': 107, 'ipcomp': 108,
    'snp': 109, 'compaq-peer': 110, 'ipx-in-ip': 111, 'vrrp': 112, 'pgm': 113, 'l2tp': 115, 'ddx': 116, 'iatp': 117,
    'stp': 118, 'srp': 119, 'uti': 120, 'smp': 121, 'ptp': 123, 'fire': 125, 'crtp': 126, 'crudp': 127, 'sscopmce': 128,
    'iplt': 129, 'sps': 130, 'pipe': 131, 'sctp': 132, 'fc': 133, 'rsvp-e2e-ignore': 134, 'udplite': 136, 'mpls-in-ip': 137,
    'manet': 138, 'hip': 139, 'shim6': 140, 'wesp': 141, 'rohc': 142, 'ethernet': 143, 'aggfrag': 144, 'rsvp-e2e': 145,
}

# Reverse lookup (number -> keyword), built once so that resolving a packet's
# protocol is a single dict access instead of a scan over the whole table.
# Iterating in reverse makes the FIRST keyword listed for a number win
# (84 maps to both 'ttp' and 'iptm'; 'ttp' is kept, as a linear scan would).
# The map is a snapshot: later changes to protocol_to_numbers are not reflected.
_numbers_to_protocol = {
    number: name for name, number in reversed(list(protocol_to_numbers.items()))
}


def get_protocol_name(protocol_number):
    """Translate an IP protocol number into its keyword.

    Args:
        protocol_number: Protocol number taken from the IP header (an int).

    Returns:
        The keyword from ``protocol_to_numbers`` (e.g. ``'tcp'`` for 6), or
        ``protocol_number`` itself, unchanged, when the number is not listed.
    """
    return _numbers_to_protocol.get(protocol_number, protocol_number)

In [None]:
class PCAPToDataFrame:
    """Convert a packet capture into a Parquet file of per-packet records.

    Packets that carry both an Ethernet and an IP layer are flattened into one
    row each (addresses, protocol, payload size, classification flags, ports).
    Rows are buffered in ``self.queue``, flushed to an intermediate CSV file in
    batches, and the CSV is finally converted to Parquet and deleted.
    """

    # Layer-2 broadcast destination address.
    _BROADCAST_MAC = "ff:ff:ff:ff:ff:ff"

    def __init__(self, pcap_name, parquet_file_name, csv_file_name):
        """Store the file locations and start from a clean intermediate CSV.

        Args:
            pcap_name: Path of the capture file to read.
            parquet_file_name: Path of the Parquet file to produce.
            csv_file_name: Path of the intermediate CSV file. An existing file
                at this path is deleted.
        """
        self.capture_file = pcap_name
        self.parquet_file = parquet_file_name

        # Create new CSV file and remove old one
        self.csv_file_name = csv_file_name
        self.headers = ["time","src_ip","src_mac","dst_ip","dst_mac","protocol","payload_size",
                        "multicast","private_to_private","dst_broadcast","src_port","dst_port",
                        "localhost"]
        if os.path.exists(self.csv_file_name):
            os.remove(self.csv_file_name)

        self.queue = []
        self.batch_size = 5000  # Number of packets to buffer before writing to file

    def write_to_csv(self):
        """Append the buffered rows to the CSV file and drop them from the queue.

        The header row is written only when the file does not exist yet or is
        empty, so flushing several batches produces a single header. A ``None``
        entry in the queue acts as a stop marker: it is consumed and the rows
        queued after it are left for the next call.
        """
        needs_header = (not os.path.exists(self.csv_file_name)
                        or os.path.getsize(self.csv_file_name) == 0)

        # newline='' is what the csv module expects for files it writes;
        # without it, platforms that translate newlines emit blank lines.
        with open(self.csv_file_name, 'a', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=self.headers)
            if needs_header:
                writer.writeheader()

            consumed = 0
            try:
                for row in self.queue:
                    consumed += 1
                    if row is None:
                        break
                    writer.writerow(row)
            finally:
                # Remove everything that was taken off the queue in one slice
                # (popping from the front row by row is quadratic).
                del self.queue[:consumed]

    # Check if ip is multicast and private>private and broadcast
    def check_multicast_and_private(self, packet, data):
        """Add the address classification flags of ``packet`` to ``data``.

        Args:
            packet: Packet that has an IP layer.
            data: Row dict, updated in place with the keys ``multicast``,
                ``private_to_private``, ``localhost`` and ``dst_broadcast``.
        """
        src_ip = ipaddress.ip_address(packet[IP].src)
        dst_ip = ipaddress.ip_address(packet[IP].dst)

        data["multicast"] = src_ip.is_multicast or dst_ip.is_multicast
        data["private_to_private"] = src_ip.is_private and dst_ip.is_private
        data["localhost"] = src_ip.is_loopback or dst_ip.is_loopback

        # A destination is treated as broadcast when it is the limited
        # broadcast address or when the frame was sent to the broadcast MAC
        # (subnet-directed broadcasts cannot be recognised from the IP alone
        # because the netmask is unknown).
        sent_to_broadcast_mac = (
            packet.haslayer(Ether)
            and str(packet[Ether].dst).lower() == self._BROADCAST_MAC
        )
        data["dst_broadcast"] = bool(
            dst_ip == ipaddress.ip_address("255.255.255.255") or sent_to_broadcast_mac
        )

    # Extract the port numbers
    def extract_port_numbers(self, packet, data):
        """Add ``src_port``/``dst_port`` to ``data`` for TCP or UDP packets.

        Packets with neither layer leave ``data`` untouched; the CSV writer
        then emits empty cells for the two port columns.
        """
        if packet.haslayer(TCP):
            data["src_port"] = int(packet[TCP].sport)
            data["dst_port"] = int(packet[TCP].dport)
        elif packet.haslayer(UDP):
            data["src_port"] = int(packet[UDP].sport)
            data["dst_port"] = int(packet[UDP].dport)

    # Extract the conversations from the packet
    def conversations_extract(self, packet):
        """Queue one row for ``packet`` and flush when the batch is full.

        Packets without both an IP and an Ethernet layer are ignored.
        """
        # Check if the packet has the IP and Ethernet layers
        if not packet.haslayer(IP) or not packet.haslayer(Ether):
            return

        # Convert the timestamp to a readable UTC time (whole seconds).
        # Named `timestamp` so the `time` module is not shadowed.
        timestamp = datetime.utcfromtimestamp(int(packet.time)).isoformat()
        protocol = get_protocol_name(int(packet[IP].proto))

        # Extract the desired data from the packet
        data = {
            "time": timestamp,
            "src_ip": packet[IP].src,
            "src_mac": packet[Ether].src,
            "dst_ip": packet[IP].dst,
            "dst_mac": packet[Ether].dst,
            "protocol": protocol,
            "payload_size": len(packet[IP].payload)
        }

        # Add the classification flags and, where available, the ports
        self.check_multicast_and_private(packet, data)
        self.extract_port_numbers(packet, data)

        # Add the data to the queue
        self.queue.append(data)

        # If the queue is full, write the data to the CSV file
        if len(self.queue) >= self.batch_size:
            self.write_to_csv()

    def read_pcap_to_dataframe(self):
        """Run the conversion: capture -> CSV -> Parquet.

        Streams the capture packet by packet, writes the rows to the
        intermediate CSV, converts the CSV to a brotli-compressed Parquet file
        (replacing any previous one), deletes the CSV and prints the elapsed
        time.
        """
        # Start time to read pcap time
        start_time = time.time()

        # Read the PCAP file and extract the data from each packet using the
        # conversations_extract function; the reader is closed when done.
        with PcapReader(self.capture_file) as reader:
            for packet in reader:
                self.conversations_extract(packet)
        self.write_to_csv()  # Write any remaining data to the file

        # Remove any previous parquet file
        if os.path.exists(self.parquet_file):
            os.remove(self.parquet_file)

        # Open the CSV file and convert it to a Polars dataframe
        conversations_df = pl.read_csv(self.csv_file_name, ignore_errors=True, parse_dates=True, infer_schema_length=10)
        conversations_df.write_parquet(self.parquet_file, compression='brotli')

        # Remove the CSV file
        if os.path.exists(self.csv_file_name):
            os.remove(self.csv_file_name)

        # File size differences using test 12mb pcap file:
        #   12M  | pcapng
        #   2.6M | csv
        #   65K  | parquet

        # Record time taken to process the PCAP file
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f'Elapsed time: {elapsed_time} seconds')

In [None]:
# Create an instance of the PCAPToDataFrame class
# Create an instance of the PCAPToDataFrame class
# (constructing it deletes any existing file at csv_file_name)
pcap_to_df = PCAPToDataFrame(pcap_name, parquet_file_name, csv_file_name)

# Read the PCAP file and write the Parquet file; the CSV is only an
# intermediate file and is removed again at the end of this call
pcap_to_df.read_pcap_to_dataframe()

In [None]:
# Load the Parquet file produced by the conversion step and preview the first rows
df = pl.read_parquet(parquet_file_name)
df.head(5)

In [None]:
# Reduce the capture to one row per (src_ip, dst_ip) pair of private,
# non-multicast, non-loopback conversations.
filtered_df = (
    df
    # drop traffic addressed to the limited broadcast or the unspecified address
    .filter((pl.col("dst_ip") != "255.255.255.255") & (pl.col("dst_ip") != "0.0.0.0"))
    # drop anything that touches loopback
    .filter((pl.col("localhost") == False))
    # keep private <-> private unicast only
    .filter((pl.col("private_to_private") == True) & (pl.col("multicast") == False))
    # these columns are not needed once the rows are filtered
    .drop(["time", "protocol", "payload_size", "multicast", "private_to_private"])
    # one row per directed address pair
    .unique(subset=["src_ip","dst_ip"])
)
filtered_df.head(25)

In [None]:
# Map every source IP to the MAC address seen with it
# (for an IP that occurs in several rows, the last row wins).
ip_mac_dict = {row.src_ip: row.src_mac for row in filtered_df.iterrows(named=True)}

# Create a graph from the dataframe
G = nx.Graph()

# From the filtered_df dataframe, create an edge with src_ip and dst_ip with src_ip as label.
# Only source nodes get a `nodeid` attribute (their MAC address).
for row in filtered_df.iterrows(named=True):
    G.add_node(row.src_ip, label=row.src_ip, nodeid=ip_mac_dict[row.src_ip])
    G.add_node(row.dst_ip, label=row.dst_ip)
    G.add_edge(row.src_ip, row.dst_ip, label=row.src_ip)

# nx.draw_planar raises an exception when the graph is not planar, which is
# easily the case for a conversation graph with many talkers. Test planarity
# first and fall back to a spring layout (seeded, so re-runs draw the same
# picture) instead of failing the cell.
if nx.check_planarity(G)[0]:
    nx.draw_planar(G, with_labels=True)
else:
    nx.draw(G, pos=nx.spring_layout(G, seed=0), with_labels=True)



In [None]:
# Disabled draft of a Visio export of graph G. The whole cell is one bare
# string literal, so none of the code inside it is executed; it relies on the
# aspose.diagram imports that are commented out in the imports cell.
# NOTE(review): the draft passes node names to add_connector and loads the
# stencil into a second, unused Diagram - presumably unfinished; confirm
# against the aspose.diagram API before enabling.
'''
# Create a new Visio document
visio_doc = Diagram()

# TODO: Add stencil file
# Load stencil file
diagram = Diagram("Basic-Shapes.vss")

# Add shapes to the document
for node in G.nodes():
    shape = visio_doc.add_shape(4.25, 5.5, 2, 1, "Rectangle", 0)
    shape.text.value = node

# Create connections between shapes
for edge in G.edges():
    visio_doc.add_connector(edge[0], edge[1])

# Save the Visio document
visio_doc.save("network_diagram.vsdx")
'''

In [None]:
# Modified from https://raw.githubusercontent.com/chmduquesne/python-drawio/main/drawio/drawio.py

def max_degree(g):
    """
    Returns the maximum number of edges of any node in the graph

    An empty graph (no nodes) has a maximum degree of 0 instead of raising
    ValueError, so a CSV can still be produced for it.
    """
    return max((g.degree[n] for n in g), default=0)



def edge_styles(g):
    """
    Returns the sorted list of distinct edge styles used in the graph

    Edges without a "style" attribute count as style "-".
    """
    distinct = {
        "-" if edge_style is None else edge_style
        for _, _, edge_style in g.edges(data="style")
    }
    return sorted(distinct)



def write_header(g, f):
    """
    Creates a header for the csv file of the graph

    Writes the draw.io "#" configuration lines followed by the CSV column row
    to the file-like object f. The fixed columns are
    nodeid,label,tags,style,width,height,link,mac; they are followed by one
    ref_<i>_<j> column per (edge slot i, edge style j) and one label_<i>
    column per edge slot, where the number of slots is the maximum node degree.

    The "mac" column matches the MAC address field that write_graph emits after
    "link" in every row; without it each row is one cell longer than the
    header and all ref/label cells are shifted by one column.
    """
    n = max_degree(g)
    es = edge_styles(g)

    f.write("# identity: nodeid\n")
    f.write("# label: %label%\n")
    f.write("# style: %style%\n")
    f.write("# link: url\n")
    f.write("# width: @width\n")
    f.write("# height: @height\n")
    f.write("# layout: verticalflow\n")

    refs = [f"ref_{i}_{j}" for j in range(len(es)) for i in range(n)]
    labels = [f"label_{i}" for i in range(n)]

    # Joining one list avoids a dangling comma when there are no ref/label columns.
    f.write("# ignore: " + ",".join(
        ["nodeid", "style", "height", "width"] + refs + labels) + "\n")

    # ref_i_j is connected to nodeid with label i and edge style j
    for j, s in enumerate(es):
        for i in range(n):
            f.write(
                f'# connect: {{"from": "ref_{i}_{j}", "to": "nodeid", '
                f'"fromlabel": "label_{i}", '
                f'"style": "{s}"}}\n')
    f.write(','.join(
        ["nodeid", "label", "tags", "style", "width", "height", "link", "mac"] +
        refs + labels
        ) + "\n" )

def write_graph(g, f, ip_mac_dict):
    """
    Creates the content for the csv file of the graph

    One comma-joined line per node is written to the file-like object f:
    node, label, tags, style, width, height, link, the ip_mac_dict entry for
    the node ("-" when the node is not a key), then the ref cells and the
    label cells. Cells without a value hold "-".
    """
    slots = max_degree(g)
    styles = edge_styles(g)

    for node in g.nodes:
        attrs = g.nodes[node]

        # Cell [style_index * slots + slot] holds the neighbour reached through
        # this node's edge number `slot` drawn in that style.
        ref_cells = ["-"] * (slots * len(styles))
        label_cells = ["-"] * slots
        for slot, (_, neighbour, edge_attrs) in enumerate(g.edges(node, data=True)):
            style_index = styles.index(edge_attrs.get("style", "-"))
            ref_cells[style_index * slots + slot] = str(neighbour)
            label_cells[slot] = edge_attrs.get("label", "-")

        fixed_cells = [
            str(node),
            attrs.get("label", "-"),
            attrs.get("tags", "-"),
            attrs.get("style", "-"),
            attrs.get("width", "auto"),
            attrs.get("height", "auto"),
            attrs.get("link", "-"),
            # value looked up for this node in ip_mac_dict
            ip_mac_dict.get(node, "-"),
        ]

        f.write(",".join(fixed_cells + ref_cells + label_cells) + "\n")

def write(g, f):
    """
    Writes the complete draw.io csv (header, then one row per node) to f

    Both parts go to the same file-like object f, so the output can be sent to
    a file as well as to the console. The node rows use the module-level
    ip_mac_dict, which must be defined before this function is called.
    """
    write_header(g, f)
    write_graph(g, f, ip_mac_dict)

In [None]:
# Draw.io CSV format for input: print the header and node rows for graph G
# NOTE(review): `sys` is not imported by name in the imports cell; presumably
# it comes in through `from scapy.all import *` - confirm.
write(G, sys.stdout)

In [None]:
#TODO: Create a csv file
class parse_http:
    """Collect URLs, referers and user agents from the HTTP packets of a capture.

    The values of URL query parameters are additionally run through a spaCy
    pipeline; every named entity found is reported together with the name of
    the parameter it appeared in.
    """

    def __init__(self, pcap_file):
        """Remember the capture path and load the spaCy model.

        Args:
            pcap_file: Path of the capture file to analyse.
        """
        self.pcap_file = pcap_file
        self.nlp = spacy.load("en_core_web_sm")

    def extract_sensitive_info(self, url):
        """Find named entities in the query-parameter values of ``url``.

        Args:
            url: URL string, e.g. ``http://host/path?user=John%20Smith``.

        Returns:
            A list of ``(parameter_name, entity_text)`` tuples (empty when the
            URL has no query string, cannot be parsed, or no entity is
            found), or ``None`` when ``url`` is empty.
        """
        from urllib.parse import parse_qsl, urlsplit

        if not url:
            return None

        # Split off the query string and decode its key/value pairs (a pattern
        # search from the start of the URL only ever sees scheme/host/path).
        try:
            pairs = parse_qsl(urlsplit(url).query)
        except ValueError:
            # URLs cut out of raw payloads can be malformed
            return []

        sensitive_info = []
        for key, value in pairs:
            for ent in self.nlp(value).ents:
                sensitive_info.append((key, ent.text))
        return sensitive_info

    def extract_info(self):
        """Build a DataFrame with one row per HTTP packet of the capture.

        Returns:
            A polars DataFrame with the columns ``src_ip``, ``dst_ip``,
            ``src_port``, ``dst_port``, ``urls``, ``referers``,
            ``user_agents`` and ``sensitive_info``.
        """
        columns = ['src_ip', 'dst_ip', 'src_port', 'dst_port', 'urls', 'referers', 'user_agents', 'sensitive_info']
        rows = []

        # Stream the capture instead of loading every packet into memory
        with PcapReader(self.pcap_file) as reader:
            for packet in reader:
                # NOTE(review): matching the layer by the name 'HTTP' presumably
                # requires scapy's HTTP layer to be loaded - confirm.
                if 'HTTP' not in packet or not packet.haslayer(IP):
                    continue

                if packet.haslayer(TCP):
                    transport = packet[TCP]
                elif packet.haslayer(UDP):
                    transport = packet[UDP]
                else:
                    # no ports to report for this packet
                    continue

                # The payload is a scapy packet object: serialise it to bytes,
                # then decode leniently because it may not be valid UTF-8.
                payload = bytes(transport.payload).decode("utf-8", errors="replace")

                # Extract URLs, referers, and user agents
                # (header values stop at the line end, excluding the CR of CRLF)
                urls = re.findall(r'(https?:\/\/[^\s]+)', payload)
                referers = re.findall(r'Referer: ([^\r\n]*)', payload)
                user_agents = re.findall(r'User-Agent: ([^\r\n]*)', payload)

                # Extract sensitive information from the first URL, if any
                sensitive_info = self.extract_sensitive_info(urls[0]) if urls else None

                rows.append([packet[IP].src, packet[IP].dst, transport.sport, transport.dport,
                             urls, referers, user_agents, sensitive_info])

        # Build the frame column by column: a list of rows can be mistaken for
        # a list of columns when its length equals the number of columns.
        df = pl.DataFrame({name: [row[i] for row in rows] for i, name in enumerate(columns)})

        return df

In [None]:
# Build the HTTP parser (this loads the spaCy model) and extract one row per
# HTTP packet of the capture into a DataFrame
http_parser = parse_http(pcap_name)
http_df = http_parser.extract_info()

In [None]:
# Preview the first rows of the extracted HTTP data
http_df.head(10)

In [None]:
# Plot the number of packets per protocol (five most frequent protocols)
q = (
    df.lazy()
    .groupby("protocol")
    .agg(pl.count())
    .sort("count", reverse=True)
    .limit(5)
)

df_proto_top = q.collect()
# Plain Python lists are the safest input for matplotlib
x = df_proto_top["protocol"].to_list()
y = df_proto_top["count"].to_list()
plt.bar(x, y)
plt.title('Packets per protocol (top 5)')
plt.xlabel('Protocol')
plt.ylabel('Count')
plt.show()

In [None]:
# Plot the ten (source, destination) pairs that exchanged the most packets
q = (
    df.lazy()
    .groupby(["src_ip","dst_ip"])
    .agg(pl.count())
    .sort("count", reverse=True)
    .limit(10)
)

df_top_talkers = q.collect()
# Label each bar with the whole pair: several pairs can share a source address,
# and bars with the same category label are drawn on top of each other.
x = [
    f"{src} -> {dst}"
    for src, dst in zip(df_top_talkers["src_ip"], df_top_talkers["dst_ip"])
]
y = df_top_talkers["count"].to_list()

plt.bar(x, y)
plt.title('Top talkers (packets per source/destination pair)')
plt.xlabel('Conversation (source -> destination)')
plt.ylabel('Count')
plt.xticks(rotation = 90) # this will rotate the x-axis labels
plt.show()