# In [None]:
import os
import random
import matplotlib.pyplot as plt
from multiprocessing import Pool
from scapy.all import rdpcap

def analyze_limited_inter_packet_timing(pcap_file, max_packets=10000):
    """Return the gaps between consecutive packet timestamps in a capture file.

    Args:
        pcap_file: Path of the capture file handed to scapy's ``rdpcap``.
        max_packets: Upper bound on how many packets, counted from the start
            of the file, are considered. ``None`` means no limit.

    Returns:
        A list with one entry per consecutive packet pair
        (``later.time - earlier.time``), in file order. The list is empty
        when fewer than two packets are read, or when reading/processing
        fails; failures are reported on stdout instead of being raised so a
        single bad capture does not abort a multi-file run.
    """
    try:
        if max_packets is not None and max_packets >= 0:
            # Ask the reader to stop after max_packets rather than parsing
            # the whole capture and discarding everything past the limit.
            packets = rdpcap(pcap_file, count=max_packets)
        else:
            # None / negative limits keep the plain slice semantics.
            packets = rdpcap(pcap_file)[:max_packets]
        timestamps = [pkt.time for pkt in packets]
        # Pair every timestamp with its successor; zip stops at the shorter
        # sequence, so 0 or 1 timestamps yield an empty result.
        inter_arrival_times = [t2 - t1 for t1, t2 in zip(timestamps, timestamps[1:])]
        return inter_arrival_times
    except Exception as e:
        # Deliberate best-effort: log and return nothing for this file.
        print(f"Error processing {pcap_file}: {e}")
        return []

def combine_limited_inter_arrival_times(pcap_dir, max_packets=10000, max_files=None):
    """Collect inter-arrival times from the ``.pcapng`` files in a directory.

    Files are processed one after another in the current process.

    Args:
        pcap_dir: Directory whose entries ending in ``.pcapng`` are analyzed.
        max_packets: Per-file packet limit forwarded to
            ``analyze_limited_inter_packet_timing``.
        max_files: When given and smaller than the number of matching files,
            a random sample of that many files is used instead of all of them.

    Returns:
        A single flat list holding the inter-arrival times of every processed
        file, concatenated in processing order.
    """
    selected = list(filter(lambda name: name.endswith('.pcapng'), os.listdir(pcap_dir)))

    # Down-sample only when a cap was requested and it is actually exceeded.
    cap_exceeded = max_files is not None and max_files < len(selected)
    if cap_exceeded:
        selected = random.sample(selected, max_files)

    combined = []
    for name in selected:
        full_path = os.path.join(pcap_dir, name)
        print(f"Processing {full_path}...")
        combined += analyze_limited_inter_packet_timing(full_path, max_packets=max_packets)
    return combined

def process_file_parallel(args):
    """Adapter that lets ``Pool.map`` call the per-file analysis.

    Args:
        args: A two-item sequence ``(file_path, max_packets)``.

    Returns:
        Whatever ``analyze_limited_inter_packet_timing`` returns for that file.
    """
    # Pool.map passes a single argument per task, so both values travel
    # together and are unpacked here.
    path, packet_limit = args
    return analyze_limited_inter_packet_timing(path, max_packets=packet_limit)

def combine_parallel(pcap_dir, max_packets=10000, max_files=None, num_workers=4):
    """Collect inter-arrival times from a directory using a process pool.

    Args:
        pcap_dir: Directory whose entries ending in ``.pcapng`` are analyzed.
        max_packets: Per-file packet limit forwarded to each worker task.
        max_files: When given and smaller than the number of matching files,
            a random sample of that many files is used instead of all of them.
        num_workers: Upper bound on the number of worker processes. ``None``
            is passed through to ``Pool`` unchanged.

    Returns:
        A single flat list holding the inter-arrival times of every processed
        file. Per-file results appear in the order the files were submitted.
        An empty list is returned when no file is selected.
    """
    pcap_files = [f for f in os.listdir(pcap_dir) if f.endswith('.pcapng')]

    # Balance the number of files if max_files is specified
    if max_files is not None and len(pcap_files) > max_files:
        pcap_files = random.sample(pcap_files, max_files)

    if not pcap_files:
        # Nothing to process: do not start worker processes for an empty job.
        return []

    tasks = [(os.path.join(pcap_dir, f), max_packets) for f in pcap_files]

    # Workers beyond the number of tasks would only sit idle.
    if num_workers is not None:
        num_workers = min(num_workers, len(tasks))

    with Pool(processes=num_workers) as pool:
        results = pool.map(process_file_parallel, tasks)

    # Flatten the per-file lists into one list.
    all_inter_arrival_times = [item for sublist in results for item in sublist]
    return all_inter_arrival_times

# Main function
def main():
    """Plot inter-packet timing of streaming vs. file-transfer captures.

    Reads ``.pcapng`` captures from two hard-coded directories, caps the
    number of files taken from each at the streaming directory's file count,
    and shows overlaid density histograms of the inter-arrival times.
    """
    streaming_dir = '/tank/swlarsen/2024_11_07/fcc-ht2'
    transfer_dir = '/tank/swlarsen/2024_10_18/fcc-ht2/run1'
    max_packets = 10000

    # The streaming capture count is used as the file cap for both datasets.
    max_files = len([name for name in os.listdir(streaming_dir) if name.endswith('.pcapng')])

    # Streaming first, then file transfer.
    streaming_inter_times = combine_parallel(
        streaming_dir, max_packets=max_packets, max_files=max_files, num_workers=4
    )
    transfer_inter_times = combine_parallel(
        transfer_dir, max_packets=max_packets, max_files=max_files, num_workers=4
    )

    # Draw both distributions on one figure with identical histogram settings.
    plt.figure(figsize=(10, 6))
    series = (
        (streaming_inter_times, "Streaming", 'blue'),
        (transfer_inter_times, "File Transfer", 'orange'),
    )
    for values, label, color in series:
        plt.hist(values, bins=50, alpha=0.7, label=label, color=color, density=True)
    plt.xlabel("Inter-Packet Time (seconds)")
    plt.ylabel("Density")
    plt.title("Normalized Inter-Arrival Times for Streaming vs File Transfer")
    plt.legend()
    plt.show()

# Run the analysis only when this file is executed as a script. Keeping
# main() behind this guard means importing the module (for example from a
# multiprocessing worker that re-imports the main module) does not start
# another run.
if __name__ == "__main__":
    main()
