In [21]:
import pyshark
import pandas as pd
import threading
import requests

def packet_to_dict(pkt):
    packet_dict = {
        'time': pkt.sniff_time.timestamp(),
        'frame_number': pkt.frame_info.number,
        'protocol': pkt.highest_layer,
        'source': pkt.ip.src if hasattr(pkt, 'ip') else None,
        'destination': pkt.ip.dst if hasattr(pkt, 'ip') else None,
        'length': int(pkt.length),
        'ttl': int(pkt.ip.ttl) if hasattr(pkt, 'ip') else None,
        'source_port': int(pkt[pkt.transport_layer].srcport) if pkt.transport_layer else None,
        'destination_port': int(pkt[pkt.transport_layer].dstport) if pkt.transport_layer else None,
        'tcp_flags': int(pkt.tcp.flags, 16) if hasattr(pkt, 'tcp') else None,
        'dns_query': pkt.dns.qry_name if hasattr(pkt, 'dns') and hasattr(pkt.dns, 'qry_name') else None,
        'dns_response': [resp.addr for resp in pkt.dns.resp_addr.all_fields] if hasattr(pkt, 'dns') and hasattr(pkt.dns, 'resp_addr') else None,
        'http_request_method': pkt.http.request_method if hasattr(pkt, 'http') and hasattr(pkt.http, 'request_method') else None,
        'http_host': pkt.http.host if hasattr(pkt, 'http') and hasattr(pkt.http, 'host') else None,
        'http_user_agent': pkt.http.user_agent if hasattr(pkt, 'http') and hasattr(pkt.http, 'user_agent') else None,
        'ssl_version': pkt.ssl.version if hasattr(pkt, 'ssl') and hasattr(pkt.ssl, 'version') else None,
        'tls_handshake_type': pkt.tls.handshake_type if hasattr(pkt, 'tls') and hasattr(pkt.tls, 'handshake_type') else None,
        # Add more fields as needed
    }
    return packet_dict

def packet_callback(pkt, packets_df):
    if isinstance(packets_df, pd.DataFrame):
        packet_dict = packet_to_dict(pkt)
        return pd.concat([packets_df, pd.DataFrame([packet_dict])], ignore_index=True)
    else:
        print("Error: packets_df is not a DataFrame.")
        return None

import requests

def capture_packets(website):
    packets_df = pd.DataFrame(columns=['time', 'frame_number', 'protocol', 'source', 'destination', 
                                        'length', 'ttl', 'source_port', 'destination_port', 'tcp_flags'])
    # Start capturing packets
    capture = pyshark.LiveCapture(interface='en0', display_filter='tcp or dns or http or ssl or tls')
    capture.sniff(timeout=10)

    # Start a new thread for making the request
    def make_request():
        for _ in range(10):
            try:
                response = requests.get(website)
                print(f"Status code for {website}: {response.status_code}")
            except requests.exceptions.ConnectionError:
                print(f"Error: Failed to establish a new connection to {website}")

    request_thread = threading.Thread(target=make_request)
    request_thread.start()

    # Iterate over captured packets and store them in DataFrame
    for packet in capture.sniff_continuously(packet_count=10):
        packets_df = packet_callback(packet, packets_df)
        if packets_df is None:
            break  # Stop capturing if DataFrame becomes invalid

    # Append DataFrame to CSV
    with open(f'captures/{website.replace("https://", "").replace("/", "_")}.csv', 'a') as f:
        packets_df.to_csv(f, header=f.tell()==0, index=False)

if __name__ == "__main__":
    # List of websites to visit
    websites = ["https://www.washingtonpost.com", "https://www.ndtv.com", "https://www.cnbc.com", 
                "https://www.timesofindia.com", "https://www.express.co.uk", "https://www.cnn.com", 
                "https://www.news18.com", "https://www.nypost.com", "https://www.abc.net.au"]

    # Create a separate thread for packet capture for each website
    for website in websites:
        capture_thread = threading.Thread(target=capture_packets, args=(website,))
        capture_thread.start()
        capture_thread.join()

  return pd.concat([packets_df, pd.DataFrame([packet_dict])], ignore_index=True)


Status code for https://www.washingtonpost.com: 200
Status code for https://www.ndtv.com: 200
Status code for https://www.ndtv.com: 200
Status code for https://www.ndtv.com: 200
Status code for https://www.ndtv.com: 200
Status code for https://www.ndtv.com: 200
Status code for https://www.ndtv.com: 200
Status code for https://www.ndtv.com: 200


  return pd.concat([packets_df, pd.DataFrame([packet_dict])], ignore_index=True)


Status code for https://www.ndtv.com: 200
Status code for https://www.ndtv.com: 200
Status code for https://www.ndtv.com: 200
Status code for https://www.washingtonpost.com: 200
Status code for https://www.washingtonpost.com: 200


  return pd.concat([packets_df, pd.DataFrame([packet_dict])], ignore_index=True)


Status code for https://www.cnbc.com: 200
Status code for https://www.cnbc.com: 200
Status code for https://www.cnbc.com: 200
Status code for https://www.cnbc.com: 200
Status code for https://www.cnbc.com: 200
Status code for https://www.cnbc.com: 200
Status code for https://www.cnbc.com: 200
Status code for https://www.cnbc.com: 200
Status code for https://www.cnbc.com: 200
Status code for https://www.cnbc.com: 200
Status code for https://www.washingtonpost.com: 200
Status code for https://www.timesofindia.com: 200
Status code for https://www.timesofindia.com: 200
Status code for https://www.timesofindia.com: 200
Status code for https://www.timesofindia.com: 200


  return pd.concat([packets_df, pd.DataFrame([packet_dict])], ignore_index=True)


Status code for https://www.timesofindia.com: 200
Status code for https://www.timesofindia.com: 200
Status code for https://www.timesofindia.com: 200
Status code for https://www.timesofindia.com: 200
Status code for https://www.timesofindia.com: 200
Status code for https://www.timesofindia.com: 200
Status code for https://www.washingtonpost.com: 200
Status code for https://www.express.co.uk: 200
Status code for https://www.express.co.uk: 200
Status code for https://www.express.co.uk: 200
Status code for https://www.express.co.uk: 200
Status code for https://www.express.co.uk: 200
Status code for https://www.washingtonpost.com: 200
Status code for https://www.express.co.uk: 200
Status code for https://www.express.co.uk: 200
Status code for https://www.express.co.uk: 200
Status code for https://www.express.co.uk: 200


  return pd.concat([packets_df, pd.DataFrame([packet_dict])], ignore_index=True)


Status code for https://www.express.co.uk: 200
Status code for https://www.washingtonpost.com: 200
Status code for https://www.cnn.com: 200
Status code for https://www.cnn.com: 200
Status code for https://www.cnn.com: 200
Status code for https://www.cnn.com: 200
Status code for https://www.cnn.com: 200
Status code for https://www.cnn.com: 200


  return pd.concat([packets_df, pd.DataFrame([packet_dict])], ignore_index=True)


Status code for https://www.cnn.com: 200
Status code for https://www.cnn.com: 200
Status code for https://www.cnn.com: 200
Status code for https://www.cnn.com: 200
Status code for https://www.washingtonpost.com: 200
Status code for https://www.news18.com: 200
Status code for https://www.news18.com: 200
Status code for https://www.news18.com: 200
Status code for https://www.news18.com: 200
Status code for https://www.news18.com: 200
Status code for https://www.news18.com: 200
Status code for https://www.news18.com: 200
Status code for https://www.news18.com: 200


  return pd.concat([packets_df, pd.DataFrame([packet_dict])], ignore_index=True)


Status code for https://www.news18.com: 200
Status code for https://www.news18.com: 200
Status code for https://www.washingtonpost.com: 200


  return pd.concat([packets_df, pd.DataFrame([packet_dict])], ignore_index=True)


Status code for https://www.washingtonpost.com: 200
Status code for https://www.nypost.com: 200
Status code for https://www.nypost.com: 200
Status code for https://www.nypost.com: 200
Status code for https://www.nypost.com: 200
Status code for https://www.nypost.com: 200
Status code for https://www.nypost.com: 200
Status code for https://www.nypost.com: 200
Status code for https://www.nypost.com: 200
Status code for https://www.nypost.com: 200
Status code for https://www.nypost.com: 200


In [20]:
# Read the CSV file from captures folder and make a single DataFrame
df = pd.concat([pd.read_csv(f'captures/{website.replace("https://", "").replace("/", "_")}.csv') for website in websites], ignore_index=True)
print(df)

            time  frame_number protocol         source    destination  length  \
0   1.711166e+09             1      TCP   20.44.10.122   192.168.1.53      66   
1   1.711166e+09             2      TLS   192.168.1.53  20.189.173.18      90   
2   1.711166e+09             3      TLS   192.168.1.53   17.57.146.22     106   
3   1.711166e+09             4      TLS  20.189.173.18   192.168.1.53     531   
4   1.711166e+09             5      TCP   192.168.1.53  20.189.173.18      78   
5   1.711166e+09             6      TLS  20.189.173.18   192.168.1.53     531   
6   1.711166e+09             7      TLS    3.68.61.181   192.168.1.53    1145   
7   1.711166e+09             8      TCP   192.168.1.53  20.189.173.18      90   
8   1.711166e+09             9      TCP   192.168.1.53  20.189.173.18      78   
9   1.711166e+09            10      TCP   192.168.1.53    3.68.61.181      66   
10  1.711166e+09             1      TCP            NaN            NaN      86   
11  1.711166e+09            