In [4]:
import os
import pandas as pd
from glob import glob
from tqdm import tqdm

# Define paths
folder_path = r"C:\Users\karun\OneDrive\Documents\RIK\data\TWOS-dataset\network_v_2"
output_path = r"C:\Users\karun\OneDrive\Documents\RIK\outputs\twos_network_summary.csv"

# The "Time" column is treated as an offset in seconds; every session is
# assumed to start on this date.
SESSION_START = pd.Timestamp("2017-03-20")

# Column names assigned to each log file (the file's own header row is skipped)
# and the subset that is actually loaded.
RAW_COLUMNS = ["No", "Time", "Source", "Destination", "Protocol", "Length", "Info"]
USED_COLUMNS = ["Time", "Source", "Destination", "Protocol"]

# Column order of the per-host, per-day summary.
SUMMARY_COLUMNS = [
    "host",
    "date_only",
    "http_requests_per_day",
    "dns_requests_per_day",
    "tls_connections_per_day",
    "tcp_connections_per_day",
    "total_packets_per_day",
    "unique_destinations_per_day",
]


def summarize_network_log(df, session_start=SESSION_START):
    """Aggregate one packet log into per-host, per-day protocol counts.

    Args:
        df: DataFrame with "Time", "Source", "Destination" and "Protocol"
            columns. "Time" is read as an offset in seconds from
            ``session_start``.
        session_start: Timestamp the "Time" offsets are added to.

    Returns:
        A DataFrame with the columns listed in ``SUMMARY_COLUMNS``, one row per
        (Source, calendar date) pair, sorted by host and then date. Rows whose
        Source, Protocol or Time is missing/unparseable are ignored. The input
        frame is not modified.
    """
    if df.empty:
        return pd.DataFrame(columns=SUMMARY_COLUMNS)

    # Coerce Time to numbers first so the offset is always interpreted as
    # seconds, even if a malformed row caused the column to be read as strings.
    seconds = pd.to_numeric(df["Time"], errors="coerce")
    timestamps = session_start + pd.to_timedelta(seconds, unit="s", errors="coerce")

    packets = pd.DataFrame({
        "host": df["Source"],
        "date_only": timestamps.dt.date,
        "Destination": df["Destination"],
        "Protocol": df["Protocol"],
    })
    packets = packets.dropna(subset=["host", "date_only", "Protocol"])
    if packets.empty:
        return pd.DataFrame(columns=SUMMARY_COLUMNS)

    # Upper-case once for the whole file; astype(str) keeps the .str accessor
    # usable even when the column was not parsed as text.
    protocol = packets["Protocol"].astype(str).str.upper()
    packets = packets.assign(
        is_http=protocol.eq("HTTP").astype("int64"),
        is_dns=protocol.eq("DNS").astype("int64"),
        is_tls=protocol.str.startswith("TLS").astype("int64"),
        is_tcp=protocol.eq("TCP").astype("int64"),
    )

    summary = packets.groupby(["host", "date_only"], sort=True).agg(
        http_requests_per_day=("is_http", "sum"),
        dns_requests_per_day=("is_dns", "sum"),
        tls_connections_per_day=("is_tls", "sum"),
        tcp_connections_per_day=("is_tcp", "sum"),
        total_packets_per_day=("is_http", "size"),
        # nunique ignores missing destinations.
        unique_destinations_per_day=("Destination", "nunique"),
    )
    return summary.reset_index()


if __name__ == "__main__":
    # Collect results (one dict per host/date pair, per file).
    # NOTE: rows coming from different files are appended as-is; a host/date
    # pair that occurs in several files yields several rows.
    all_data = []

    # Get all .csv files in the directory, sorted so output order is reproducible
    file_paths = sorted(glob(os.path.join(folder_path, "*.csv")))

    print(f"Processing {len(file_paths)} network files...")

    for file in tqdm(file_paths, desc="Processing network logs"):
        # Best-effort per file: a file that cannot be read or summarised is
        # reported and skipped so the remaining files are still processed.
        try:
            # Read skipping the first row which is header
            df = pd.read_csv(file, skiprows=1, names=RAW_COLUMNS,
                             usecols=USED_COLUMNS, engine="python", on_bad_lines="skip")
            all_data.extend(summarize_network_log(df).to_dict("records"))
        except Exception as e:
            print(f"Error processing {file}: {e}")

    # Convert to DataFrame
    summary_df = pd.DataFrame(all_data)

    # Save output
    if not summary_df.empty:
        # Create the output folder if needed so the write cannot fail on a
        # missing directory after all files have been processed.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        summary_df.to_csv(output_path, index=False)
        print(f"\n Network summary saved to:\n{output_path}")
        print(f"Rows: {len(summary_df)}, Columns: {len(summary_df.columns)}")
    else:
        print("No data processed. Please check file content.")


Processing 111 network files...


Processing network logs: 100%|██████████| 111/111 [02:01<00:00,  1.09s/it]


 Network summary saved to:
C:\Users\karun\OneDrive\Documents\RIK\outputs\twos_network_summary.csv
Rows: 16481, Columns: 8



