In [2]:
import os
import gzip
import pandas as pd
import re
import json
import numpy as np


def parse_zeek_log(file_path):
    """
    Parses a Zeek log file into a pandas DataFrame along with metadata.
    """
    with gzip.open(file_path, 'rt') as f:
        lines = f.readlines()
    
    # Extract metadata
    metadata = {}
    for line in lines:
        if line.startswith("#"):
            parts = line[1:].split("\t", 1)
            if len(parts) == 2:
                metadata[parts[0].strip()] = parts[1].strip()
    
    # Extract headers
    headers_line = next(line for line in lines if line.startswith("#fields"))
    headers = headers_line.split("\t")[1:]  # Extract headers after #fields
    headers = [header.strip() for header in headers]

    # Extract data rows
    data_lines = [line.strip().split("\t") for line in lines if not line.startswith("#")]
    
    # Create DataFrame
    df = pd.DataFrame(data_lines, columns=headers)

    # Replace Zeek placeholders for missing data
    df.replace({'(empty)': None, '-': None}, inplace=True)

    # Convert specific types based on metadata if available
    if "#types" in metadata:
        types = metadata["#types"].split("\t")
        for col, dtype in zip(headers, types):
            if dtype == "time":
                df[col] = pd.to_datetime(df[col], unit="s")
            elif dtype in {"interval", "count"}:
                df[col] = pd.to_numeric(df[col], errors="coerce")
            elif dtype == "bool":
                df[col] = df[col] == "T"

    return df, metadata

def save_to_parquet_and_json(df, metadata, output_dir, base_name):
    """
    Saves the DataFrame to Parquet and metadata to JSON.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Save DataFrame as Parquet
    parquet_file = os.path.join(output_dir, f"{base_name}.parquet")
    df.to_parquet(parquet_file, index=False)
    print(f"Data saved to {parquet_file}")

    # Save metadata as JSON
    metadata_file = os.path.join(output_dir, f"{base_name}_metadata.json")
    with open(metadata_file, "w") as f:
        json.dump(metadata, f, indent=4)
    print(f"Metadata saved to {metadata_file}")


In [3]:
file_path = "../zeek/logs/2024-12-16/conn.10:00:00-11:00:00.log.gz"
# Parse the log file
df, metadata = parse_zeek_log(file_path)

# Assume df is your DataFrame
# Replace '(empty)' and '-' with NaN for easier datatype conversion
df.replace({'(empty)': np.nan, '-': np.nan}, inplace=True)

# Convert columns to appropriate datatypes
df['ts'] = pd.to_datetime(df['ts'], unit='s')  # Convert timestamp to datetime

df['duration'] = df['id.orig_p'].astype(np.float64) 
df['orig_bytes'] = pd.to_numeric(df['orig_bytes'], errors='coerce')  # Convert origin bytes to float
df['resp_bytes'] = pd.to_numeric(df['resp_bytes'], errors='coerce')  # Convert responder bytes to float
df['orig_pkts'] = pd.to_numeric(df['orig_pkts'], errors='coerce')  # Convert origin packets to integer
df['orig_ip_bytes'] = pd.to_numeric(df['orig_ip_bytes'], errors='coerce')  # Convert origin IP bytes to integer
df['resp_pkts'] = pd.to_numeric(df['resp_pkts'], errors='coerce')  # Convert responder packets to integer
df['resp_ip_bytes'] = pd.to_numeric(df['resp_ip_bytes'], errors='coerce')  # Convert responder IP bytes to integer

# For categorical or boolean columns
df['id.orig_p'] = df['id.orig_p'].astype('category')  # Convert protocol to category
df['id.resp_p'] = df['id.resp_p'].astype('category')  # Convert protocol to category
df['proto'] = df['proto'].astype('category')  # Convert protocol to category
df['service'] = df['service'].astype('category')  # Convert service to category
df['conn_state'] = df['conn_state'].astype('category')  # Convert connection state to category
df['local_orig'] = df['local_orig'].map({'T': True, 'F': False}).astype('boolean')  # Convert to boolean
df['local_resp'] = df['local_resp'].map({'T': True, 'F': False}).astype('boolean')  # Convert to boolean

# Drop columns with no data (optional)
df.drop(columns=['tunnel_parents'], inplace=True)  # This column is entirely null


  df['ts'] = pd.to_datetime(df['ts'], unit='s')  # Convert timestamp to datetime


In [4]:
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2796 entries, 0 to 2795
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   ts             2796 non-null   datetime64[ns]
 1   uid            2796 non-null   object        
 2   id.orig_h      2796 non-null   object        
 3   id.orig_p      2796 non-null   category      
 4   id.resp_h      2796 non-null   object        
 5   id.resp_p      2796 non-null   category      
 6   proto          2796 non-null   category      
 7   service        1956 non-null   category      
 8   duration       2796 non-null   float64       
 9   orig_bytes     1998 non-null   float64       
 10  resp_bytes     1998 non-null   float64       
 11  conn_state     2796 non-null   category      
 12  local_orig     2796 non-null   boolean       
 13  local_resp     2796 non-null   boolean       
 14  missed_bytes   2796 non-null   object        
 15  history        2785 n

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,service,duration,orig_bytes,resp_bytes,conn_state,local_orig,local_resp,missed_bytes,history,orig_pkts,orig_ip_bytes,resp_pkts,resp_ip_bytes
0,2024-12-16 09:59:54.975208998,CgflZ13ybYBXTEgAvi,192.168.127.114,56266,192.168.127.134,53,udp,dns,56266.0,50.0,176.0,SF,True,True,0,Dd,1,78,1,204
1,2024-12-16 09:59:54.975614071,CyU9mpNNl606E5uJi,192.168.127.114,34226,192.168.127.134,53,udp,dns,34226.0,50.0,234.0,SF,True,True,0,Dd,1,78,1,262
2,2024-12-16 09:59:55.037101984,C9yxrt2diu1QPx2Xwa,192.168.127.114,50785,192.168.127.134,53,udp,dns,50785.0,60.0,137.0,SF,True,True,0,Dd,1,88,1,165
3,2024-12-16 09:59:05.721227884,CX98VA2t5shazpJxr,192.168.127.114,55102,172.217.25.202,443,udp,quic,55102.0,7340.0,6784.0,SF,True,False,0,Dd,13,7704,16,7232
4,2024-12-16 10:00:00.200301886,CuUJzJ4Juyy6xIhHh,192.168.127.114,54767,192.168.127.134,53,udp,dns,54767.0,46.0,101.0,SF,True,True,0,Dd,1,74,1,129


In [6]:
import dtale
dtale.show(df)






Setting an item of incompatible dtype is deprecated and will raise in a future error of pandas. Value '[1734317994975 1734317994975 1734317995037 ... 1734321537678 1734321538621
 1734321537781]' has dtype incompatible with datetime64[ns], please explicitly cast to a compatible dtype first.

