In [1]:
import os
import gzip
import pandas as pd
import re
import json


def parse_zeek_log(file_path):
    """
    Parses a Zeek log file into a pandas DataFrame along with metadata.
    """
    with gzip.open(file_path, 'rt') as f:
        lines = f.readlines()
    
    # Extract metadata
    metadata = {}
    for line in lines:
        if line.startswith("#"):
            parts = line[1:].split("\t", 1)
            if len(parts) == 2:
                metadata[parts[0].strip()] = parts[1].strip()
    
    # Extract headers
    headers_line = next(line for line in lines if line.startswith("#fields"))
    headers = headers_line.split("\t")[1:]  # Extract headers after #fields
    headers = [header.strip() for header in headers]

    # Extract data rows
    data_lines = [line.strip().split("\t") for line in lines if not line.startswith("#")]
    
    # Create DataFrame
    df = pd.DataFrame(data_lines, columns=headers)

    # Replace Zeek placeholders for missing data
    df.replace({'(empty)': None, '-': None}, inplace=True)

    # Convert specific types based on metadata if available
    if "#types" in metadata:
        types = metadata["#types"].split("\t")
        for col, dtype in zip(headers, types):
            if dtype == "time":
                df[col] = pd.to_datetime(df[col], unit="s")
            elif dtype in {"interval", "count"}:
                df[col] = pd.to_numeric(df[col], errors="coerce")
            elif dtype == "bool":
                df[col] = df[col] == "T"

    return df, metadata

def save_to_parquet_and_json(df, metadata, output_dir, base_name):
    """
    Saves the DataFrame to Parquet and metadata to JSON.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Save DataFrame as Parquet
    parquet_file = os.path.join(output_dir, f"{base_name}.parquet")
    df.to_parquet(parquet_file, index=False)
    print(f"Data saved to {parquet_file}")

    # Save metadata as JSON
    metadata_file = os.path.join(output_dir, f"{base_name}_metadata.json")
    with open(metadata_file, "w") as f:
        json.dump(metadata, f, indent=4)
    print(f"Metadata saved to {metadata_file}")

# Example usage
file_path = "../zeek/logs/2024-12-16/dns.10:00:00-11:00:00.log.gz"
# Parse the log file
df, metadata = parse_zeek_log(file_path)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2625 entries, 0 to 2624
Data columns (total 24 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ts           2625 non-null   object
 1   uid          2625 non-null   object
 2   id.orig_h    2625 non-null   object
 3   id.orig_p    2625 non-null   object
 4   id.resp_h    2625 non-null   object
 5   id.resp_p    2625 non-null   object
 6   proto        2625 non-null   object
 7   trans_id     2625 non-null   object
 8   rtt          572 non-null    object
 9   query        2616 non-null   object
 10  qclass       2331 non-null   object
 11  qclass_name  2331 non-null   object
 12  qtype        2331 non-null   object
 13  qtype_name   2331 non-null   object
 14  rcode        1403 non-null   object
 15  rcode_name   1403 non-null   object
 16  AA           2625 non-null   object
 17  TC           2625 non-null   object
 18  RD           2625 non-null   object
 19  RA           2625 non-null 

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,proto,trans_id,rtt,query,...,rcode,rcode_name,AA,TC,RD,RA,Z,answers,TTLs,rejected
0,1734343200.200302,CuUJzJ4Juyy6xIhHh,192.168.127.114,54767,192.168.127.134,53,udp,28319,,114.127.168.192.in-addr.arpa,...,3,NXDOMAIN,F,F,T,F,0,,,F
1,1734343200.261168,C948dq1BkMtjMaxDE5,192.168.127.114,38190,192.168.127.134,53,udp,57410,,134.127.168.192.in-addr.arpa,...,3,NXDOMAIN,F,F,T,F,0,,,T
2,1734343200.273683,CkH0NS2H7cWj2KiyA5,192.168.127.114,59015,192.168.127.134,53,udp,22173,,6.3.7.7.5.2.c.a.b.c.9.c.f.f.0.e.0.0.0.0.0.0.0....,...,3,NXDOMAIN,F,F,T,F,0,,,T
3,1734343200.310138,Cpdyc42kZCsjBhx1zg,192.168.127.114,51824,192.168.127.134,53,udp,3592,,f.2.3.0.c.4.2.2.c.1.7.f.a.c.0.0.0.0.0.0.0.0.0....,...,3,NXDOMAIN,F,F,T,F,0,,,T
4,1734343200.33011,Cerh184AdqeXJDTiDf,192.168.127.114,40507,192.168.127.134,53,udp,4507,0.038116,10.200.58.216.in-addr.arpa,...,0,NOERROR,F,F,T,T,0,"hkg12s11-in-f10.1e100.net,kul09s16-in-f10.1e10...","180.000000,180.000000",F
