In [1]:
import os
import gzip
import pandas as pd
import re
import json


def parse_zeek_log(file_path):
    """
    Parses a Zeek log file into a pandas DataFrame along with metadata.
    """
    with gzip.open(file_path, 'rt') as f:
        lines = f.readlines()
    
    # Extract metadata
    metadata = {}
    for line in lines:
        if line.startswith("#"):
            parts = line[1:].split("\t", 1)
            if len(parts) == 2:
                metadata[parts[0].strip()] = parts[1].strip()
    
    # Extract headers
    headers_line = next(line for line in lines if line.startswith("#fields"))
    headers = headers_line.split("\t")[1:]  # Extract headers after #fields
    headers = [header.strip() for header in headers]

    # Extract data rows
    data_lines = [line.strip().split("\t") for line in lines if not line.startswith("#")]
    
    # Create DataFrame
    df = pd.DataFrame(data_lines, columns=headers)

    # Replace Zeek placeholders for missing data
    df.replace({'(empty)': None, '-': None}, inplace=True)

    # Convert specific types based on metadata if available
    if "#types" in metadata:
        types = metadata["#types"].split("\t")
        for col, dtype in zip(headers, types):
            if dtype == "time":
                df[col] = pd.to_datetime(df[col], unit="s")
            elif dtype in {"interval", "count"}:
                df[col] = pd.to_numeric(df[col], errors="coerce")
            elif dtype == "bool":
                df[col] = df[col] == "T"

    return df, metadata

def save_to_parquet_and_json(df, metadata, output_dir, base_name):
    """
    Saves the DataFrame to Parquet and metadata to JSON.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Save DataFrame as Parquet
    parquet_file = os.path.join(output_dir, f"{base_name}.parquet")
    df.to_parquet(parquet_file, index=False)
    print(f"Data saved to {parquet_file}")

    # Save metadata as JSON
    metadata_file = os.path.join(output_dir, f"{base_name}_metadata.json")
    with open(metadata_file, "w") as f:
        json.dump(metadata, f, indent=4)
    print(f"Metadata saved to {metadata_file}")

# Example usage
file_path = "../zeek/logs/2024-12-16/ssl.10:00:00-11:00:00.log.gz"
# Parse the log file
df, metadata = parse_zeek_log(file_path)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 527 entries, 0 to 526
Data columns (total 19 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   ts                     527 non-null    object
 1   uid                    527 non-null    object
 2   id.orig_h              527 non-null    object
 3   id.orig_p              527 non-null    object
 4   id.resp_h              527 non-null    object
 5   id.resp_p              527 non-null    object
 6   version                462 non-null    object
 7   cipher                 462 non-null    object
 8   curve                  462 non-null    object
 9   server_name            96 non-null     object
 10  resumed                527 non-null    object
 11  last_alert             2 non-null      object
 12  next_protocol          2 non-null      object
 13  established            527 non-null    object
 14  ssl_history            514 non-null    object
 15  cert_chain_fps         

Unnamed: 0,ts,uid,id.orig_h,id.orig_p,id.resp_h,id.resp_p,version,cipher,curve,server_name,resumed,last_alert,next_protocol,established,ssl_history,cert_chain_fps,client_cert_chain_fps,sni_matches_cert,validation_status
0,1734343145.721228,CX98VA2t5shazpJxr,192.168.127.114,55102,172.217.25.202,443,TLSv13,TLS_AES_128_GCM_SHA256,X25519Kyber768Draft00,,F,,,F,s,,,,
1,1734343214.684776,CejX7e4ESZEKkLehpj,192.168.127.114,34966,13.107.246.59,443,TLSv13,TLS_AES_256_GCM_SHA384,secp256r1,main.vscode-cdn.net,T,,,T,CjiICs,,,,
2,1734343089.677217,COOFZq4QGp14bCtj4e,192.168.127.114,53827,142.251.222.227,443,TLSv13,TLS_AES_128_GCM_SHA256,X25519Kyber768Draft00,,F,,,F,s,,,,
3,1734343167.205929,CADPbEO5eUr8KJErj,192.168.127.114,51314,142.251.43.195,443,TLSv13,TLS_AES_128_GCM_SHA256,X25519Kyber768Draft00,,F,,,F,s,,,,
4,1734343168.680245,Cd7ZOb4Lnkga9qEhK2,192.168.127.114,35937,142.251.222.227,443,TLSv13,TLS_AES_128_GCM_SHA256,X25519Kyber768Draft00,,F,,,F,s,,,,
