# Import necessary libraries

In [75]:
import pandas as pd
import numpy as np
import os

# 1. Data Loading

In [76]:
def parse_log_to_dataframe(log_file_path):
    """
    Parses a log file and converts it into a Pandas DataFrame with all data as strings.

    Parameters:
    log_file_path (str): Path to the log file.

    Returns:
    pd.DataFrame: DataFrame containing the parsed data, with all fields as strings.
    """
    data = []
    columns = []

    # Open and read the log file
    with open(log_file_path, 'r') as file:
        for line in file:
            if line.startswith('#fields'):
                columns = line.strip().split('\x09')[1:]
            elif not line.startswith('#'):
                data.append(line.strip().split('\x09'))

    # Creating the DataFrame with all data as strings
    df = pd.DataFrame(data, columns=columns)

    return df

In [77]:
def load_zeek_logs(log_dir):
    """Load all Zeek logs from the directory."""
    logs = {}
    for file in os.listdir(log_dir):
        if file.endswith(".log"):
            log_path = os.path.join(log_dir, file)
            logs[file.split('.')[0]] = parse_log_to_dataframe(log_path)
    return logs

log_dir = '../data/zeek'
logs = load_zeek_logs(log_dir)

# 2. Data Preprocessing

### Handle timestamps

In [78]:
logs['conn']['ts'] = pd.to_datetime(logs['conn']['ts'], unit='s')
logs['dns']['ts'] = pd.to_datetime(logs['dns']['ts'], unit='s')
logs['ssl']['ts'] = pd.to_datetime(logs['ssl']['ts'], unit='s')
logs['files']['ts'] = pd.to_datetime(logs['files']['ts'], unit='s')
logs['http']['ts'] = pd.to_datetime(logs['http']['ts'], unit='s')
logs['x509']['ts'] = pd.to_datetime(logs['x509']['ts'], unit='s')
logs['x509']['certificate.not_valid_before'] = pd.to_datetime(logs['x509']['certificate.not_valid_before'], unit='s')
logs['x509']['certificate.not_valid_after'] = pd.to_datetime(logs['x509']['certificate.not_valid_after'], unit='s')

  logs['conn']['ts'] = pd.to_datetime(logs['conn']['ts'], unit='s')
  logs['dns']['ts'] = pd.to_datetime(logs['dns']['ts'], unit='s')
  logs['ssl']['ts'] = pd.to_datetime(logs['ssl']['ts'], unit='s')
  logs['files']['ts'] = pd.to_datetime(logs['files']['ts'], unit='s')
  logs['http']['ts'] = pd.to_datetime(logs['http']['ts'], unit='s')
  logs['x509']['ts'] = pd.to_datetime(logs['x509']['ts'], unit='s')
  logs['x509']['certificate.not_valid_before'] = pd.to_datetime(logs['x509']['certificate.not_valid_before'], unit='s')
  logs['x509']['certificate.not_valid_after'] = pd.to_datetime(logs['x509']['certificate.not_valid_after'], unit='s')


### Remove redundant columns

In [79]:
def remove_high_missing_and_constant_columns(df, df_name, threshold=90, placeholder='-'):
    """
    Remove columns from the DataFrame where more than the specified percentage of values are missing
    or contain a specified placeholder value, and also remove columns where all values are the same.

    Parameters:
    - df (pd.DataFrame): The DataFrame from which columns will be removed.
    - threshold (float): The percentage threshold to decide which columns to remove (default is 90%).
    - placeholder (str): The placeholder value to be treated as missing (default is '-').

    Returns:
    - pd.DataFrame: The cleaned DataFrame with columns removed.
    """
    # Replace placeholder values with NaN
    df.replace(placeholder, np.nan, inplace=True)
    
    # Calculate percentage of missing values per column
    missing_percentage = df.isna().mean() * 100
    
    # Identify columns where missing percentage is greater than the threshold
    columns_to_drop_due_to_missing = missing_percentage[missing_percentage >= threshold].index
    
    # Identify columns where all values are the same
    columns_to_drop_due_to_constant = df.columns[df.nunique() == 1]
    
    # Combine both criteria
    columns_to_drop = columns_to_drop_due_to_missing.union(columns_to_drop_due_to_constant)
    
    print(f'Removed {len(columns_to_drop)} columns from {df_name} dataframe')
    
    # Drop the identified columns
    cleaned_df = df.drop(columns=columns_to_drop)
    
    return cleaned_df


logs['conn'] = remove_high_missing_and_constant_columns(logs['conn'], 'conn')
logs['dns'] = remove_high_missing_and_constant_columns(logs['dns'], 'dns')
logs['ssl'] = remove_high_missing_and_constant_columns(logs['ssl'], 'ssl')
logs['files'] = remove_high_missing_and_constant_columns(logs['files'], 'files')
logs['http'] = remove_high_missing_and_constant_columns(logs['http'], 'http')
logs['x509'] = remove_high_missing_and_constant_columns(logs['x509'], 'x509')

Removed 4 columns from conn dataframe
Removed 16 columns from dns dataframe
Removed 6 columns from ssl dataframe
Removed 11 columns from files dataframe
Removed 15 columns from http dataframe
Removed 9 columns from x509 dataframe


  df.replace(placeholder, np.nan, inplace=True)


# 3. Data Saving

In [80]:
logs['conn'].to_csv('../data/zeek_preprocessed/conn.csv', index=False)
logs['dns'].to_csv('../data/zeek_preprocessed/dns.csv', index=False)
logs['ssl'].to_csv('../data/zeek_preprocessed/ssl.csv', index=False)
logs['files'].to_csv('../data/zeek_preprocessed/files.csv', index=False)
logs['http'].to_csv('../data/zeek_preprocessed/http.csv', index=False)
logs['x509'].to_csv('../data/zeek_preprocessed/x509.csv', index=False)