# Analysis of syslog-format files

In [5]:
import random
from datetime import datetime, timedelta

def generate_fake_syslog_entry():
    """Build one randomly generated log line in the classic syslog layout.

    The timestamp is the current local time moved back by 1-30 days and
    1-24 hours, the host is one of ``host_1`` .. ``host_10`` and the event
    number is drawn from 1-100.

    Returns:
        A newline-terminated string shaped like
        ``"<Mon> <dd> <HH:MM:SS> <hostname> <message>"``.
    """
    now = datetime.now()
    days_back = random.randint(1, 30)
    hours_back = random.randint(1, 24)
    host = f'host_{random.randint(1, 10)}'
    event_id = random.randint(1, 100)

    moment = now - timedelta(days=days_back, hours=hours_back)
    stamp = moment.strftime("%b %d %H:%M:%S")
    return f'{stamp} {host} Message from {host}: Event {event_id} occurred.\n'

def generate_fake_syslog_file(file_path, num_entries=10000):
    """Write ``num_entries`` generated syslog lines to ``file_path``.

    The file is opened in write mode, so any existing content is replaced.

    Args:
        file_path: Destination path of the text file.
        num_entries: Number of generated lines to write (default 10000).
    """
    with open(file_path, 'w') as out:
        # The generator produces each line lazily, one per requested entry.
        out.writelines(generate_fake_syslog_entry() for _ in range(num_entries))

# Usage example: write the default number of generated entries (10000) to a
# sample log file.
file_path = './data/fake_syslog3.txt'
generate_fake_syslog_file(file_path)


### For huge syslog files

In [None]:
import re
from datetime import datetime

def parse_syslog_entry(entry, year=None):
    """Parse one ``"Mon dd HH:MM:SS host message"`` syslog line.

    Args:
        entry: A single line of log text; a trailing newline is allowed.
        year: Four-digit year (int) to attach to the timestamp, because the
            line itself carries none. When omitted, 1900 is used, which is
            the year ``strptime`` fills in for a year-less format. Note that
            1900 is not a leap year, so "Feb 29" lines are only parseable
            when a leap year is passed.

    Returns:
        A dict with ``'timestamp'`` (``datetime``), ``'hostname'`` and
        ``'message'`` keys, or ``None`` when the line does not have the
        expected layout or its timestamp is not a real date/time.
    """
    # Regular expression for a syslog entry. Classic syslog output pads days
    # below 10 with a space ("Oct  1"), so one or two spaces are accepted
    # between the month and the day.
    syslog_regex = (
        r'^(?P<timestamp>\w{3} {1,2}\d{1,2} \d{2}:\d{2}:\d{2}) '
        r'(?P<hostname>\S+) (?P<message>.+)$'
    )

    match = re.match(syslog_regex, entry)
    if match is None:
        return None

    timestamp_str, hostname, message = match.groups()

    # Convert the syslog timestamp into a Python datetime object. The year is
    # always passed explicitly so the result does not depend on strptime's
    # implicit default for year-less formats.
    parse_year = 1900 if year is None else year
    dated_str = f'{parse_year:04d} {timestamp_str}'
    try:
        timestamp = datetime.strptime(dated_str, '%Y %b %d %H:%M:%S')
    except ValueError:
        # The regex only checks the shape; values such as "Foo 99 99:99:99"
        # are treated like any other unparseable line instead of aborting
        # the caller's loop over the file.
        return None

    return {
        'timestamp': timestamp,
        'hostname': hostname,
        'message': message
    }

def analyze_syslog_file(file_path):
    """Stream a syslog file and print every line that can be parsed.

    The file is read line by line; lines for which ``parse_syslog_entry``
    returns a falsy result are skipped.

    Args:
        file_path: Path of the log file to read.
    """
    with open(file_path, 'r') as log:
        for raw_line in log:
            parsed = parse_syslog_entry(raw_line)
            if not parsed:
                continue
            stamp = parsed["timestamp"]
            host = parsed["hostname"]
            text = parsed["message"]
            print(f'Timestamp: {stamp}, Hostname: {host}, Message: {text}')

# Example: parse a log file and print every recognised entry.
# NOTE(review): this reads 'fake_syslog.txt', while the generator example
# earlier in this file writes 'fake_syslog3.txt' — confirm the intended input.
file_path = './data/fake_syslog.txt'
analyze_syslog_file(file_path)


In [11]:
import pandas as pd
import re
from datetime import datetime
import glob

def parse_syslog_entry(entry, year=None):
    """Parse one ``"Mon dd HH:MM:SS host message"`` syslog line.

    Args:
        entry: A single line of log text; a trailing newline is allowed.
        year: Four-digit year (int) to attach to the timestamp, because the
            line itself carries none. When omitted, 1900 is used, which is
            the year ``strptime`` fills in for a year-less format. Note that
            1900 is not a leap year, so "Feb 29" lines are only parseable
            when a leap year is passed.

    Returns:
        A dict with ``'timestamp'`` (``datetime``), ``'hostname'`` and
        ``'message'`` keys, or ``None`` when the line does not have the
        expected layout or its timestamp is not a real date/time.
    """
    # Classic syslog output pads days below 10 with a space ("Oct  1"), so
    # one or two spaces are accepted between the month and the day.
    syslog_regex = (
        r'^(?P<timestamp>\w{3} {1,2}\d{1,2} \d{2}:\d{2}:\d{2}) '
        r'(?P<hostname>\S+) (?P<message>.+)$'
    )
    match = re.match(syslog_regex, entry)
    if match is None:
        return None

    timestamp_str, hostname, message = match.groups()

    # The year is always passed explicitly so the result does not depend on
    # strptime's implicit default for year-less formats.
    parse_year = 1900 if year is None else year
    dated_str = f'{parse_year:04d} {timestamp_str}'
    try:
        timestamp = datetime.strptime(dated_str, '%Y %b %d %H:%M:%S')
    except ValueError:
        # The regex only checks the shape; values such as "Foo 99 99:99:99"
        # are treated like any other unparseable line instead of aborting
        # the caller's loop over the file.
        return None

    return {
        'timestamp': timestamp,
        'hostname': hostname,
        'message': message
    }

def process_syslog_file(file_path):
    """Load the parseable entries of one syslog file into a DataFrame.

    Lines for which ``parse_syslog_entry`` returns a falsy result are
    skipped.

    Args:
        file_path: Path of the log file to read.

    Returns:
        A ``pandas.DataFrame`` with ``timestamp``, ``hostname`` and
        ``message`` columns, one row per parsed line, in file order.
    """
    parsed_rows = []
    with open(file_path, 'r') as log:
        for raw_line in log:
            parsed = parse_syslog_entry(raw_line)
            if not parsed:
                continue
            parsed_rows.append(parsed)

    # Pivot the list of per-line dicts into one list per column.
    column_names = ('timestamp', 'hostname', 'message')
    columns = {name: [row[name] for row in parsed_rows] for name in column_names}
    return pd.DataFrame(columns)

def merge_syslog_files(file_paths):
    """Parse several syslog files and stack them into a single DataFrame.

    Args:
        file_paths: Iterable of log file paths. It may be empty, for example
            when a glob pattern matched no files.

    Returns:
        A ``pandas.DataFrame`` with ``timestamp``, ``hostname`` and
        ``message`` columns and a fresh 0..n-1 index. When ``file_paths`` is
        empty, an empty DataFrame with those columns is returned.
    """
    frames = [process_syslog_file(path) for path in file_paths]
    if not frames:
        # pd.concat raises ValueError when given nothing to concatenate, so
        # return an empty frame with the same columns instead.
        return pd.DataFrame({'timestamp': [], 'hostname': [], 'message': []})
    return pd.concat(frames, ignore_index=True)

# Example: merge every .txt log under ./data/syslog into one DataFrame and
# preview its first rows.
file_paths = glob.glob('./data/syslog/*.txt')
df = merge_syslog_files(file_paths)
df.head()

Unnamed: 0,timestamp,hostname,message
0,1900-10-11 17:38:56,host_6,Message from host_6: Event 91 occurred.
1,1900-10-13 08:38:56,host_3,Message from host_3: Event 25 occurred.
2,1900-10-13 02:38:56,host_1,Message from host_1: Event 97 occurred.
3,1900-10-13 10:38:56,host_1,Message from host_1: Event 54 occurred.
4,1900-10-22 13:38:56,host_8,Message from host_8: Event 12 occurred.


## Experiment

In [2]:
import pandas as pd
import re

def preprocess_definition_file(definition_file_path):
    """Read the tab-separated definition file and derive a regex per message.

    Args:
        definition_file_path: Path of a TSV file that has ``Error Code`` and
            ``Error Message`` columns.

    Returns:
        A DataFrame with the ``Error Code`` column and a
        ``processed_message`` column holding the result of
        ``preprocess_message`` for each ``Error Message``.
    """
    definitions = pd.read_csv(definition_file_path, delimiter='\t')
    patterns = definitions['Error Message'].apply(preprocess_message)
    wanted_columns = ['Error Code', 'processed_message']
    return definitions.assign(processed_message=patterns)[wanted_columns]

def preprocess_message(message):
    """Turn a message template with ``%N`` placeholders into a regex string.

    Every ``%`` followed by digits (``%1``, ``%2``, ``%10``, ...) becomes
    ``[^ ]+`` (one or more non-space characters). All other text is
    regex-escaped so that characters such as ``(``, ``.`` or ``?`` in the
    template match literally instead of acting as regex operators or making
    the pattern fail to compile.

    Args:
        message: The template text (str).

    Returns:
        A regular-expression source string suitable for ``re.compile``.
    """
    # Split on the placeholders first, so only the literal text in between
    # is escaped and the placeholders themselves are never touched.
    literal_parts = re.split(r'%\d+', message)
    return r'[^ ]+'.join(re.escape(part) for part in literal_parts)

def identify_error_codes(message_file_path, definition_file_path, output_file_path):
    """Tag each message in a TSV file with the error code whose template matches.

    Each definition is compiled to a regex and tested against every
    ``Error Message`` with ``pattern.match``, i.e. anchored at the start of
    the message only. Definitions are applied in file order, so when several
    match the same message the last one wins.

    Args:
        message_file_path: TSV file with an ``Error Message`` column.
        definition_file_path: TSV file of definitions, passed to
            ``preprocess_definition_file``.
        output_file_path: Where the messages plus their ``Error Code``
            column are written, tab-separated and without the index.
    """
    # Turn every definition template into a regex.
    definition_df = preprocess_definition_file(definition_file_path)

    # Read the error messages to classify.
    message_df = pd.read_csv(message_file_path, delimiter='\t')
    messages = message_df['Error Message']

    # Assign the error code of each matching definition.
    code_pattern_pairs = zip(
        definition_df['Error Code'], definition_df['processed_message']
    )
    for error_code, regex in code_pattern_pairs:
        pattern = re.compile(regex)
        # Blank cells are read as NaN (a float); treat any non-string as
        # "no match" instead of letting pattern.match raise TypeError.
        matched = messages.apply(
            lambda text: isinstance(text, str) and pattern.match(text) is not None
        ).astype(bool)
        message_df.loc[matched, 'Error Code'] = error_code

    # Save the results.
    message_df.to_csv(output_file_path, sep='\t', index=False)

# Example: classify the messages in message.tsv using the definitions in
# def_code.tsv and write the tagged result to output_result.tsv.
message_file_path = './data/message.tsv'
definition_file_path = './data/def_code.tsv'
output_file_path = './data/output_result.tsv'

identify_error_codes(message_file_path, definition_file_path, output_file_path)