In [15]:
import re
import pandas as pd

def extract_log_data(line):
    log_data = {
        "log_timestamp": None,
        "device_name": None,
        "component": None,
        "service": None,
        "severity_code": None,
        "event_type": None
    }

    timestamp_regex = r'(\w{3} \d{2} \d{2}:\d{2}:\d{2}(\.\d+)?(?: \w{3})?)|(\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d+\+\d{2}:\d{2})'
    device_regex = r'(\S+)\s+(\S+)\s+\d+\s'
    component_regex = r'RP/\d+/RP\d+/CPU\d+'
    service_regex = r':(\w+)\['
    severity_event_regex = r'%(\w+)-(\w+)-(\d+)-(\w+)'

    timestamp_match = re.search(timestamp_regex, line)
    device_match = re.search(device_regex, line)
    component_match = re.search(component_regex, line)
    service_match = re.search(service_regex, line)
    severity_event_match = re.search(severity_event_regex, line)

    log_data["log_timestamp"] = timestamp_match.group(0) if timestamp_match else None
    log_data["device_name"] = device_match.group(2) if device_match else None
    log_data["component"] = component_match.group(0) if component_match else None
    log_data["service"] = service_match.group(1) if service_match else None

    if severity_event_match:
        log_data["severity_code"] = severity_event_match.group(3)
        log_data["event_type"] = severity_event_match.group(4)

    return log_data

file_path = 'data/telkomdb_output_july.txt'

log_entries = []
with open(file_path, 'r') as file:
    for line in file:
        extracted_data = extract_log_data(line.strip())
        log_entries.append(extracted_data)

df = pd.DataFrame(log_entries)

print(df)

output_csv_path = 'processed_logs.csv'
df.to_csv(output_csv_path, index=False)


           log_timestamp        device_name      component   service  \
0        Jul 31 23:59:59         61.5.14.56  RP/0/RP0/CPU0      exec   
1        Jul 31 23:59:58         61.5.13.10  RP/0/RP1/CPU0      ospf   
2        Jul 31 23:59:58         61.5.13.10  RP/0/RP1/CPU0  mpls_ldp   
3        Jul 31 23:59:58         61.5.13.10  RP/0/RP1/CPU0       pim   
4        Jul 31 23:59:58         61.5.13.10  RP/0/RP1/CPU0       pim   
...                  ...                ...            ...       ...   
1115517             None                Jul           None      None   
1115518             None  RP/0/RP0/CPU0:Jul  RP/0/RP0/CPU0      None   
1115519             None         61.5.14.56  RP/0/RP0/CPU0       pim   
1115520             None  RP/0/RP1/CPU0:Jul  RP/0/RP1/CPU0      None   
1115521             None  RP/0/RP1/CPU0:Jul  RP/0/RP1/CPU0      None   

        severity_code          event_type  
0                   6               CLOSE  
1                   5              ADJCHG  
2  

In [16]:
df.shape

(1115522, 6)

#