In [None]:
import pandas as pd
import datetime
import json

# log preprocessing

## sshd

In [None]:
def transform_log(log_line, comp):
    """Flatten one raw sshd capture line into ``<host> <timestamp><message>``.

    The input is expected to look like ``<prefix>: <json>``, where the JSON
    value is indexable: item 0 is a numeric epoch timestamp and item 1 is a
    mapping holding the original text under the ``"log"`` key.

    Args:
        log_line: One raw line; only the first ``": "`` separates the prefix
            from the JSON payload.
        comp: Marker searched for inside the logged text. Everything up to
            and including its first occurrence is discarded.

    Returns:
        The part of the prefix before the first ``_``, a space, the timestamp
        rendered as a local naive ``datetime``, and then the remainder of the
        message. No separator is inserted before the remainder.

    Raises:
        ValueError: If the line has no ``": "`` separator or the payload is
            not valid JSON.
        IndexError: If ``comp`` does not occur in the logged text.
    """
    source_tag, payload = log_line.split(": ", 1)
    record = json.loads(payload)

    when = datetime.datetime.fromtimestamp(record[0])

    # Keep only what follows the first occurrence of the marker.
    remainder = record[1]["log"].split(comp, 1)[1]

    host = source_tag.split('_')[0]
    return f"{host} {when}{remainder}"


In [None]:
# Convert the raw wp1 sshd capture line by line with the two-argument
# transform_log defined above, writing one converted line per input line.
with open('logs_out/wp1_sshd', 'r') as raw_log, open('logs_out/wp1', 'w') as flat_log:
    flat_log.writelines(
        transform_log(raw_entry, "wp1") + '\n' for raw_entry in raw_log
    )

In [None]:
# Convert the raw wp2 sshd capture line by line with the two-argument
# transform_log defined above, writing one converted line per input line.
with open('logs_out/wp2_sshd', 'r') as raw_log, open('logs_out/wp2', 'w') as flat_log:
    flat_log.writelines(
        transform_log(raw_entry, "wp2") + '\n' for raw_entry in raw_log
    )

In [None]:
# Convert the raw server sshd capture line by line with the two-argument
# transform_log defined above, writing one converted line per input line.
with open('logs_out/sshd', 'r') as raw_log, open('logs_out/server', 'w') as flat_log:
    flat_log.writelines(
        transform_log(raw_entry, "server") + '\n' for raw_entry in raw_log
    )

## docker app

In [None]:
def transform_log(log_line):
    """Flatten one raw docker capture line into ``<name> <timestamp> <message>``.

    The input is expected to look like ``<prefix>: <json>``, where the JSON
    value is indexable: item 0 is a numeric epoch timestamp and item 1 is a
    mapping holding the original text under the ``"log"`` key.

    Args:
        log_line: One raw line; only the first ``": "`` separates the prefix
            from the JSON payload.

    Returns:
        The second dot-separated piece of the prefix, the timestamp rendered
        as a local naive ``datetime``, and the logged text, joined by single
        spaces.

    Raises:
        ValueError: If the line has no ``": "`` separator or the payload is
            not valid JSON.
        IndexError: If the prefix contains no ``.``.
    """
    source_tag, payload = log_line.split(": ", 1)
    # strict=False lets raw control characters appear inside JSON strings.
    record = json.loads(payload, strict=False)

    when = datetime.datetime.fromtimestamp(record[0])
    text = record[1]["log"]

    container = source_tag.split('.')[1]
    return f"{container} {when} {text}"


In [None]:
# Convert the raw docker app capture line by line. A line that cannot be
# converted is reported (0-based line index and the error) and skipped, so a
# single malformed entry does not abort the whole run.
with open('logs_out/docker.app', 'r') as infile, open('logs_out/app', 'w') as outfile:
    # Use a real name for the counter: binding `_` at notebook level would
    # overwrite IPython's "last output" variable.
    for line_index, line in enumerate(infile):
        try:
            transformed_line = transform_log(line)
            outfile.write(transformed_line + '\n')
        except Exception as e:
            print(line_index, e)

## docker db

In [None]:
def transform_log(log_line):
    """Flatten one raw docker db capture line into ``<name> <timestamp> <message>``.

    The input is expected to look like ``<prefix>: <json>``, where the JSON
    value is indexable: item 0 is a numeric epoch timestamp and item 1 is a
    mapping holding the original text under the ``"log"`` key. When the text
    contains ``"LOG: "``, everything up to and including its first occurrence
    is dropped.

    Args:
        log_line: One raw line; only the first ``": "`` separates the prefix
            from the JSON payload.

    Returns:
        The second dot-separated piece of the prefix, the timestamp rendered
        as a local naive ``datetime``, and the (possibly trimmed) text, joined
        by single spaces.

    Raises:
        ValueError: If the line has no ``": "`` separator or the payload is
            not valid JSON.
        IndexError: If the prefix contains no ``.``.
    """
    source_tag, payload = log_line.split(": ", 1)
    # strict=False lets raw control characters appear inside JSON strings.
    record = json.loads(payload, strict=False)

    when = datetime.datetime.fromtimestamp(record[0])

    text = record[1]["log"]
    marker = "LOG: "
    if marker in text:
        # Strip whatever precedes the marker, and the marker itself.
        text = text.split(marker, 1)[1]

    container = source_tag.split('.')[1]
    return f"{container} {when} {text}"


In [None]:
# Convert the raw docker db capture line by line. A line that cannot be
# converted is reported (0-based line index and the error) and skipped, so a
# single malformed entry does not abort the whole run.
with open('logs_out/docker.db', 'r') as infile, open('logs_out/db', 'w') as outfile:
    # Use a real name for the counter: binding `_` at notebook level would
    # overwrite IPython's "last output" variable.
    for line_index, line in enumerate(infile):
        try:
            transformed_line = transform_log(line)
            outfile.write(transformed_line + '\n')
        except Exception as e:
            print(line_index, e)

## docker jupyter

In [None]:
def transform_log(log_line):
    """Turn one raw docker jupyter capture line into ``<name> <timestamp> <message>``.

    The line is split once on ``": "`` into a prefix and a JSON payload. The
    payload is indexed positionally: item 0 is fed to
    ``datetime.datetime.fromtimestamp`` and item 1 must provide a ``"log"``
    entry.

    Args:
        log_line: One raw line of the form ``<prefix>: <json>``.

    Returns:
        A single string made of the second dot-separated piece of the prefix,
        the local naive timestamp and the logged text, separated by spaces.

    Raises:
        ValueError: If the ``": "`` separator is missing or the payload is not
            valid JSON.
        IndexError: If the prefix contains no ``.``.
    """
    label, body = log_line.split(": ", 1)
    # Non-strict parsing tolerates raw control characters inside strings.
    parsed = json.loads(body, strict=False)

    moment = datetime.datetime.fromtimestamp(parsed[0])
    message = parsed[1]["log"]

    return "{} {} {}".format(label.split('.')[1], moment, message)


In [None]:
# Convert the raw docker jupyter capture line by line. A line that cannot be
# converted is reported (0-based line index and the error) and skipped, so a
# single malformed entry does not abort the whole run.
with open('logs_out/docker.jupyter', 'r') as infile, open('logs_out/jupyter', 'w') as outfile:
    # Use a real name for the counter: binding `_` at notebook level would
    # overwrite IPython's "last output" variable.
    for line_index, line in enumerate(infile):
        try:
            transformed_line = transform_log(line)
            outfile.write(transformed_line + '\n')
        except Exception as e:
            print(line_index, e)

# log parsing

In [None]:
# Run the external log_parsing.py script in a shell; its progress is printed below.
# NOTE(review): presumably this produces the result/*_structured.csv and
# result/*_templates.csv files read in the next section - confirm.
! python log_parsing.py

Parsing file: ./db




























































































































































































































Total lines:  60539
Processed 1.7% of log lines.
Processed 3.3% of log lines.
Processed 5.0% of log lines.
Processed 6.6% of log lines.
Processed 8.3% of log lines.
Processed 9.9% of log lines.
Processed 11.6% of log lines.
Processed 13.2% of log lines.
Processed 14.9% of log lines.
Processed 16.5% of log lines.
Processed 18.2% of log lines.
Processed 19.8% of log lines.
Processed 21.5% of log lines.
Processed 23.1% of log lines.
Processed 24.8% of log lines.
Processed 26.4% of log lines.
Processed 28.1% of log lines.
Processed 29.7% of log lines.
Processed 31.4% of log lines.
Processed 33.0% of log lines.
Processed 34.7% of log lines.
Processed 36.3% of log lines.
Processed 38.0% of log lines.
Processed 39.6% of log lines.
Processed 41.3% of log line

# combine logs

In [None]:
# Load the structured (parsed) log table of every source, in a fixed order.
structured_paths = [
    'result/db_structured.csv',
    'result/jupyter_structured.csv',
    'result/app_structured.csv',
    'result/wp1_structured.csv',
    'result/wp2_structured.csv',
    'result/server_structured.csv',
]
df1, df2, df3, df4, df5, df6 = map(pd.read_csv, structured_paths)

In [None]:
# Give every per-source frame a DatetimeIndex built from its Date and Time
# columns, so the frames can be ordered chronologically once concatenated.
for frame in (df1, df2, df3, df4, df5, df6):
    frame['Datetime'] = pd.to_datetime(frame['Date'] + ' ' + frame['Time'])
    frame.set_index('Datetime', inplace=True)

In [None]:
# Stack all sources and order the rows chronologically.
# A stable sort is requested explicitly: with the default (non-stable)
# quicksort, rows that share the same timestamp may come out in arbitrary
# relative order, which would scramble the original line order within a source
# before LineId is reassigned. With mergesort, ties keep their concat order.
combined_df = pd.concat([df1, df2, df3, df4, df5, df6])
combined_df.sort_index(kind='mergesort', inplace=True)

In [None]:
# Renumber LineId as 1..N so it follows the row order of the combined frame.
combined_df['LineId'] = list(range(1, len(combined_df) + 1))

In [None]:
# Date and Time are now redundant with the Datetime index.
combined_df = combined_df.drop(columns=['Date', 'Time'])

In [None]:
# Persist the merged frame (the Datetime index is written as the first column)
# and display its shape as a quick sanity check.
combined_df.to_csv('combined.csv')
combined_df.shape

(97757, 6)

In [None]:
# save templates
df1 = pd.read_csv('result/db_templates.csv')
df2 = pd.read_csv('result/jupyter_templates.csv')
df3 = pd.read_csv('result/app_templates.csv')
df4 = pd.read_csv('result/wp1_templates.csv')
df5 = pd.read_csv('result/wp2_templates.csv')
df6 = pd.read_csv('result/server_templates.csv')

templates_df = pd.concat([df1, df2, df3, df4, df5, df6])
templates_df.to_csv('templates.csv', index=False)

In [None]:
# Preview the first rows of the merged template table.
templates_df.head()

Unnamed: 0,EventId,EventTemplate,Occurrences
0,a5fc2aed,statement: UPDATE <*> SET <*> WHERE <*> = <*>,28217
1,c32030ca,statement: UPDATE dag SET is_active=false WHER...,9098
2,af812c53,statement: DELETE FROM import_error WHERE impo...,9123
3,c8ad4490,statement: UPDATE task_instance SET state='sch...,6192
4,004f521e,statement: UPDATE dag SET is_active=false WHER...,25


# labels

In [None]:
# Reload the merged log from disk. No index_col is given, so Datetime comes
# back as an ordinary column and the rows get a default 0..N-1 integer index.
df = pd.read_csv('combined.csv')

In [None]:
# Inspect one randomly chosen row (no seed is set, so it differs between runs).
df.sample()

Unnamed: 0,Datetime,LineId,Computer,Content,EventId,EventTemplate,ParameterList
83765,2024-05-11 20:03:21.000000,83766,db,statement: UPDATE dag SET is_active=false WHER...,c32030ca,statement: UPDATE dag SET is_active=false WHER...,"[""'/opt/airflow/dags/prediction.py'"", ""('predi..."


In [None]:
# Start with every row labelled 0; the following cells overwrite the label of
# rows matching the anomaly patterns. Presumably 0 means "normal" - confirm.
df['Label'] = 0

In [None]:
# Rows whose content mentions the address 192.168.56.1, restricted to the
# 'app' source. regex=False makes the dots literal; with the default regex
# matching every "." would match any character.
# NOTE(review): this is still a substring match, so an address such as
# 192.168.56.10 would match as well - confirm that is acceptable.
mask = df['Content'].str.contains('192.168.56.1', na=False, regex=False)
filtered_df = df[mask]
filtered_df = filtered_df[filtered_df['Computer'] == 'app']

# anomaly 4
mask1 = filtered_df['Content'].str.contains('endpoint1', na=False)
# anomaly 2
mask2 = filtered_df['Content'].str.contains('endpoint3', na=False)
an4 = filtered_df[mask1].index
an2 = filtered_df[mask2].index

# an4/an2 hold index labels, so assign through .loc and name the column
# explicitly. The previous .iloc[..., -1] form only worked while the index was
# the default 0..N-1 range and 'Label' happened to be the last column.
# Anomaly 2 is written last, so it wins for rows matching both patterns.
df.loc[an4, 'Label'] = 4
df.loc[an2, 'Label'] = 2

In [None]:
# anomalies 1: every row whose content mentions 'temp1', from any source.
mask = df['Content'].str.contains('temp1', na=False)
filtered_df = df[mask]
an1 = filtered_df.index

# an1 holds index labels, so assign through .loc and name the column
# explicitly. The previous .iloc[..., -1] form only worked while the index was
# the default 0..N-1 range and 'Label' happened to be the last column.
df.loc[an1, 'Label'] = 1

In [None]:
# Save the labelled log; the integer row index is written as the first column.
df.to_csv('comb_structured.csv')

In [None]:
# Count rows per label to check the labelling done in the cells above.
df['Label'].value_counts()

Label1
0    97703
4       27
1       17
2       10
Name: count, dtype: int64