In [None]:
import sys
# Append the relative directory "src" to the module search path so that modules
# stored there can be imported by later cells. Because the entry is relative,
# it is resolved against the kernel's current working directory.
sys.path.append("src")

In [None]:
import json
import pandas as pd

# Load the JSON-lines export; every line of the file becomes one row of `jsonObj`.
file_path = 'log_response.jsonl'
jsonObj = pd.read_json(file_path, lines=True)

# Each row's `result` is a batch whose `results` list holds the individual log
# entries. Walk the rows in order and gather all entries into one flat list.
raw_logs = [
    entry
    for _, response_row in jsonObj.iterrows()
    for entry in response_row['result']['results']
]

# One row per log entry.
df_raw_logs = pd.DataFrame(raw_logs)

def _pairs_to_dict(pairs):
    """Turn a list of {'key': k, 'value': v} objects into a plain {k: v} dict.

    An object without a 'value' entry maps its key to None.
    """
    return {obj['key']: obj.get('value') for obj in pairs}


# `metadata` and `labels` hold lists of key/value objects: one dict per log entry.
metadata_records = df_raw_logs['metadata'].apply(_pairs_to_dict)
labels_records = df_raw_logs['labels'].apply(_pairs_to_dict)

# `userData` is a JSON string. Normalising a single record and calling
# to_dict() yields {column: {0: value}}, so the json_normalize call below
# produces column names that carry a trailing ".0".
userData_records = df_raw_logs['userData'].apply(
    lambda row: pd.json_normalize(json.loads(row)).to_dict()
)

# Create DataFrames from the extracted dictionaries
userData_df = pd.json_normalize(userData_records)
metadata_df = pd.json_normalize(metadata_records)
labels_df = pd.json_normalize(labels_records)

# Remove the ".0" artefact described above. The pattern is anchored to the end
# of the name and applied to the userData columns only: an unanchored
# replace('.0', '') over every column would also rewrite unrelated names that
# merely contain ".0" (and, where the pattern is treated as a regex, any
# character followed by "0", e.g. "http.status_200").
userData_df.columns = userData_df.columns.str.replace(r'\.0$', '', regex=True)

# Concatenate the new DataFrames with the original DataFrame
df_logs = pd.concat(
    [
        df_raw_logs.drop(['metadata', 'labels', 'userData'], axis=1),
        userData_df,
        metadata_df,
        labels_df,
    ],
    axis=1,
)

df_logs

In [None]:
# Columns of df_logs that hold the message text and the severity label.
content_col = "logRecord.body"
level_col = "logRecord.severityText"

# Render every row of df_logs as a single "<timestamp> <level>: <content>" line,
# in row order.
logs = [
    f"{timestamp} {level}: {content}"
    for timestamp, level, content in zip(
        df_logs["timestamp"], df_logs[level_col], df_logs[content_col]
    )
]

In [None]:
# Vectorised construction of the same "<timestamp> <level>: <content>" strings
# that the loop in the previous cell builds. The result is only displayed here,
# not assigned; the `logs` list from the loop is what later cells consume.
# NOTE(review): unlike the f-string loop, `+` on these columns presumably
# yields NaN for rows where any of the three fields is missing - confirm
# before replacing the loop with this expression.
(df_logs['timestamp'] + " " + df_logs[level_col] + ": " + df_logs[content_col]).to_list()

In [None]:
# `analyze` is not a standard-library module; presumably it lives in the "src"
# directory that the first cell appends to sys.path - TODO confirm.
from analyze import analyze_logs

# analyze_logs receives the formatted log lines and returns two objects,
# unpacked here as a per-log table and a template table. Later cells merge
# `df_enriched_logs` with df_logs on the index and group it by "EventTemplate".
df_enriched_logs, df_templates = analyze_logs(logs)

In [None]:
# Display the template table returned by analyze_logs.
df_templates

In [None]:
# Join the analyze_logs output with the flattened log fields row by row, using
# the index of both frames as the join key (default inner join).
# NOTE(review): assumes analyze_logs kept the positional index of `logs`, so
# that row i of both frames describes the same log entry - confirm.
df_final_logs = pd.merge(
    df_enriched_logs,
    df_logs,
    left_index=True,
    right_index=True,
)

In [None]:
# Columns left out of the per-template value summary built below.
excluded_columns = ["EventId", "EventTemplate", "ParameterList"]

# Every other column of df_final_logs, in its original order.
included_columns = df_final_logs.columns[
    ~df_final_logs.columns.isin(excluded_columns)
].tolist()

def process_template_group(group, columns=None, max_values=10, max_chars=50):
    """Summarise the distinct values of each column within one template group.

    For every column the unique values are joined into a short, human-readable
    string. Columns whose value differs on every row of the group are left out
    (they do not characterise the template).

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single group.
    columns : iterable of column labels, optional
        Columns to summarise. Defaults to the notebook-level
        ``included_columns`` list.
    max_values : int, default 10
        When a column has more unique values than this, only their count is
        reported instead of the values themselves.
    max_chars : int, default 50
        Summaries longer than this are cut and suffixed with
        "... (truncated)".

    Returns
    -------
    pandas.DataFrame
        A single-row frame (index 0) with one column per summarised input
        column plus an "occurrences" column holding the group size.
    """
    if columns is None:
        # Fall back to the notebook-level list defined next to this function.
        columns = included_columns

    group_size = len(group)
    total_values = {}

    # Collect unique values from each column in the group
    for col in columns:
        try:
            values = group[col].unique().tolist()
        except (KeyError, AttributeError, TypeError):
            # Best effort: skip columns that are missing, duplicated (the
            # lookup then returns a frame without .unique()) or that hold
            # unhashable values which cannot be de-duplicated.
            continue

        nvalues = len(values)
        if nvalues == group_size:
            # A different value on every row: not informative for the group.
            continue

        if nvalues > max_values:
            summary = f"{nvalues} unique values (too many to display)"
        else:
            # Stringify first: joining raw numbers/None would raise TypeError
            # and the column would silently disappear from the summary.
            summary = ", ".join(map(str, values))

        if len(summary) > max_chars:
            summary = summary[:max_chars] + "... (truncated)"
        total_values[col] = summary

    # Calculate occurrences of the group
    total_values["occurrences"] = group_size

    # One-row frame so that groupby(...).apply stacks the groups vertically.
    return pd.DataFrame(pd.Series(total_values)).T

# Build one summary row per template by applying process_template_group to
# every "EventTemplate" group of df_final_logs.
df_enriched_templates = df_final_logs.groupby("EventTemplate").apply(process_template_group)

# reset_index turns the (EventTemplate, 0) index into columns; the inner level
# is unnamed and therefore shows up as "level_1".
# errors="ignore": the "Content" column only exists when at least one group
# kept it, so dropping it unconditionally would raise KeyError otherwise.
df_enriched_templates = df_enriched_templates.reset_index().drop(
    columns=["level_1", "Content"], errors="ignore"
)

# Share of all log rows covered by each template, in percent.
df_enriched_templates['percentage'] = df_enriched_templates['occurrences'] / len(df_final_logs) * 100
df_enriched_templates = df_enriched_templates.sort_values("occurrences", ascending=False)

# Remove log groups with only one occurrence
df_enriched_templates = df_enriched_templates[df_enriched_templates["occurrences"] > 1]


In [None]:
import math

# Convert the template summary to a list of dicts (one per row) and keep only
# the entries that carry a value: None and float NaN entries are left out, so
# each record lists just the fields that were summarised for its template.
records = [
    {
        field: value
        for field, value in template_row.items()
        if value is not None
        and not (isinstance(value, float) and math.isnan(value))
    }
    for template_row in df_enriched_templates.to_dict(orient="records")
]

In [None]:
# Persist the cleaned template records as pretty-printed JSON
# (two-space indentation) in the current working directory.
with open("templates.json", mode="w") as templates_file:
    json.dump(obj=records, fp=templates_file, indent=2)