# Read Data and separate and create feature using time  

In [12]:
import pandas as pd 
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# --- Load ---
df = pd.read_csv('BGL_2k.log_structured.csv')

# --- Single time axis from UNIX seconds (robust and fast) ---
df['ts'] = pd.to_datetime(df['Timestamp'], unit='s', errors='coerce')

# --- Calendar features from ts ---
df['hour'] = df['ts'].dt.hour
df['day'] = df['ts'].dt.day
df['weekday'] = df['ts'].dt.weekday        # Monday=0
df['week'] = df['ts'].dt.isocalendar().week.astype(int)

# --- Stable sort for time-based ops ---
df['_orig_idx'] = np.arange(len(df))
df_sorted = df.sort_values(['Node', 'ts'], kind='stable').copy()

# --- Time since last event per Node (seconds) ---
delta_node = df_sorted.groupby('Node')['ts'].diff()
df_sorted['time_since_last_event'] = delta_node.dt.total_seconds()

# --- Rolling event frequency over last W seconds (per Node) ---
W = 60  # window seconds
df_sorted['_one'] = 1
df_sorted['rolling_event_frequency'] = (
    df_sorted
    .groupby('Node', group_keys=False)
    .apply(lambda g: g.rolling(f'{W}s', on='ts')['_one'].sum())
)

# --- Recent event score per (Node, EventId): exp(-Δt/τ) ---
tau = 30.0  # seconds
delta_evt = df_sorted.groupby(['Node', 'EventId'])['ts'].diff().dt.total_seconds()
df_sorted['recent_event_score'] = np.exp(-(delta_evt / tau)).fillna(0.0)

# --- Event burst per (Node, EventId): inter-arrival < threshold ---
threshold = 30.0  # seconds
df_sorted['event_burst'] = (delta_evt < threshold).astype('Int8')

# --- Replace NaN values with 0 in the numeric columns ---
df_sorted['time_since_last_event'].fillna(0, inplace=True)
df_sorted['rolling_event_frequency'].fillna(0, inplace=True)
df_sorted['recent_event_score'].fillna(0, inplace=True)
df_sorted['event_burst'].fillna(0, inplace=True)

# --- Restore original order & clean up ---
df_final = (
    df_sorted
    .sort_values('_orig_idx', kind='stable')
    .drop(columns=['_orig_idx', '_one'])
)

# --- Preview new features ---
cols_show = [
    'hour', 'day', 'weekday', 'week',
    'time_since_last_event', 'rolling_event_frequency',
    'recent_event_score', 'event_burst'
]

df_time = df_final[cols_show]
print(df_time.head())

   hour  day  weekday  week  time_since_last_event  rolling_event_frequency  \
0    22    3        4    22                    0.0                      1.0   
1    22    3        4    22                    3.0                      2.0   
2    22    3        4    22                  403.0                      1.0   
3    22    3        4    22                    2.0                      2.0   
4    23    3        4    22                    0.0                      1.0   

   recent_event_score  event_burst  
0            0.000000            0  
1            0.904837            1  
2            0.000001            0  
3            0.935507            1  
4            0.000000            0  


# New features in df_time

In [13]:
df_time

Unnamed: 0,hour,day,weekday,week,time_since_last_event,rolling_event_frequency,recent_event_score,event_burst
0,22,3,4,22,0.0,1.0,0.000000e+00,0
1,22,3,4,22,3.0,2.0,9.048374e-01,1
2,22,3,4,22,403.0,1.0,1.465472e-06,0
3,22,3,4,22,2.0,2.0,9.355070e-01,1
4,23,3,4,22,0.0,1.0,0.000000e+00,0
...,...,...,...,...,...,...,...,...
1995,6,27,1,52,4148.0,1.0,8.944368e-61,0
1996,7,27,1,52,3954.0,1.0,5.754231e-58,0
1997,7,27,1,52,87.0,1.0,5.502322e-02,0
1998,9,27,1,52,5981.0,1.0,2.607095e-87,0


# Use Level and Component to create more robust features

In [14]:
import pandas as pd
import numpy as np

# --- Load ---
df = pd.read_csv('BGL_2k.log_structured.csv')

# (Optional) normalise Level text
df['Level'] = df['Level'].astype(str).str.upper().str.strip()

# -------- One-Hot for Component --------
# Keep original 'Component' elsewhere; get_dummies returns new columns and drops the original if used with 'columns'
comp_dummies = pd.get_dummies(df['Component'], prefix='Component', dtype='Int8')

# -------- Ordinal + One-Hot for Level --------
# Define an explicit order; unseen labels become NaN for ordinal and all-zeros for one-hot
level_order = ['DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
level_cat = pd.Categorical(df['Level'], categories=level_order, ordered=True)

# Ordinal (1..len(level_order)); NaN for unknowns
df['Level_Ordinal'] = pd.Series(level_cat.codes).replace(-1, np.nan) + 1

# One-hot on the ordered categorical
level_dummies = pd.get_dummies(level_cat, prefix='Level', dtype='Int8')

# -------- Assemble output --------
df_out = pd.concat([df, comp_dummies, level_dummies], axis=1)

# -------- Preview: only newly added encoding features at the end --------
new_cols = ['Level_Ordinal'] + list(comp_dummies.columns) + list(level_dummies.columns)
df_com_lev = df_out[new_cols].head()
print(df_com_lev)

   Level_Ordinal  Component_APP  Component_DISCOVERY  Component_HARDWARE  \
0            2.0              0                    0                   0   
1            2.0              0                    0                   0   
2            2.0              0                    0                   0   
3            2.0              0                    0                   0   
4            2.0              0                    0                   0   

   Component_KERNEL  Component_MMCS  Level_DEBUG  Level_INFO  Level_WARN  \
0                 1               0            0           1           0   
1                 1               0            0           1           0   
2                 1               0            0           1           0   
3                 1               0            0           1           0   
4                 1               0            0           1           0   

   Level_ERROR  Level_CRITICAL  
0            0               0  
1            0      

# New features using Level and Component

In [15]:
df_com_lev

Unnamed: 0,Level_Ordinal,Component_APP,Component_DISCOVERY,Component_HARDWARE,Component_KERNEL,Component_MMCS,Level_DEBUG,Level_INFO,Level_WARN,Level_ERROR,Level_CRITICAL
0,2.0,0,0,0,1,0,0,1,0,0,0
1,2.0,0,0,0,1,0,0,1,0,0,0
2,2.0,0,0,0,1,0,0,1,0,0,0
3,2.0,0,0,0,1,0,0,1,0,0,0
4,2.0,0,0,0,1,0,0,1,0,0,0


# Using BERT with Content

In [16]:
# -*- coding: utf-8 -*-
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModel

# ------------------------------
# Config
# ------------------------------
CSV_PATH = "BGL_2k.log_structured.csv"
TEXT_COL = "Content"
MODEL_NAME = "bert-base-uncased"
MAX_LEN = 256
BATCH_SIZE = 32  # adjust for your GPU/CPU memory

# ------------------------------
# Load data
# ------------------------------
df = pd.read_csv(CSV_PATH)
texts = df[TEXT_COL].fillna("").astype(str).tolist()

# ------------------------------
# Load model + tokenizer
# ------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME).to(device)
model.eval()

# ------------------------------
# Embed in batches (mean-pooled token embeddings)
# ------------------------------
def embed_texts(text_list):
    all_embs = []
    with torch.no_grad():
        for i in range(0, len(text_list), BATCH_SIZE):
            batch = text_list[i:i+BATCH_SIZE]
            enc = tokenizer(
                batch,
                padding=True,
                truncation=True,
                max_length=MAX_LEN,
                return_tensors="pt"
            )
            enc = {k: v.to(device) for k, v in enc.items()}
            out = model(**enc).last_hidden_state               # [B, T, H]
            mask = enc["attention_mask"].unsqueeze(-1)         # [B, T, 1]
            summed = (out * mask).sum(dim=1)                   # [B, H]
            counts = torch.clamp(mask.sum(dim=1), min=1e-6)    # [B, 1]
            embs = (summed / counts).cpu().numpy()             # [B, H]
            all_embs.append(embs)
    return np.vstack(all_embs)

emb = embed_texts(texts)

# ------------------------------
# Build embedding dataframe and merge
# ------------------------------
emb_cols = [f"emb_{i}" for i in range(emb.shape[1])]
embedding_df = pd.DataFrame(emb, columns=emb_cols)

df_bert = embedding_df

df_bert

Unnamed: 0,emb_0,emb_1,emb_2,emb_3,emb_4,emb_5,emb_6,emb_7,emb_8,emb_9,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,-0.238885,-0.279504,-0.446043,0.170861,0.573125,-0.060092,0.436477,0.383467,0.043320,0.173512,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1,-0.238885,-0.279504,-0.446043,0.170861,0.573125,-0.060092,0.436477,0.383467,0.043320,0.173512,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
2,-0.238885,-0.279504,-0.446043,0.170861,0.573125,-0.060092,0.436477,0.383467,0.043320,0.173512,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
3,-0.238885,-0.279504,-0.446043,0.170861,0.573125,-0.060092,0.436477,0.383467,0.043320,0.173512,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
4,0.029313,-0.328955,0.184085,-0.060321,0.180170,-0.038370,0.082713,0.456499,0.127342,-0.070724,...,0.084319,0.010888,-0.166901,-0.509577,0.549783,-0.321617,0.288349,-0.104305,-0.079109,-0.295976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,-0.238885,-0.279504,-0.446043,0.170861,0.573125,-0.060092,0.436477,0.383467,0.043320,0.173512,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1996,-0.238885,-0.279504,-0.446043,0.170861,0.573125,-0.060092,0.436477,0.383467,0.043320,0.173512,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1997,-0.238885,-0.279504,-0.446043,0.170861,0.573125,-0.060092,0.436477,0.383467,0.043320,0.173512,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1998,-0.238885,-0.279504,-0.446043,0.170861,0.573125,-0.060092,0.436477,0.383467,0.043320,0.173512,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316


# Combined all the new features (Time + Level with Component + Bert fro Content)

In [17]:
df_final = pd.concat([df_time.reset_index(drop=True), df_com_lev.reset_index(drop=True), df_bert.reset_index(drop=True)], axis=1)



# df_final = df_time.fillna(0) + df_com_lev.fillna(0) + df_bert.fillna(0)

# df_final = df_time+df_com_lev+df_bert
df_final

Unnamed: 0,hour,day,weekday,week,time_since_last_event,rolling_event_frequency,recent_event_score,event_burst,Level_Ordinal,Component_APP,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,22,3,4,22,0.0,1.0,0.000000e+00,0,2.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1,22,3,4,22,3.0,2.0,9.048374e-01,1,2.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
2,22,3,4,22,403.0,1.0,1.465472e-06,0,2.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
3,22,3,4,22,2.0,2.0,9.355070e-01,1,2.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
4,23,3,4,22,0.0,1.0,0.000000e+00,0,2.0,0,...,0.084319,0.010888,-0.166901,-0.509577,0.549783,-0.321617,0.288349,-0.104305,-0.079109,-0.295976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,6,27,1,52,4148.0,1.0,8.944368e-61,0,,,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1996,7,27,1,52,3954.0,1.0,5.754231e-58,0,,,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1997,7,27,1,52,87.0,1.0,5.502322e-02,0,,,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1998,9,27,1,52,5981.0,1.0,2.607095e-87,0,,,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316


# Check null value and replace with 0

In [18]:
df_final.isnull().values.any()

True

In [19]:
df_final = df_final.fillna(0)

In [20]:
df_final.isnull().values.any()

False

# Final Data with all new features (787)

In [21]:
df_final

Unnamed: 0,hour,day,weekday,week,time_since_last_event,rolling_event_frequency,recent_event_score,event_burst,Level_Ordinal,Component_APP,...,emb_758,emb_759,emb_760,emb_761,emb_762,emb_763,emb_764,emb_765,emb_766,emb_767
0,22,3,4,22,0.0,1.0,0.000000e+00,0,2.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1,22,3,4,22,3.0,2.0,9.048374e-01,1,2.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
2,22,3,4,22,403.0,1.0,1.465472e-06,0,2.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
3,22,3,4,22,2.0,2.0,9.355070e-01,1,2.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
4,23,3,4,22,0.0,1.0,0.000000e+00,0,2.0,0,...,0.084319,0.010888,-0.166901,-0.509577,0.549783,-0.321617,0.288349,-0.104305,-0.079109,-0.295976
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,6,27,1,52,4148.0,1.0,8.944368e-61,0,0.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1996,7,27,1,52,3954.0,1.0,5.754231e-58,0,0.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1997,7,27,1,52,87.0,1.0,5.502322e-02,0,0.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316
1998,9,27,1,52,5981.0,1.0,2.607095e-87,0,0.0,0,...,0.602273,0.099514,0.006004,-0.440757,0.172369,0.299699,0.237069,-0.093876,-0.716298,0.331316


# Checking the Datatype and scale the data

In [22]:
df_final.dtypes

hour                       int32
day                        int32
weekday                    int32
week                       int64
time_since_last_event    float64
                          ...   
emb_763                  float32
emb_764                  float32
emb_765                  float32
emb_766                  float32
emb_767                  float32
Length: 787, dtype: object

In [23]:
from sklearn.preprocessing import StandardScaler
import numpy as np
# 1) Standardize
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_final)

df_scaled

array([[ 1.30378412, -1.20098756,  0.31716748, ...,  0.26540737,
        -2.72061225,  1.14446886],
       [ 1.30378412, -1.20098756,  0.31716748, ...,  0.26540737,
        -2.72061225,  1.14446886],
       [ 1.30378412, -1.20098756,  0.31716748, ...,  0.26540737,
        -2.72061225,  1.14446886],
       ...,
       [-0.74415156,  1.62472701, -1.25167413, ...,  0.26540737,
        -2.72061225,  1.14446886],
       [-0.47109347,  1.62472701, -1.25167413, ...,  0.26540737,
        -2.72061225,  1.14446886],
       [ 0.3480808 , -1.20098756, -1.25167413, ..., -1.5342998 ,
        -0.57145739,  1.18296696]])

# Save this new scaled dataframe in CSV formate

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_final)

# Convert to DataFrame and save
df_scaled_df = pd.DataFrame(df_scaled, columns=df_final.columns)
df_scaled_df.to_csv('df_scaled.csv', index=False)