In [1]:
import pandas as pd
# Load the log dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('dataset/synthetic_logs.csv')


In [2]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [3]:
# Remove the 'complexity' column by rebinding `df` instead of mutating it in
# place; errors='ignore' keeps this cell re-runnable once the column is gone
# (the in-place version raised KeyError on a second run).
df = df.drop(columns=['complexity'], errors='ignore')

In [4]:
# Confirm that the 'complexity' column is no longer present.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status


In [5]:
# List the distinct values of the 'source' column.
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [6]:
# List the distinct values of the 'target_label' column (the classes to predict).
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [7]:
from sentence_transformers import SentenceTransformer


  from .autonotebook import tqdm as notebook_tqdm


In [12]:
from sklearn.cluster import DBSCAN
import numpy as np

# Pretrained sentence-embedding model; `model` is reused further down to embed
# the filtered messages.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Turn every log message into an embedding vector.
embeddings = model.encode(df['log_message'].tolist())

# Cluster the embeddings with DBSCAN and store each row's cluster id on the frame.
dbscan = DBSCAN(
    eps=0.2,
    min_samples=1,
    metric='euclidean',
)
clusters = dbscan.fit_predict(embeddings)
df['cluster'] = clusters

In [10]:
# Check the newly added 'cluster' column.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,-1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,-1
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0


In [21]:
# Inspect the rows that DBSCAN assigned to cluster 0.
df[df['cluster']==0]

Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
12,2025-07-12 03:03:18,ModernHR,nova.osapi_compute.wsgi.server [req-d4f8d0c2-4...,HTTP Status,0
16,2025-11-15 14:48:30,BillingSystem,nova.osapi_compute.wsgi.server [req-6fe0e366-f...,HTTP Status,0
19,2025-07-10 01:17:59,BillingSystem,nova.osapi_compute.wsgi.server [req-945d1f31-a...,HTTP Status,0
...,...,...,...,...,...
2371,2025-03-21 14:03:47,ModernCRM,nova.osapi_compute.wsgi.server [req-1239a305-a...,HTTP Status,0
2374,2025-10-08 19:22:16,BillingSystem,nova.osapi_compute.wsgi.server [req-86058deb-b...,HTTP Status,0
2384,2025-06-15 11:55:18,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-31a940b9-3...,HTTP Status,0
2389,2025-11-07 19:51:09,ThirdPartyAPI,nova.osapi_compute.wsgi.server [req-410ed8a3-3...,HTTP Status,0


In [42]:
import re
# Ordered (compiled pattern, label) rules, built once rather than on every call.
# Literal full stops are escaped so they no longer match an arbitrary character.
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully\.", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "User Action"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Label a log message using a fixed list of regex rules.

    The rules are tried in order with case-insensitive ``search`` semantics,
    and the label of the first matching rule is returned.

    Parameters
    ----------
    log_message : str
        Raw log text. Any non-string value (for example a float NaN) is
        treated as unmatched instead of raising ``TypeError``.

    Returns
    -------
    str
        ``"User Action"`` or ``"System Notification"`` when a rule matches,
        otherwise the sentinel string ``"nan"`` (a string, not a float NaN)
        that the rest of the notebook filters on.
    """
    if not isinstance(log_message, str):
        return "nan"
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return "nan"

In [43]:
# Sanity check: matching is case-insensitive, so lowercase "user123" still
# yields 'User Action'.
classify_with_regex("User user123 logged out.")

'User Action'

In [44]:
# Run the regex classifier over every message; unmatched rows get the string "nan".
df['regex_label'] = df['log_message'].map(classify_with_regex)

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,Other
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,Other
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2,Other
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,Other
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,3,Other
...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,4,Other
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,1061,Other
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,172,Other
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,1062,Other


In [46]:
# Keep only the rows no regex rule matched (label is the string "nan"); take a
# copy so the result is independent of `df`.
df_non_regex = df.loc[df['regex_label'] == 'nan'].copy()

In [47]:
# (rows, columns) of the subset the regex rules did not label.
df_non_regex.shape

(1910, 6)

In [48]:
# Leave out the LegacyCRM rows, then list the remaining sources to verify the filter.
df_non_legacy = df_non_regex.loc[df_non_regex['source'] != 'LegacyCRM']
df_non_legacy['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [49]:
# Embed only the filtered messages (unmatched by regex and not from LegacyCRM).
filtered_embeddings = model.encode(
    df_non_legacy['log_message'].tolist()
)

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Features are the sentence embeddings; targets are the labels of the same rows.
X = filtered_embeddings
y = df_non_legacy['target_label']

# Hold out 30% of the rows for evaluation, with a fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Fit a logistic-regression classifier on the training embeddings.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Score the held-out rows and print per-class precision / recall / F1.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.91      1.00      0.95        48
         Error       0.98      0.89      0.93        47
   HTTP Status       1.00      1.00      1.00       304
Resource Usage       1.00      1.00      1.00        49
Security Alert       1.00      0.99      1.00       123

      accuracy                           0.99       571
     macro avg       0.98      0.98      0.98       571
  weighted avg       0.99      0.99      0.99       571



In [53]:
import os

import joblib

# Create the target directory first: joblib.dump opens the file for writing and
# fails with FileNotFoundError when ../models does not exist yet.
os.makedirs('../models', exist_ok=True)
joblib.dump(clf,'../models/log_classifier.joblib')

['../models/log_classifier.joblib']