In [1]:
import pandas as pd


In [2]:
# Load the synthetic log dataset into the working DataFrame.
df = pd.read_csv("dataset/synthetic_logs.csv")

In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(2410, 5)

In [4]:
# Distinct values of the `source` column.
df["source"].unique()


array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [5]:
# Distinct values of the `target_label` column.
df["target_label"].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.cluster import DBSCAN 
from sentence_transformers import SentenceTransformer 

# Encode every log message with the 'all-MiniLM-L6-v2' SentenceTransformer model.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(
    df["log_message"].tolist(),
    show_progress_bar=True,
)

# Cluster the embeddings with DBSCAN and keep the per-row label in a new
# `clusters` column so groups can be inspected with ordinary filters.
dbscan = DBSCAN(
    eps=0.5,
    min_samples=5,
    metric="euclidean",
)
cluster = dbscan.fit_predict(embeddings)
df["clusters"] = cluster


Batches:   0%|          | 0/76 [00:00<?, ?it/s]

In [15]:
# Rows whose cluster label is 1.
df.loc[df["clusters"] == 1]

Unnamed: 0,timestamp,source,log_message,target_label,complexity,clusters
6,3/1/2025 19:14,ModernHR,Shard 6 replication task ended in failure,Error,bert,1
45,5/22/2025 3:17,ThirdPartyAPI,Data replication task for shard 14 did not com...,Error,bert,1
98,12/7/2025 8:23,ModernCRM,Data replication task failed for shard 17,Error,bert,1
104,6/26/2025 16:19,ModernHR,Replication of data to shard 14 failed,Error,bert,1
190,3/6/2025 2:08,BillingSystem,Data replication task for shard 6 did not comp...,Error,bert,1
262,2/2/2025 1:35,ModernHR,Data replication for shard 13 encountered an i...,Error,bert,1
286,2/5/2025 20:05,ThirdPartyAPI,Data replication for shard 16 was unsuccessful,Error,bert,1
297,11/17/2025 18:21,ModernCRM,Shard 2 experienced a replication failure,Error,bert,1
305,6/14/2025 20:13,ModernHR,Replication error occurred for shard 10,Error,bert,1
358,9/20/2025 14:08,ThirdPartyAPI,Replication of data to shard 1 failed,Error,bert,1


In [16]:
# Preview the first five messages of every cluster label that has more than
# 10 rows, to spot groups with a regular, regex-friendly shape.
cluster_count = df["clusters"].value_counts()
large_cluster = cluster_count[cluster_count > 10].index

# The loop variable is `cluster_id` (not `cluster`) so that it does not
# overwrite the `cluster` label array assigned in the DBSCAN cell; a `for`
# target stays bound in the notebook namespace after the loop finishes.
for cluster_id in large_cluster:
    print(f"Cluster {cluster_id}:")
    print(df[df["clusters"] == cluster_id]["log_message"].head(5).to_string(index=False))
    print()

Cluster 0:
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...

Cluster -1:
    Email service experiencing issues with sending
         Unauthorized access to data was attempted
          Email server encountered a sending fault
Multiple bad login attempts detected on user 85...
Alert: brute force login attempt from 192.168.8...

Cluster 9:
nova.metadata.wsgi.server [-] 10.11.21.138,10.1...
nova.metadata.wsgi.server [req-27e91939-3ba4-4d...
nova.metadata.wsgi.server [-] 10.11.21.143,10.1...
nova.metadata.wsgi.server [req-61196723-e034-48...
nova.metadata.wsgi.server [req-7d3eeb2d-3948-43...

Cluster 7:
User User685 logged out.
 User User395 logged in.
 User User225 logged in.
User User494 logged out.
 User User900 logged in.

Cluster 8:
Backup started at 2025-05-14 07:06:55.
Backup star

In [21]:
import re 
# Ordered (compiled pattern, label) rules for log lines with a fixed shape.
# Compiled once here so that a per-row `.apply` does not rebuild them on every
# call. Literal full stops are escaped (`\.`); an unescaped `.` would accept
# any character in that position (e.g. "logged inX").
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Return the label of the first regex rule that matches a log message.

    Rules are tried in the order they are listed in ``_REGEX_RULES`` and are
    matched case-insensitively anywhere in the message (``search``, not
    ``fullmatch``).

    Args:
        log_message: The raw log line. Non-string values (for example the
            float NaN pandas uses for a missing cell) are treated as
            unclassifiable instead of raising ``TypeError``.

    Returns:
        The label string of the first matching rule, or ``None`` when no rule
        matches or the input is not a string.
    """
    if not isinstance(log_message, str):
        return None
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return None


In [20]:
# Smoke test with a message that matches none of the patterns. The function as
# defined in this notebook falls through to `return None`; a displayed
# 'Unknown' comes from a run with a lower execution count than that definition.
classify_with_regex("hey bro")  

'Unknown'

In [22]:
# Tag every row with its regex-derived label (None where no pattern matched),
# then display the frame to inspect the new column.
df["regex_label"] = df["log_message"].apply(classify_with_regex)
df

Unnamed: 0,timestamp,source,log_message,target_label,complexity,clusters,regex_label
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,-1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,-1,
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0,
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0,
...,...,...,...,...,...,...,...
2405,2025-08-13 07:29:25,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,bert,0,
2406,1/11/2025 5:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,bert,-1,
2407,2025-08-03 03:07:47,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,bert,9,
2408,11/11/2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error,bert,-1,
