In [1]:
import pandas as pd

In [2]:
# Load the raw log dataset; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="dataset/synthetic_logs.csv")
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,timestamp,source,log_message,target_label
0,27-06-2025 07:20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,12-07-2025 00:24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,02-06-2025 18:25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status
...,...,...,...,...
2405,13-08-2025 07:29,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status
2406,01-11-2025 05:32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert
2407,03-08-2025 03:07,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status
2408,11-11-2025 11:52,BillingSystem,Email service affected by failed transmission,Critical Error


In [3]:
# Distinct values of the `source` column.
df['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [4]:
# Distinct values of the `target_label` column.
df['target_label'].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [6]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN
import numpy as np


# Sentence-embedding model, referenced by name.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every log message, in row order.
embeddings = model.encode(df.log_message.tolist())


In [8]:
# Cluster the message embeddings by cosine distance.
dbscan = DBSCAN(metric='cosine', eps=0.2, min_samples=1)
# `labels_` after `fit` holds the per-sample cluster ids that `fit_predict` returns.
clusters = dbscan.fit(embeddings).labels_

# Attach the cluster id of each message to the frame and preview it.
df['clusters'] = clusters
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,clusters
0,27-06-2025 07:20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2
3,12-07-2025 00:24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,02-06-2025 18:25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0


In [10]:
# First rows assigned to cluster 0.
df.loc[df['clusters'] == 0].head()

Unnamed: 0,timestamp,source,log_message,target_label,clusters
0,27-06-2025 07:20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
3,12-07-2025 00:24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,02-06-2025 18:25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0
5,09-10-2025 10:30,ModernHR,nova.osapi_compute.wsgi.server [req-f0bffbc3-5...,HTTP Status,0
9,30-03-2025 04:01,ModernHR,nova.osapi_compute.wsgi.server [req-2bf7cfee-a...,HTTP Status,0


In [14]:
# Number of messages per cluster, most frequent first.
cluster_counts = df['clusters'].value_counts()
# Keep only clusters with more than 10 members.
large_clusters = cluster_counts.loc[cluster_counts > 10].index
for cluster in large_clusters:
    # Show up to five example messages from this cluster.
    sample_messages = df.loc[df['clusters'] == cluster, 'log_message'].head(5)
    print(f"Cluster {cluster}:")
    print(sample_messages.to_string(index=False))
    print()



Cluster 0:
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...

Cluster 5:
nova.compute.claims [req-a07ac654-8e81-416d-bfb...
nova.compute.claims [req-d6986b54-3735-4a42-907...
nova.compute.claims [req-72b4858f-049e-49e1-b31...
nova.compute.claims [req-5c8f52bd-8e3c-41f0-95a...
nova.compute.claims [req-d38f479d-9bb9-4276-968...

Cluster 11:
User User685 logged out.
 User User395 logged in.
 User User225 logged in.
User User494 logged out.
 User User900 logged in.

Cluster 13:
Backup started at 2025-05-14 07:06:55.
Backup started at 2025-02-15 20:00:19.
  Backup ended at 2025-08-08 13:06:23.
Backup started at 2025-11-14 08:27:43.
Backup started at 2025-12-09 10:19:11.

Cluster 7:
Multiple bad login attempts detected on user 85...
Multiple login failures occurred on user 9052 a...
  User 

In [15]:
import re
# Ordered (compiled pattern, label) rules; the first rule that matches wins.
# Compiled once instead of rebuilding the pattern table on every call.
# Sentence-ending periods are escaped so they match a literal "." only.
_REGEX_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Label a log message using a fixed set of regular-expression rules.

    Rules are tried in order with a case-insensitive search anywhere in the
    message; the label of the first matching rule is returned.

    Args:
        log_message: The log text to classify. Non-string values (for
            example NaN or None from a missing cell) are treated as
            unclassifiable instead of raising.

    Returns:
        The matching label ("User Action" or "System Notification"), or
        None when no rule matches.
    """
    # Missing values reach this function as float NaN / None via Series.apply.
    if not isinstance(log_message, str):
        return None
    for pattern, label in _REGEX_RULES:
        if pattern.search(log_message):
            return label
    return None

In [16]:
# Label each message with the regex rules; rows with no matching rule get None.
df['regex_label'] = df.log_message.apply(classify_with_regex)

In [17]:
# Independent copy of the rows the regex rules could not label.
df_non_regex = df.loc[df.regex_label.isnull()].copy()

In [20]:
# Target labels with at most 5 examples among the rows the regex rules left unlabeled.
label_counts = df_non_regex['target_label'].value_counts()
rare_labels = label_counts[label_counts <= 5].index.tolist()
print(rare_labels)



In [22]:
# Drop rows whose source is LegacyCRM, then list the remaining sources.
df_non_legacy = df_non_regex.loc[df_non_regex['source'] != 'LegacyCRM']
df_non_legacy['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [23]:
# Embed only the messages that survived the regex and LegacyCRM filters.
filtered_embeddings = model.encode(df_non_legacy.log_message.tolist())
# Peek at the first two embedding vectors.
filtered_embeddings[:2]

array([[-1.02939717e-01,  3.35459895e-02, -2.20260434e-02,
         1.55100622e-03, -9.86912940e-03, -1.78956345e-01,
        -6.34409934e-02, -6.01761267e-02,  2.81108730e-02,
         5.99619709e-02, -1.72618106e-02,  1.43368833e-03,
        -1.49559975e-01,  3.15280259e-03, -5.66031225e-02,
         2.71685645e-02, -1.49890687e-02, -3.54037769e-02,
        -3.62936147e-02, -1.45410709e-02, -5.61491819e-03,
         8.75539333e-02,  4.55120951e-02,  2.50963438e-02,
         1.00188032e-02,  1.24267070e-02, -1.39923558e-01,
         7.68695921e-02,  3.14095505e-02, -4.15250845e-03,
         4.36903723e-02,  1.71250105e-02, -8.00951421e-02,
         5.74006326e-02,  1.89091861e-02,  8.55261832e-02,
         3.96398939e-02, -1.34371832e-01, -1.44363695e-03,
         3.06711602e-03,  1.76854104e-01,  4.44891676e-03,
        -1.69273838e-02,  2.24266760e-02, -4.35050540e-02,
         6.09031972e-03, -9.98173840e-03, -6.23971745e-02,
         1.07371574e-02, -6.04891405e-03, -7.14659989e-0

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
# Features are the message embeddings; targets are the labels of the same rows.
X, y = filtered_embeddings, df_non_legacy['target_label']

# Hold out 30% of the rows for evaluation; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=4, test_size=0.3
)

# Fit a logistic-regression classifier on the training split.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Per-class precision / recall / F1 on the held-out split.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

                precision    recall  f1-score   support

Critical Error       0.97      1.00      0.98        57
         Error       0.98      0.96      0.97        54
   HTTP Status       1.00      1.00      1.00       305
Resource Usage       1.00      1.00      1.00        55
Security Alert       1.00      0.99      0.99       100

      accuracy                           0.99       571
     macro avg       0.99      0.99      0.99       571
  weighted avg       0.99      0.99      0.99       571



In [27]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [28]:
import joblib
from pathlib import Path

# Destination for the trained classifier, relative to the working directory.
# NOTE(review): the recorded output of this cell shows 'models/log_classifier.joblib',
# so it was last run with a different path; confirm which location is intended.
MODEL_PATH = '../models/log_classifier.joblib'

# joblib.dump does not create missing directories, so make sure the parent exists.
Path(MODEL_PATH).parent.mkdir(parents=True, exist_ok=True)

# Persist the fitted classifier; the returned list of written files is the cell output.
joblib.dump(clf, MODEL_PATH)

['models/log_classifier.joblib']