In [1]:
import pandas as pd

# Load the labelled log dataset from a path relative to the working directory.
df = pd.read_csv('dataset/logs.csv')
# Preview the first rows to check the columns that were read.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [2]:
# List the distinct systems (values of the 'source' column) that produced the logs.
df['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [3]:
# List the distinct values of 'target_label', i.e. the classes to be predicted.
df['target_label'].unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [4]:
# Clustering log messages using DBSCAN, with a sentence transformer to generate the embeddings.

In [5]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

In [12]:
# Load the pre-trained SentenceTransformer model used to embed the log messages
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode every log message into an embedding vector (the column is passed as a plain Python list)
embeddings = model.encode(df['log_message'].tolist())

# Cluster the embeddings with DBSCAN using cosine distance and eps=0.2.
# NOTE(review): with min_samples=1 a point presumably never ends up as noise (-1),
# so singleton clusters are expected — confirm against the scikit-learn DBSCAN docs.
dbscan = DBSCAN(eps=0.2, min_samples=1, metric='cosine')
clusters = dbscan.fit_predict(embeddings)

# Store the cluster id of each row in a new 'cluster' column (this mutates df in place)
df['cluster'] = clusters

In [13]:
# Inspect the first two embedding vectors.
embeddings[0:2]

array([[-1.02939717e-01,  3.35459895e-02, -2.20260434e-02,
         1.55100622e-03, -9.86912940e-03, -1.78956345e-01,
        -6.34409934e-02, -6.01761267e-02,  2.81108730e-02,
         5.99619709e-02, -1.72618106e-02,  1.43368833e-03,
        -1.49559975e-01,  3.15280259e-03, -5.66031225e-02,
         2.71685645e-02, -1.49890687e-02, -3.54037769e-02,
        -3.62936147e-02, -1.45410709e-02, -5.61491819e-03,
         8.75539333e-02,  4.55120951e-02,  2.50963438e-02,
         1.00188032e-02,  1.24267070e-02, -1.39923558e-01,
         7.68695921e-02,  3.14095505e-02, -4.15250845e-03,
         4.36903723e-02,  1.71250105e-02, -8.00951421e-02,
         5.74006326e-02,  1.89091861e-02,  8.55261832e-02,
         3.96398939e-02, -1.34371832e-01, -1.44363695e-03,
         3.06711602e-03,  1.76854104e-01,  4.44891676e-03,
        -1.69273838e-02,  2.24266760e-02, -4.35050540e-02,
         6.09031972e-03, -9.98173840e-03, -6.23971745e-02,
         1.07371574e-02, -6.04891405e-03, -7.14659989e-0

In [14]:
# Preview the frame again to see the cluster id assigned to the first rows.
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,"nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" status: 200 len: 1893 time: 0.2675118",HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,"nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" HTTP status code - 200 len: 211 time: 0.0968180",HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,"nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 ""GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1"" RCODE 200 len: 1874 time: 0.2280791",HTTP Status,bert,0


In [15]:
# Show the first rows assigned to cluster 1 to see which messages were grouped together.
df[df.cluster == 1].head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
10,8/9/2025 18:58,ModernCRM,Email server encountered a sending fault,Error,bert,1
217,1/22/2025 5:45,BillingSystem,Mail service encountered a delivery glitch,Error,bert,1
248,5/2/2025 23:04,ModernHR,Service disruption caused by email sending error,Critical Error,bert,1
265,3/30/2025 23:53,ModernCRM,Email system had a problem sending emails,Error,bert,1


In [16]:
# Count records per cluster (value_counts orders the result by count, largest first by default),
# then print the first 5 log messages from every cluster that has at least 10 records.

cluster_counts = df['cluster'].value_counts()
large_clusters = cluster_counts[cluster_counts >= 10].index

for cluster in large_clusters:
    print(cluster)
    # .head() keeps the first 5 messages; to_string(index=False) drops the row index from the printout.
    print(df[df.cluster == cluster]['log_message'].head().to_string(index=False))
    print()

0
           nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
            nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
      nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb4

In [23]:
import re

# Ordered (compiled pattern, label) pairs, compiled once instead of on every call.
# NOTE(review): the unescaped "." in these patterns matches any character and
# search() matches anywhere in the message; both are kept as-is so existing
# classification results do not change.
_REGEX_PATTERNS = [
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User\d+ logged (in|out).", "User Action"),
        (r"Backup (started|ended) at.*", "System Notification"),
        (r"Backup completed successfully.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully.", "System Notification"),
        (r"Disk cleanup completed successfully.", "System Notification"),
        (r"System reboot initiated by the user.", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
]


def classify_with_regex(log_message):
    """Return the label of the first regex pattern found in ``log_message``.

    Patterns are tried in declaration order, case-insensitively, and may
    match anywhere in the message.

    Args:
        log_message: The raw log text. Non-string values (e.g. ``None`` or
            the ``NaN`` pandas uses for missing cells) are treated as
            unclassifiable instead of raising ``TypeError``.

    Returns:
        The matching label (``"User Action"`` or ``"System Notification"``),
        or ``None`` when no pattern matches.
    """
    # Missing cells in a DataFrame column arrive as float NaN, which
    # re.search() cannot handle.
    if not isinstance(log_message, str):
        return None

    for pattern, label in _REGEX_PATTERNS:
        if pattern.search(log_message):
            return label
    return None

In [24]:
# Sanity check: a login message should be labelled 'User Action'.
classify_with_regex("User User9934 logged in.")

'User Action'

In [28]:
# Sanity check: a file-upload message should be labelled 'System Notification'.
classify_with_regex("File test.csv uploaded successfully.")

'System Notification'

In [30]:
# Run the regex classifier on every log message; rows with no matching pattern get None.
df['regex_label'] = df['log_message'].apply(classify_with_regex)

In [39]:
# Keep only the rows the regex classifier could not label; .copy() makes an independent frame.
df_non_regex = df[df['regex_label'].isnull()].copy()
# Show (rows, columns) of the remaining frame.
df_non_regex.shape

(1958, 7)

In [40]:
# these are the logs that haven't been classified by regex. For these we will use BERT or LLM.

In [43]:
# Print the target labels that have 5 or fewer logs among the regex-unclassified rows.
# The per-label counts are computed once and reused for both the mask and the lookup.

label_counts = df_non_regex['target_label'].value_counts()
print(label_counts[label_counts <= 5].index.tolist())



In [44]:
# Both of the rare labels found above come from legacy systems; those logs will be classified with an LLM.
# Build a new DataFrame without the LegacyCRM rows. .copy() (as already done for df_non_regex)
# makes it an independent frame, so later column assignments cannot trigger SettingWithCopyWarning.

df_non_legacy = df_non_regex[df_non_regex['source'] != 'LegacyCRM'].copy()
df_non_legacy['source'].unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI'], dtype=object)

In [45]:
# we will use BERT for classification of these labels.

In [47]:
# Embed only the non-legacy, regex-unclassified messages; these vectors are the classifier features.
filtered_embeddings = model.encode(df_non_legacy['log_message'].tolist())
# Inspect the first two embedding vectors.
filtered_embeddings[0:2]

array([[-1.02939680e-01,  3.35459560e-02, -2.20260359e-02,
         1.55098806e-03, -9.86920204e-03, -1.78956300e-01,
        -6.34410605e-02, -6.01761676e-02,  2.81108692e-02,
         5.99619746e-02, -1.72618758e-02,  1.43365108e-03,
        -1.49560079e-01,  3.15280934e-03, -5.66031151e-02,
         2.71685608e-02, -1.49890641e-02, -3.54037397e-02,
        -3.62936519e-02, -1.45410905e-02, -5.61487628e-03,
         8.75538737e-02,  4.55120914e-02,  2.50963476e-02,
         1.00187594e-02,  1.24266604e-02, -1.39923587e-01,
         7.68696070e-02,  3.14095728e-02, -4.15252987e-03,
         4.36902978e-02,  1.71250496e-02, -8.00951049e-02,
         5.74006774e-02,  1.89091545e-02,  8.55262130e-02,
         3.96398902e-02, -1.34371817e-01, -1.44364324e-03,
         3.06707597e-03,  1.76854074e-01,  4.44891443e-03,
        -1.69274714e-02,  2.24267151e-02, -4.35050540e-02,
         6.09034253e-03, -9.98167880e-03, -6.23972230e-02,
         1.07371844e-02, -6.04893221e-03, -7.14660734e-0

In [48]:
# We will use these filtered embeddings as the features for predicting the target label, and train a Logistic Regression model on them.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Features are the sentence embeddings; the target is the labelled category.
X = filtered_embeddings
y = df_non_legacy['target_label']

# Hold out 30% of the rows for evaluation; random_state fixes the shuffle for reproducibility.
# NOTE(review): the split is not stratified although class sizes differ a lot — consider stratify=y.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# NOTE(review): max_iter is raised to 1000, presumably so the solver converges — confirm.
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
# Per-class precision / recall / F1 on the held-out rows.
report = classification_report(y_test, y_pred)
print(report)

                     precision    recall  f1-score   support

     Critical Error       0.95      0.98      0.97        43
              Error       1.00      0.95      0.98        42
        HTTP Status       1.00      1.00      1.00       298
     Resource Usage       1.00      1.00      1.00        57
     Security Alert       0.99      1.00      1.00       130
System Notification       1.00      1.00      1.00        16

           accuracy                           0.99       586
          macro avg       0.99      0.99      0.99       586
       weighted avg       0.99      0.99      0.99       586



In [51]:
import joblib

# Persist the trained classifier; joblib.dump returns the list of file names it wrote (the cell output).
# NOTE(review): assumes the ../models directory already exists — confirm before a fresh run.
joblib.dump(clf, '../models/log_classifier.joblib')

['../models/log_classifier.joblib']