In [6]:
import pandas as pd

In [7]:
# Read the labelled log dataset (path is relative to the notebook) and preview it.
logs_csv_path = "../dataset/logs.csv"
df = pd.read_csv(logs_csv_path)
df.head()

Unnamed: 0,timestamp,source,log_message,target_label
0,27-06-2025 7.20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert
3,12-07-2025 0.24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status
4,02-06-2025 18.25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status


In [8]:
# Distinct values of the `source` column, in order of first appearance.
print(df["source"].unique())

['ModernCRM' 'AnalyticsEngine' 'ModernHR' 'BillingSystem' 'ThirdPartyAPI'
 'LegacyCRM']


In [9]:
# Distinct values of the `target_label` column, in order of first appearance.
print(df["target_label"].unique())

['HTTP Status' 'Critical Error' 'Security Alert' 'Error'
 'System Notification' 'Resource Usage' 'User Action' 'Workflow Error'


In [11]:
# First rows whose target_label is "System Notification".
is_system_notification = df["target_label"] == "System Notification"
df.loc[is_system_notification].head()

Unnamed: 0,timestamp,source,log_message,target_label
7,10-11-2025 8.44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification
14,01-04-2025 1.43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification
15,05-01-2025 9.41,ModernCRM,Backup completed successfully.,System Notification
30,4/26/2025 7:54,AnalyticsEngine,Backup started at 2025-05-14 07:06:55.,System Notification
36,11/19/2025 13:14,BillingSystem,System reboot initiated by user User243.,System Notification


In [12]:
# Rows whose message begins with the fixed "System reboot initiated by user" prefix.
df.loc[df["log_message"].str.startswith("System reboot initiated by user")]

Unnamed: 0,timestamp,source,log_message,target_label
36,11/19/2025 13:14,BillingSystem,System reboot initiated by user User243.,System Notification
92,12-04-2025 21.20,BillingSystem,System reboot initiated by user User471.,System Notification
139,05-08-2025 16.34,ModernHR,System reboot initiated by user User216.,System Notification
140,09-11-2025 8.49,AnalyticsEngine,System reboot initiated by user User639.,System Notification
161,3/31/2025 19:40,BillingSystem,System reboot initiated by user User819.,System Notification
163,06-06-2025 15.29,BillingSystem,System reboot initiated by user User938.,System Notification
307,04-12-2025 0.41,BillingSystem,System reboot initiated by user User929.,System Notification
365,10/20/2025 22:32,ModernHR,System reboot initiated by user User533.,System Notification
508,4/15/2025 2:04,ThirdPartyAPI,System reboot initiated by user User591.,System Notification
552,9/22/2025 20:54,ModernHR,System reboot initiated by user User421.,System Notification


In [13]:
# Regex pattern
# Raw string: "\d" must reach the regex engine untouched (in a normal string it is an
# invalid escape sequence). The final "\." matches only a literal period, not any character.
regex_pattern = r"User User\d+ logged (in|out)\."
df[df['log_message'].str.match(regex_pattern)]

Unnamed: 0,timestamp,source,log_message,target_label
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action
57,9/14/2025 3:03,AnalyticsEngine,User User395 logged in.,User Action
85,3/13/2025 2:11,ModernHR,User User225 logged in.,User Action
88,03-08-2025 19.04,AnalyticsEngine,User User494 logged out.,User Action
126,11/22/2025 21:09,ThirdPartyAPI,User User900 logged in.,User Action
...,...,...,...,...
2207,10-04-2025 8.06,ModernCRM,User User495 logged in.,User Action
2263,2/27/2025 14:40,AnalyticsEngine,User User429 logged out.,User Action
2275,3/13/2025 17:17,AnalyticsEngine,User User755 logged out.,User Action
2323,12-01-2025 18.17,ThirdPartyAPI,User User882 logged out.,User Action


In [14]:
# Messages that start with "Backup started at ..." or "Backup ended at ...".
regex_pattern = "Backup (started|ended) at .*"
is_backup_message = df["log_message"].str.match(regex_pattern)
df.loc[is_backup_message]

Unnamed: 0,timestamp,source,log_message,target_label
30,4/26/2025 7:54,AnalyticsEngine,Backup started at 2025-05-14 07:06:55.,System Notification
44,6/20/2025 0:53,BillingSystem,Backup started at 2025-02-15 20:00:19.,System Notification
108,03-04-2025 21.15,BillingSystem,Backup ended at 2025-08-08 13:06:23.,System Notification
128,11/25/2025 16:45,BillingSystem,Backup started at 2025-11-14 08:27:43.,System Notification
191,05-06-2025 23.58,ThirdPartyAPI,Backup started at 2025-12-09 10:19:11.,System Notification
...,...,...,...,...
2304,11-10-2025 9.39,ModernHR,Backup started at 2025-09-08 22:36:53.,System Notification
2341,1/27/2025 22:19,ThirdPartyAPI,Backup started at 2025-05-13 09:01:52.,System Notification
2346,8/26/2025 10:04,ThirdPartyAPI,Backup ended at 2025-05-15 10:56:20.,System Notification
2363,10/26/2025 23:44,ModernCRM,Backup ended at 2025-09-19 08:52:02.,System Notification


## Clustering messages based on the log_message

In [15]:
from sklearn.cluster import DBSCAN
from sentence_transformers import SentenceTransformer

In [16]:
# Load the sentence-embedding model and embed every log message in the dataset.
model = SentenceTransformer('all-MiniLM-L6-v2')
all_log_messages = df['log_message'].tolist()
embeddings = model.encode(all_log_messages)
# Peek at the first five embedding vectors.
print(embeddings[:5])

[[-0.10293962  0.03354594 -0.02202607 ...  0.00457793 -0.04259717
   0.00322621]
 [ 0.00804572 -0.03573923  0.04938739 ...  0.01538319 -0.06230947
  -0.02774666]
 [-0.00908224  0.13003924 -0.05275568 ...  0.02014104 -0.05117098
  -0.02930294]
 [-0.09751046  0.04911299 -0.03977424 ...  0.02477502 -0.03546079
  -0.00018598]
 [-0.10468338  0.05926038 -0.02488499 ...  0.02502055 -0.037193
  -0.0256891 ]]


In [22]:
# Cluster the message embeddings with DBSCAN using cosine distance, then store the
# resulting label of each message in a new `cluster` column on df.
clustering = DBSCAN(eps=0.2, min_samples=1, metric="cosine")
clustering.fit(embeddings)
df['cluster'] = clustering.labels_

In [23]:
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,cluster
0,27-06-2025 7.20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2
3,12-07-2025 0.24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0
4,02-06-2025 18.25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0


In [24]:
# How many distinct cluster labels were assigned.
len(df["cluster"].unique())

136

In [25]:
# Gather each cluster's messages into a list, then order clusters from largest to smallest.
clusters = df.groupby('cluster')['log_message'].apply(list)
sorted_clusters = clusters.sort_values(
    key=lambda message_lists: message_lists.map(len),
    ascending=False,
)

In [27]:
len(sorted_clusters)

136

In [28]:
# Show up to five sample messages for every cluster that has more than 10 members.
print("Clustered Patterns:")
for cluster_id, messages in sorted_clusters.items():
    if len(messages) <= 10:
        continue
    print(f"Cluster {cluster_id}:")
    for sample in messages[:5]:
        print(f"  {sample}")

Clustered Patterns:
Cluster 0:
  nova.osapi_compute.wsgi.server [req-b9718cd8-f65e-49cc-8349-6cf7122af137 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" status: 200 len: 1893 time: 0.2675118
  nova.osapi_compute.wsgi.server [req-4895c258-b2f8-488f-a2a3-4fae63982e48 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" HTTP status code -  200 len: 211 time: 0.0968180
  nova.osapi_compute.wsgi.server [req-ee8bc8ba-9265-4280-9215-dbe000a41209 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2/54fadb412c4e40cdbaed9335e4c35a9e/servers/detail HTTP/1.1" RCODE  200 len: 1874 time: 0.2280791
  nova.osapi_compute.wsgi.server [req-f0bffbc3-5ab0-4916-91c1-0a61dd7d4ec2 113d3a99c3da401fbd62cc2caa5b96d2 54fadb412c4e40cdbaed9335e4c35a9e - - -] 10.11.10.1 "GET /v2

## Classification Stage 1: Regex

In [44]:
import re
# (pattern, label) rules, compiled once instead of on every call.
# Rules are tried in order and the first one that matches wins.
# Sentence-ending periods are escaped so they match a literal "." only.
_REGEX_LABEL_RULES = tuple(
    (re.compile(pattern, re.IGNORECASE), label)
    for pattern, label in (
        (r"User User\d+ logged (in|out)\.", "User Action"),
        (r"Backup (started|ended) at .*", "System Notification"),
        (r"Backup completed successfully\.", "System Notification"),
        (r"System updated to version .*", "System Notification"),
        (r"File .* uploaded successfully by user .*", "System Notification"),
        (r"Disk cleanup completed successfully\.", "System Notification"),
        (r"System reboot initiated by user .*", "System Notification"),
        (r"Account with ID .* created by .*", "User Action"),
    )
)


def classify_with_regex(log_message):
    """Label a log message using a fixed set of regular-expression rules.

    Matching is case-insensitive and unanchored (the pattern may occur anywhere
    in the message).

    Args:
        log_message: The raw log text. Non-string values (e.g. None or NaN from
            a DataFrame column) are treated as "no match".

    Returns:
        The label of the first matching rule ("User Action" or
        "System Notification"), or None when no rule matches.
    """
    # Missing values from pandas arrive as None/float NaN; re would raise TypeError on them.
    if not isinstance(log_message, str):
        return None

    for compiled_pattern, label in _REGEX_LABEL_RULES:
        if compiled_pattern.search(log_message):
            return label
    return None

In [48]:
print(classify_with_regex("User User193 logged in."))

User Action


In [47]:
print(classify_with_regex("User User923 logged IN."))

User Action


In [49]:
classify_with_regex("System reboot initiated by user User11299.")

'System Notification'

In [50]:
classify_with_regex("Testing log")

### Apply regex classification

In [51]:
# Run the regex classifier over every message; rows without a matching rule get None.
df['regex_label'] = df['log_message'].apply(classify_with_regex)

In [52]:
# Rows that the regex stage managed to label.
has_regex_label = df['regex_label'].notnull()
df.loc[has_regex_label]

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
7,10-11-2025 8.44,ModernHR,File data_6169.csv uploaded successfully by us...,System Notification,4,System Notification
14,01-04-2025 1.43,ThirdPartyAPI,File data_3847.csv uploaded successfully by us...,System Notification,4,System Notification
15,05-01-2025 9.41,ModernCRM,Backup completed successfully.,System Notification,8,System Notification
18,2/22/2025 17:49,ModernCRM,Account with ID 5351 created by User634.,User Action,9,User Action
27,9/24/2025 19:57,ThirdPartyAPI,User User685 logged out.,User Action,11,User Action
...,...,...,...,...,...,...
2376,6/27/2025 8:47,ModernCRM,System updated to version 2.0.5.,System Notification,21,System Notification
2381,09-05-2025 6.39,ThirdPartyAPI,Disk cleanup completed successfully.,System Notification,32,System Notification
2394,04-03-2025 13.13,ModernHR,Disk cleanup completed successfully.,System Notification,32,System Notification
2395,05-02-2025 14.29,ThirdPartyAPI,Backup ended at 2025-05-06 11:23:16.,System Notification,13,System Notification


In [53]:
# (rows, columns) of the regex-labelled subset.
df.loc[df['regex_label'].notnull()].shape

(500, 6)

## Classification Stage 2: Classification Using Embeddings (Logistic Regression)

In [54]:
# Rows the regex stage could not label; copied so later changes do not touch df.
regex_missed = df['regex_label'].isnull()
df_non_regex = df.loc[regex_missed].copy()
df_non_regex.shape

(1910, 6)

In [55]:
# Unlabelled rows that come from the LegacyCRM source.
df_legacy = df_non_regex.loc[df_non_regex["source"] == "LegacyCRM"]
df_legacy

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
60,06-10-2025 16.55,LegacyCRM,Lead conversion failed for prospect ID 7842 du...,Workflow Error,24,
255,03-05-2025 16.55,LegacyCRM,API endpoint 'getCustomerDetails' is deprecate...,Deprecation Warning,48,
377,24-06-2025 12.16,LegacyCRM,Customer follow-up process for lead ID 5621 fa...,Workflow Error,62,
1325,17-04-2025 7.33,LegacyCRM,Escalation rule execution failed for ticket ID...,Workflow Error,105,
1734,30-04-2025 7.47,LegacyCRM,The 'ExportToCSV' feature is outdated. Please ...,Deprecation Warning,118,
1826,23-01-2025 10.33,LegacyCRM,Support for legacy authentication methods will...,Deprecation Warning,122,
2217,12-05-2025 9.46,LegacyCRM,Task assignment for TeamID 3425 could not comp...,Workflow Error,133,


In [56]:
# Unlabelled rows from every source other than LegacyCRM.
df_non_legacy = df_non_regex.loc[df_non_regex["source"] != "LegacyCRM"]
df_non_legacy

Unnamed: 0,timestamp,source,log_message,target_label,cluster,regex_label
0,27-06-2025 7.20,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,0,
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,1,
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,2,
3,12-07-2025 0.24,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,0,
4,02-06-2025 18.25,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,0,
...,...,...,...,...,...,...
2405,13-08-2025 7.29,ModernHR,nova.osapi_compute.wsgi.server [req-96c3ec98-2...,HTTP Status,0,
2406,01-11-2025 5.32,ModernHR,User 3844 account experienced multiple failed ...,Security Alert,7,
2407,03-08-2025 3.07,ThirdPartyAPI,nova.metadata.wsgi.server [req-b6d4a270-accb-4...,HTTP Status,0,
2408,11-11-2025 11.52,BillingSystem,Email service affected by failed transmission,Critical Error,1,


In [57]:
# Row counts of the two subsets.
print(f"Records in legacy df is {len(df_legacy)}.")
print(f"Records in non legacy df is {len(df_non_legacy)}.")

Records in legacy df is 7.
Records in non legacy df is 1903.


In [58]:
# Embed the messages of the non-legacy subset and preview the first five vectors.
non_legacy_messages = df_non_legacy["log_message"].tolist()
non_legacy_embeddings = model.encode(non_legacy_messages)
non_legacy_embeddings[:5]

array([[-0.10293962,  0.03354594, -0.02202607, ...,  0.00457793,
        -0.04259717,  0.00322621],
       [ 0.00804572, -0.03573923,  0.04938739, ...,  0.01538319,
        -0.06230947, -0.02774666],
       [-0.00908224,  0.13003924, -0.05275568, ...,  0.02014104,
        -0.05117098, -0.02930294],
       [-0.09751046,  0.04911299, -0.03977424, ...,  0.02477502,
        -0.03546079, -0.00018598],
       [-0.10468338,  0.05926038, -0.02488499, ...,  0.02502055,
        -0.037193  , -0.0256891 ]], shape=(5, 384), dtype=float32)

In [59]:
len(non_legacy_embeddings)

1903

In [60]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Features are the message embeddings; targets are the labels from the dataset.
X = non_legacy_embeddings
y = df_non_legacy['target_label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=21,
)

# Fit the classifier on the training split and predict the held-out rows.
clf = LogisticRegression(max_iter=150)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Per-class precision/recall/F1 followed by overall accuracy.
print(classification_report(y_test, y_pred))
print(accuracy_score(y_test, y_pred))

                precision    recall  f1-score   support

Critical Error       0.98      1.00      0.99        44
         Error       1.00      0.98      0.99        44
   HTTP Status       1.00      1.00      1.00       311
Resource Usage       1.00      1.00      1.00        52
Security Alert       1.00      1.00      1.00       120

      accuracy                           1.00       571
     macro avg       1.00      1.00      1.00       571
  weighted avg       1.00      1.00      1.00       571

0.9982486865148862


In [62]:
# Saving the model
import joblib
# Serialize the fitted classifier `clf` to a path relative to the notebook.
# NOTE(review): the ../models directory presumably has to exist beforehand — confirm.
joblib.dump(clf, "../models/logs_classifier.pkl")

['../models/logs_classifier.pkl']

In [67]:
from dotenv import load_dotenv, find_dotenv
from langchain_groq import ChatGroq
from langchain_core.prompts import ChatPromptTemplate

In [77]:
# Load variables from the .env file located by find_dotenv(); the return value is discarded.
# NOTE(review): presumably this supplies the API key ChatGroq needs — confirm.
_ = load_dotenv(find_dotenv())
def classify_with_llm(log_msg):
        """Ask a Groq-hosted chat model to categorise one log message.

        The system prompt offers the categories "Workflow Error" and
        "Deprecation Warning", with "Unclassified" as the fallback, and asks
        for the answer to be wrapped in <category> </category> tags.

        Args:
            log_msg: The log message text, substituted into the human turn.

        Returns:
            The model's reply content exactly as received; pulling the category
            out of the tags is left to the caller.
        """
        groq_chat = ChatGroq(model="deepseek-r1-distill-qwen-32b")

        # The system prompt literal is kept verbatim, including its line-continuation whitespace.
        prompt_messages = [
                ("system", "Classify the log message into one of these categories: \
                           (1) Workflow Error, (2) Deprecation Warning.\
                           If you can't figure out a category, use 'Unclassified'.\
                           Put the category inside <category> </category> tags."),
                ("human", "{log_message}")
        ]
        template = ChatPromptTemplate.from_messages(prompt_messages)

        # Pipe the prompt into the model and run it with the message filled in.
        reply = (template | groq_chat).invoke({"log_message": log_msg})
        return reply.content

In [79]:
response = classify_with_llm("The 'ReportGenerator' module will be retired in version 4.0. Please migrate to the 'AdvancedAnalyticsSuite' by Dec 2025")

In [83]:
# Locate the <category>...</category> span in the reply; DOTALL lets it span multiple lines.
category_tag = re.compile(r'<category>(.*?)</category>', flags=re.DOTALL)
match = category_tag.search(response)
print(match)



In [84]:
# Text between the category tags, or None when the reply contained no such tags
# (re.search returns None in that case, so calling .group on it would raise AttributeError).
match.group(1) if match else None

