In [2]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cluster import AgglomerativeClustering
from InstructorEmbedding import INSTRUCTOR
from sentence_transformers import SentenceTransformer

# FLP [88]
# LogSimilarity [52]
# LogCluster [95]
instructor_embedding = INSTRUCTOR('hkunlp/instructor-base')
SBERT = SentenceTransformer('all-mpnet-base-v2')
dataset = pd.read_csv('dataset/merged-manual-unique.csv')
# Custom distance metric function
log_dict = []
for ind in dataset.index:
    log_dict.append(['Represent the Drone Log message for clustering: ', dataset['message'][ind]])

corpus = dataset['message'].to_list()
def custom_distance_metric(log_messages, embedding_model='sbert', is_norm=True, metric='cosine'):
    # Replace this with your own logic to calculate the distance
    # This is just a placeholder example using TfidfVectorizer
    if embedding_model == 'sbert':
        corpus_embeddings = SBERT.encode(log_messages)
        print("SBERT shape: ", corpus_embeddings[0].shape)
    else:
        corpus_embeddings = instructor_embedding.encode(log_messages)
        print("instructor shape: ", corpus_embeddings[0].shape)
    # Normalize the embeddings to unit length
    if is_norm:
        corpus_embeddings = corpus_embeddings /  np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

    return pairwise_distances(corpus_embeddings, corpus_embeddings, metric=metric)


# Compute custom distance matrix
dist_matrix = custom_distance_metric(log_dict, 'sbert', True, 'cosine') # Instructor
# dist_matrix = custom_distance_metric(corpus, 'sbert', True, 'cosine') # Sentence Transformer

# Clustering
clustering_model = AgglomerativeClustering(n_clusters=None,  
                                  affinity='precomputed',
                                  linkage='average', 
                                  distance_threshold=0.05)
clustering_model.fit(dist_matrix)

# unlabeled dataset -> clustering -> initial_label -> human intervention / label correction -> labeled dataset

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


TypeError: INSTRUCTOR._load_sbert_model() got an unexpected keyword argument 'token'

In [1]:
cluster_assignment = clustering_model.labels_
pseudo_label = []
log_message = []
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
    if cluster_id not in clustered_sentences:
        clustered_sentences[cluster_id] = []

    clustered_sentences[cluster_id].append(corpus[sentence_id])
print(len(clustered_sentences))
for i, cluster in clustered_sentences.items():
    print("Cluster ", i+1)
    print(cluster)
    for element in cluster:
        pseudo_label.append(i+1)
        log_message.append(element)
    print("")

NameError: name 'clustering_model' is not defined

In [6]:
print(cluster_assignment)

[350 342 267 407 295  27 411 415 239  27 408 431  49  49 456 334 260 385
  21 377 351 245 266   6  52  52 388   6 289 371 303  28 399 343 418 392
 298  18  18 369  15  28 173 173 288 264 248 307  17  17  17  17  17  17
  15 319 285 270 451 275 417 309 365 296 306 455 439 429 256 313 325 459
 269 355 465 352 281  44 420  44  47  47  47 416 242 453 207 290 236 409
 305 410 243 443  50  50 251  12  12 446  12 315 357 393 457 396 403  46
 367 227 444 383 293 413 358 276 246 379 387  56 339 402 235 344 422 286
 406  14 322 441 327 368 353 345 433  38  38 405 467 370 304 241 378 184
 423 437 463 280 434 257 194  31 366 240 328 247 249 294 397 438 234 274
 183  76  76  91 346 279 324 347 321 238 466   7 122 301 278 172 432 287
 202 282 359 394  40  40 430 440  25  65  23 462  84 326 414  65  65  23
  62  84  65  62  33 216 221 219 268 160  33 114 114 316 337 412 464 215
 468 310  21  56  21 398  34  34 204  19 428  19 400 120 254   0   0   0
   0   0   0   5   0  13   5  13   5  13  13   5  1

In [7]:
# Evaluation
from sklearn.metrics.cluster import adjusted_mutual_info_score
from sklearn.metrics import fowlkes_mallows_score
testing = pd.read_excel('dataset/merged-manual-unique.xlsx')
testing['predicted_cluster'] = cluster_assignment
testing = testing[testing['cluster'].notna()]

ami_score_data = adjusted_mutual_info_score(testing['cluster'], testing['predicted_cluster'])
print(f"ami_score_data: {ami_score_data}")

# Assuming true_labels and predicted_labels are the ground truth and predicted cluster labels, respectively
fm_score = fowlkes_mallows_score(testing['cluster'], testing['predicted_cluster'])
print(f"fm_score: {fm_score}")


ami_score_data: 0.804308555605715
fm_score: 0.754514995423922


In [8]:
testing

Unnamed: 0,message,label,cluster,predicted_cluster
0,A passenger aircraft is approaching. Descend a...,4,40.0,350
1,A passenger aircraft is nearby. Fly with caution,2,41.0,342
2,Abnormal compass function or GPS signal detect...,3,42.0,267
3,Accelerator is Over Range,3,43.0,407
4,Account not logged in. Flight altitude and dis...,2,44.0,295
...,...,...,...,...
266,Downlink Restored (after 0m 2.2s).,1,39.0,68
267,Downlink Restored (after 0m 2.3s).,1,39.0,68
268,Downlink Restored (after 0m 2.9s).,1,39.0,68
269,Downlink Restored (after 0m 5.6s).,1,39.0,68


In [9]:
testing.to_excel('evaluation/instructor-xl-norm-cosine-005.xlsx', index=False)