# RQ1 Evaluation Notebook

This notebook computes internal clustering metrics (silhouette, tree-structure, entropy) under two distance measures:
- Semantic similarity (precomputed matrix)
- Baseline (Levenshtein)

In [1]:
# 1) Setup path so the `clustering` package that lives under webapp/ is importable
import sys, os


def find_webapp_dir(start):
    """Return the directory to put on ``sys.path`` for the ``clustering`` imports.

    The notebook may be launched either from the repository root (the package
    is then under ``<start>/webapp``) or from inside ``webapp/`` itself (the
    package is then directly under ``<start>``). Blindly appending ``'webapp'``
    in the second case yields a ``.../webapp/webapp`` entry.

    Parameters
    ----------
    start : str
        Directory to search from (normally the current working directory).

    Returns
    -------
    str
        The first of ``<start>/webapp`` and ``<start>`` that contains a
        ``clustering`` directory; ``<start>/webapp`` when neither does, which
        is the entry this cell always used before.
    """
    nested = os.path.join(start, 'webapp')
    for candidate in (nested, start):
        if os.path.isdir(os.path.join(candidate, 'clustering')):
            return candidate
    return nested


project_root = os.getcwd()
webapp_dir = find_webapp_dir(project_root)
# Re-running the cell must not pile up duplicate entries on sys.path.
if webapp_dir in sys.path:
    sys.path.remove(webapp_dir)
sys.path.insert(0, webapp_dir)
print('PYTHONPATH[0] =', sys.path[0])

PYTHONPATH[0] = /home/tferreira/Documents/Clustering/flexible-clustering/webapp/webapp


In [9]:
# 2) Imports
from clustering.clustering_algorithms import run_clustering, filtered_commands_global,fetch_cowrie_data
from clustering.preprocessing import abstract_command_line_substitution, is_real_command
from clustering.similarity import distance_func
from clustering.evaluation_metrics import (evaluate_clustering, extract_labels_from_tree, compute_tree_metrics, compute_purpose_entropy)
import pandas as pd, numpy as np

In [4]:
# 3) Cluster a sample of the data (lower `size` for a faster run)
clustering_output = run_clustering(size=10000)
cluster_results, cluster_tree = clustering_output

Clustered 0 commands into 15 cluster entries.


In [10]:
# Fetch the raw Cowrie events for the evaluation window.
raw_events = fetch_cowrie_data(
    honeypot_type="cowrie",
    from_date="2021-04-08T00:00:00.000Z",
    to_date="2025-04-08T00:00:00.000Z",
    size=10000,
)

# Keep only the rows that actually carry an `input` value.
has_input = raw_events['input'].notna()
df = raw_events[has_input]
commands = df['input'].values

# Pair every command accepted by is_real_command with its position in `commands`.
filtered_commands = [
    (position, line)
    for position, line in enumerate(commands)
    if is_real_command(line)
]

# Abstract each retained command line, preserving order.
abstracts = list(
    map(abstract_command_line_substitution, (line for _, line in filtered_commands))
)

In [11]:
# Report how many abstracted commands and cluster entries are in play.
n_commands, n_cluster_entries = len(abstracts), len(cluster_results)
print(f'Clustered {n_commands} commands into {n_cluster_entries} cluster entries.')

Clustered 9875 commands into 15 cluster entries.


---
## A) Semantic distance via matrix

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score
from clustering.evaluation_metrics import extract_labels_from_tree, compute_tree_metrics, compute_purpose_entropy

# Load the precomputed similarity matrix; the first CSV column becomes the row index.
# (adjust the path if the file lives elsewhere)
similarity_csv = "databases/UpdatedSimilarity.csv"
sim_df = pd.read_csv(similarity_csv, index_col=0)

# Turn similarity into distance: distance = 1 − similarity
similarity_values = sim_df.values
dist_mat = 1 - similarity_values

In [None]:
# Structure metrics from the cluster tree and entropy metrics from the cluster results
tree_metrics = compute_tree_metrics(cluster_tree)
entropy_metrics = compute_purpose_entropy(cluster_results)

# Both metric sets are computed first, then reported in the same order.
for heading, values in (
    ('Tree metrics:', tree_metrics),
    ('Entropy metrics:', entropy_metrics),
):
    print(heading, values)

---
## B) Full metrics via `evaluate_clustering` with a custom distance function

In [None]:
# Distance callback backed by the similarity table held in sim_df
def matrix_dist(a, b):
    """Return ``1.0`` minus the similarity stored in ``sim_df`` at row ``a``, column ``b``.

    NOTE(review): ``sim_df.at`` raises ``KeyError`` for labels missing from the
    table; confirm every abstract passed in is present in ``sim_df``.
    """
    similarity = sim_df.at[a, b]
    return 1.0 - similarity


full_sem = evaluate_clustering(abstracts, cluster_results, cluster_tree, matrix_dist)
print('Full semantic metrics:')
semantic_summary = pd.Series(full_sem)
print(semantic_summary)

In [None]:
# Baseline Levenshtein
!pip install python-Levenshtein
from Levenshtein import distance as lev_dist
def baseline(a,b): return lev_dist(a,b)/max(len(a),len(b),1)
baseline_metrics = evaluate_clustering(abstracts, cluster_results, cluster_tree, baseline)
print('Baseline metrics:')
print(pd.Series(baseline_metrics))

---
## C) Compare & Visualize

In [None]:
# Put both metric sets side by side, one row per distance measure
metric_rows = [full_sem, baseline_metrics]
row_labels = ['Semantic', 'Baseline']
df = pd.DataFrame(metric_rows, index=row_labels)
df

In [None]:
# Bar chart of the silhouette score for each distance measure
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(df.index, df['silhouette'])
ax.set_ylabel('Silhouette Score')
ax.set_title('Semantic vs Baseline')
plt.show()

In [24]:
import numpy as np

from clustering.similarity       import distance_func
from clustering.evaluation_metrics import compute_distance_matrix, extract_labels_from_tree
from sklearn.metrics             import silhouette_score

# 1) Get the semantic distance function that wraps your UpdatedSimilarity.csv
dist_fn = distance_func()

# 2) Build the n×n distance matrix on your instances.
#    The previous run of this cell over every abstract was interrupted before it
#    finished. Repeated abstracts produce identical rows, so the matrix is built
#    once over the distinct abstracts and then expanded by indexing.
#    NOTE(review): assumes abstracts are hashable (strings), that dist_fn depends
#    only on its two arguments, and that compute_distance_matrix returns a square
#    array in input order — confirm against clustering.evaluation_metrics.
unique_abstracts = list(dict.fromkeys(abstracts))  # distinct values, first-seen order
unique_position = {abstract: pos for pos, abstract in enumerate(unique_abstracts)}
unique_dist = np.asarray(compute_distance_matrix(unique_abstracts, dist_fn))
row_of = np.fromiter(
    (unique_position[abstract] for abstract in abstracts),
    dtype=np.intp,
    count=len(abstracts),
)
# dist_mat[i, j] is the distance between abstracts[i] and abstracts[j].
dist_mat = unique_dist[np.ix_(row_of, row_of)]

# 3) Extract the flat cluster labels for each instance
labels = extract_labels_from_tree(cluster_tree, len(abstracts))

# 4) Compute silhouette with your precomputed semantic distances
sil = silhouette_score(dist_mat, labels, metric="precomputed")
print(f"Silhouette (semantic): {sil:.4f}")


KeyboardInterrupt: 