# RQ1 Evaluation Notebook

Internal clustering metrics (silhouette, tree-structure, purpose entropy) computed under two distance functions:
- Semantic similarity (`distance_func`, which wraps UpdatedSimilarity.csv)
- Baseline (length-normalized Levenshtein distance)

In [1]:
# 1) Put the webapp/ directory at the front of the import path so its
#    packages can be imported directly.
# NOTE(review): the recorded output of this cell ends in ".../webapp/webapp",
# i.e. the kernel was started from inside webapp/ — confirm that this is the
# directory that is meant to be added.
import os
import sys

project_root = os.getcwd()
webapp_dir = os.path.join(project_root, 'webapp')
sys.path.insert(0, webapp_dir)
print('Added to PYTHONPATH:', sys.path[0])

Added to PYTHONPATH: /home/tferreira/Documents/Clustering/flexible-clustering/webapp/webapp


In [6]:
# 2) Imports
from clustering.clustering_algorithms import run_clustering, filtered_commands_global,fetch_cowrie_data
from clustering.preprocessing import abstract_command_line_substitution,is_real_command
from clustering.similarity import distance_func
from clustering.evaluation_metrics import (compute_distance_matrix, extract_labels_from_tree, compute_tree_metrics, compute_purpose_entropy)
from sklearn.metrics import silhouette_score
import pandas as pd, numpy as np

In [7]:
# 3) Run your clustering pipeline
# A single sample size shared by the clustering run and the re-fetch below, so
# the two calls cannot drift apart. Lower it for a faster run.
SAMPLE_SIZE = 10000
cluster_results, cluster_tree = run_clustering(size=SAMPLE_SIZE)

# Re-fetch the raw events and rebuild the abstracted command list that the
# metrics in the following cells are computed on.
# NOTE(review): this assumes run_clustering applies the same fetch / filter /
# abstraction steps internally, so that `abstracts` lines up with the leaves of
# `cluster_tree` — confirm against clustering_algorithms.
cowrie_raw = fetch_cowrie_data(
    honeypot_type="cowrie",
    from_date="2021-04-08T00:00:00.000Z",
    to_date="2025-04-08T00:00:00.000Z",
    size=SAMPLE_SIZE,
)
# Keep the raw frame untouched; `df` holds only rows that carry a command.
df = cowrie_raw[cowrie_raw['input'].notna()]
commands = df['input'].values
# (position, command) pairs for every entry accepted by is_real_command.
filtered_commands = [(i, cmd) for i, cmd in enumerate(commands) if is_real_command(cmd)]
abstracts = [abstract_command_line_substitution(cmd) for _, cmd in filtered_commands]
print(f'Clustered {len(abstracts)} commands.')

Clustered 9875 commands.


---
## A) Semantic silhouette via distance_func
The distances come from `distance_func`, which uses UpdatedSimilarity.csv under the hood.

In [8]:
# build distance matrix via distance_func
dist_fn = distance_func()

# The recorded run of this cell was interrupted before it finished: a full
# n×n matrix over every command is expensive. Repeated abstracts contribute
# repeated rows/columns, so the matrix is computed once over the unique
# abstracts and then expanded back to n×n by index lookup.
# NOTE(review): assumes dist_fn is deterministic and that
# compute_distance_matrix returns a dense array D with
# D[i, j] == dist_fn(items[i], items[j]) — confirm against evaluation_metrics.
unique_abstracts = list(dict.fromkeys(abstracts))  # first-seen order, no duplicates
slot_of = {abstract: slot for slot, abstract in enumerate(unique_abstracts)}
slots = np.fromiter((slot_of[a] for a in abstracts), dtype=np.intp, count=len(abstracts))
unique_dist = np.asarray(compute_distance_matrix(unique_abstracts, dist_fn))
dist_mat = unique_dist[np.ix_(slots, slots)]  # n×n
print('Distance matrix shape:', dist_mat.shape)

# extract labels
labels = extract_labels_from_tree(cluster_tree, len(abstracts))

# silhouette with precomputed distances
sil_sem = silhouette_score(dist_mat, labels, metric='precomputed')
print(f'Silhouette (semantic): {sil_sem:.4f}')

# other metrics
tree_m = compute_tree_metrics(cluster_tree)
entropy_m = compute_purpose_entropy(cluster_results)
print('Tree metrics:', tree_m)
print('Entropy metrics:', entropy_m)

KeyboardInterrupt: 

---
## B) Full pipeline via evaluate_clustering

In [None]:
# semantic metrics
# evaluate_clustering is not imported anywhere else in this notebook, so on a
# fresh kernel this cell would stop with a NameError.
# NOTE(review): assumed to live in clustering.evaluation_metrics next to the
# other metric helpers imported at the top — confirm the module.
from clustering.evaluation_metrics import evaluate_clustering

full_sem = evaluate_clustering(abstracts, cluster_results, cluster_tree, distance_func())
print('Semantic metrics (wrapped):')
print(pd.Series(full_sem))

In [None]:
# baseline Levenshtein metrics
!pip install python-Levenshtein
from Levenshtein import distance as lev_dist
baseline = lambda a,b: lev_dist(a,b)/max(len(a),len(b),1)
base_metrics = evaluate_clustering(abstracts, cluster_results, cluster_tree, baseline)
print('Baseline metrics:')
print(pd.Series(base_metrics))

---
## C) Compare results

In [None]:
# Side-by-side table: one row per distance function, one column per metric.
metric_rows = [full_sem, base_metrics]
row_labels = ['Semantic', 'Baseline']
df = pd.DataFrame(metric_rows, index=row_labels)
df

In [None]:
# Bar chart of the silhouette column only, one bar per distance function.
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(df.index, df['silhouette'])
ax.set_ylabel('Silhouette Score')
ax.set_title('Semantic vs Baseline')
plt.show()