# RQ1 Evaluation Notebook

Internal clustering metrics (silhouette, tree-structure, entropy) computed with two distance functions:
- Semantic similarity (precomputed matrix)
- Baseline (Levenshtein)

In [None]:
# 1) Setup path to include webapp/
# webapp/ is resolved relative to the current working directory.
import sys, os
project_root = os.getcwd()
webapp_dir = os.path.join(project_root, 'webapp')
# Guard the insert so that re-running this cell does not stack duplicate entries.
if webapp_dir not in sys.path:
    sys.path.insert(0, webapp_dir)
print('PYTHONPATH[0] =', sys.path[0])

In [None]:
# 2) Imports
from clustering.clustering_algorithms import run_clustering, filtered_commands_global
from clustering.preprocessing import abstract_command_line_substitution
from clustering.similarity import distance_func
from clustering.evaluation_metrics import (evaluate_clustering, extract_labels_from_tree, compute_tree_metrics, compute_purpose_entropy)
import pandas as pd, numpy as np

In [None]:
# 3) Run clustering on sample
import clustering.clustering_algorithms as clustering_algorithms

cluster_results, cluster_tree = run_clustering(size=2000)  # adjust for speed
# 4) Build abstracts
# Read the command list through the module attribute *after* the run: a name bound
# with `from ... import filtered_commands_global` keeps pointing at the object that
# existed at import time, so it would be stale if run_clustering rebinds the global.
# NOTE(review): whether run_clustering rebinds or mutates in place is not visible
# from this notebook; the module-attribute lookup is correct in both cases.
abstracts = [
    abstract_command_line_substitution(cmd)
    for _, cmd in clustering_algorithms.filtered_commands_global
]
print(f'Clustered {len(abstracts)} commands into {len(cluster_results)} cluster entries.')

---
## A) Semantic distance via matrix

In [None]:
# Load precomputed similarity
from clustering.load_data import load_command_resources
_, sim_df, _, _ = load_command_resources()
# Align with abstracts
# Fail early with a readable message if an abstract has no row/column in the matrix
# (the .loc selection below would raise a KeyError for the same condition).
known_labels = set(sim_df.index) & set(sim_df.columns)
missing_labels = [a for a in dict.fromkeys(abstracts) if a not in known_labels]
if missing_labels:
    raise KeyError(
        f'{len(missing_labels)} abstracts are missing from the similarity matrix, '
        f'e.g. {missing_labels[:5]}'
    )
sim_df = sim_df.loc[abstracts, abstracts]
# Build distance matrix = 1 - similarity
dist_mat = 1.0 - sim_df.to_numpy(dtype=float)
# A precomputed distance matrix handed to scikit-learn needs non-negative entries
# and a zero diagonal; floating-point noise in the similarities can break both.
# dist_mat is a fresh array, so these in-place fixes leave sim_df untouched.
np.clip(dist_mat, 0.0, None, out=dist_mat)
np.fill_diagonal(dist_mat, 0.0)
print('Distance matrix shape:', dist_mat.shape)

In [None]:
# Silhouette score computed on the precomputed semantic distance matrix
from sklearn.metrics import silhouette_score

# Labels are extracted from the cluster tree; the second argument is the number of abstracts.
n_abstracts = len(abstracts)
labels = extract_labels_from_tree(cluster_tree, n_abstracts)

# metric='precomputed' tells scikit-learn that dist_mat already holds pairwise distances.
sil_sem = silhouette_score(
    dist_mat,
    labels,
    metric='precomputed',
)
print(f'Silhouette (semantic): {sil_sem:.4f}')

In [None]:
# Remaining internal metrics: tree-structure metrics and purpose entropy
tree_metrics = compute_tree_metrics(cluster_tree)
entropy_metrics = compute_purpose_entropy(cluster_results)

# Report both results, one line each
for heading, values in (
    ('Tree metrics:', tree_metrics),
    ('Entropy metrics:', entropy_metrics),
):
    print(heading, values)

---
## B) Using evaluate_clustering + dist_func

In [None]:
# Define dist_func from sim_df
# sim_df was re-indexed by `abstracts`, which may contain repeated labels. With a
# non-unique index `.at` returns a Series/DataFrame instead of a scalar, so look
# values up in a frame that keeps only the first occurrence of each label.
sim_lookup = sim_df.loc[~sim_df.index.duplicated(), ~sim_df.columns.duplicated()]


def matrix_dist(a, b):
    """Return the semantic distance between labels `a` and `b` as 1 - similarity."""
    return 1.0 - sim_lookup.at[a, b]


full_sem = evaluate_clustering(abstracts, cluster_results, cluster_tree, matrix_dist)
print('Full semantic metrics:')
print(pd.Series(full_sem))

In [None]:
# Baseline Levenshtein
!pip install python-Levenshtein
from Levenshtein import distance as lev_dist
def baseline(a,b): return lev_dist(a,b)/max(len(a),len(b),1)
baseline_metrics = evaluate_clustering(abstracts, cluster_results, cluster_tree, baseline)
print('Baseline metrics:')
print(pd.Series(baseline_metrics))

---
## C) Compare & Visualize

In [None]:
# Side-by-side table with one row per distance function
metric_rows = [full_sem, baseline_metrics]
row_labels = ['Semantic', 'Baseline']
df = pd.DataFrame(data=metric_rows, index=row_labels)
df

In [None]:
# Bar chart of the silhouette score for each row of the comparison table
import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(df.index, df['silhouette'])
ax.set_ylabel('Silhouette Score')
ax.set_title('Semantic vs Baseline')
plt.show()