In [9]:
# 1) Setup path to include webapp/ as a package
import sys, os
project_root = os.getcwd()
sys.path.insert(0, os.path.join(project_root, 'webapp'))
print('Added to PYTHONPATH:', sys.path[0])

Added to PYTHONPATH: /home/tferreira/Documents/Clustering/flexible-clustering/webapp/webapp


In [10]:
# 1. Make sure your project modules are on PYTHONPATH.
# If this notebook lives outside webapp/, adjust the path below:
# import sys; sys.path.append("/full/path/to/flexible-clustering/webapp")

from clustering.clustering_algorithms import fetch_cowrie_data
from clustering.preprocessing      import is_real_command, abstract_command_line_substitution
from clustering.similarity                     import distance_func
from fish.fishdbc                   import FISHDBC

import numpy as np
import pandas as pd

from sklearn.metrics import silhouette_score, silhouette_samples


In [12]:
# 2. Configure your data range and honeypot type.
honeypot_type = "cowrie"
from_date    = "2025-01-01T00:00:00"  # ISO format, inclusive
to_date      = "2025-07-17T23:59:59"  # up to yesterday

# 3. Fetch raw logs and filter/abstract
df = fetch_cowrie_data(honeypot_type, from_date, to_date, size=10000)
df = df[df['input'].notna()]

# Keep only “real” shell commands
filtered = [(i, cmd) for i, cmd in enumerate(df['input'].values)
            if is_real_command(cmd)]
indices, raw_cmds = zip(*filtered)
abstracts = [abstract_command_line_substitution(cmd) for cmd in raw_cmds]

print(f"Loaded {len(abstracts)} commands for clustering.")


Loaded 9958 commands for clustering.


In [13]:
# 4. Run FISHDBC on your abstracts
dist_fn = distance_func()          # your semantic distance
fish   = FISHDBC(dist_fn)
fish.update(abstracts)

# cluster() returns (labels, probs, stabilities, condensed_tree, slt, mst)
labels, *_ = fish.cluster()
labels = np.array(labels)
print(f"Cluster labels: {np.unique(labels)}")


Cluster labels: [-1  0  1  2  3  4  5  6  7  8  9 10 11]


In [14]:
labels

array([-1, 10,  7, ...,  6, -1,  4], shape=(9958,))

In [16]:
# 5. Build the pairwise distance matrix (O(n²) cost!)
n = len(abstracts)
D = np.zeros((n, n))

for i in range(n):
    for j in range(i+1, n):
        d = dist_fn(abstracts[i], abstracts[j])
        D[i, j] = D[j, i] = d

D = D - D.min()
# 6. Compute overall silhouette score
sil_avg = silhouette_score(D, labels, metric="precomputed")
print(f"Average silhouette score: {sil_avg:.3f}")


KeyboardInterrupt: 

In [24]:
S_df = pd.read_csv("databases/UpdatedSimilarity.csv", index_col=0)

In [27]:
from clustering.load_data import load_command_resources

# this gives you (… , similarity_matrix, …)
_, similarity_matrix, _, _ = load_command_resources()
dist_fn = distance_func()   # wraps geometric_distance(cmd1, cmd2, similarity_matrix)


In [28]:
import pandas as pd

# load the token–token matrix
S_tokens = pd.read_csv("databases/UpdatedSimilarity.csv", index_col=0)

# convert to nested dict[str,dict[str,float]]
similarity_matrix = {
    token: S_tokens.loc[token].to_dict()
    for token in S_tokens.index
}

# now recreate your dist_fn
from clustering.similarity import geometric_distance
def dist_fn(x, y):
    return geometric_distance(x, y, similarity_matrix)


In [29]:
import numpy as np
from sklearn.metrics import pairwise_distances, silhouette_score

# abstracts is your list of N=9958 abstracted commands, in the same order as labels
X = np.array(abstracts)[:, None]   # shape (N,1), each entry is a string

# compute the N×N distance matrix in parallel
D = pairwise_distances(
    X,
    metric=lambda a, b: dist_fn(a[0], b[0]),
    n_jobs=-1
)

# sanity: zero out the diagonal
np.fill_diagonal(D, 0)


KeyboardInterrupt: 

In [31]:
import numpy as np
from sklearn.metrics import silhouette_score

# Number of samples
N = len(abstracts)

# 1) Build a numeric “feature” array of shape (N,1) whose only feature is the sample index
X_idx = np.arange(N, dtype=int).reshape(-1, 1)

# 2) Define a metric that maps back to your command strings
def idx_dist(a, b):
    # a and b are 1‑d numpy arrays of length 1, dtype float
    i = int(a[0])
    j = int(b[0])
    return dist_fn(abstracts[i], abstracts[j])

# 3) Call silhouette_score without precomputing the full D
sil = silhouette_score(
    X_idx,
    labels,
    metric=idx_dist,
    sample_size=1000,    # sample up to 1000 points to speed things up
    random_state=42,
    n_jobs=-1
)

print(f"Approximate silhouette (1 000 samples): {sil:.3f}")


Approximate silhouette (1 000 samples): 0.502


In [None]:
import numpy as np
from sklearn.metrics import silhouette_score
from clustering.similarity import distance_func
from fish.fishdbc         import FISHDBC

# -----------------------------------------------------------------------------
# 1) Prepare the “index” feature array and the idx→command metric
# -----------------------------------------------------------------------------
N = len(abstracts)
X_idx = np.arange(N, dtype=int).reshape(-1, 1)

def idx_dist(a, b):
    i = int(a[0])
    j = int(b[0])
    return dist_fn(abstracts[i], abstracts[j])

# -----------------------------------------------------------------------------
# 2) Sweep FISHDBC parameters and record the best silhouette
# -----------------------------------------------------------------------------
best = {
    'min_samples': None,
    'min_cluster_size': None,
    'method': None,
    'sil_score': -1.0
}

for min_s in [3, 5, 10, 20]:
    # you can also try different m, ef here:
    fish = FISHDBC(distance_func(),
                   min_samples=min_s,
                   m=5, ef=50,
                   vectorized=False)
    # build the index+MST once per setting
    fish.update(abstracts)
    
    for min_cs in [min_s, max(min_s*2, 1), max(min_s*5, 1)]:
        for method in ['eom', 'leaf']:
            labels, *_ = fish.cluster(
                min_cluster_size=min_cs,
                cluster_selection_method=method,
            )
            
            # compute (sampled) silhouette
            sil = silhouette_score(
                X_idx,
                labels,
                metric=idx_dist,
                sample_size=1000,
                random_state=42,
                n_jobs=-1
            )
            
            print(f"min_samples={min_s:2d}, min_cluster_size={min_cs:2d}, "
                  f"method={method:5s} → silhouette={sil:.3f}")
            
            if sil > best['sil_score']:
                best.update({
                    'min_samples':      min_s,
                    'min_cluster_size': min_cs,
                    'method':           method,
                    'sil_score':        sil
                })

print("\n🏆 Best setting:", best)


min_samples= 3, min_cluster_size= 3, method=eom   → silhouette=0.495


In [None]:
import numpy as np
from sklearn.metrics import silhouette_score

# Number of samples
N = len(abstracts)

# 1) Build a numeric “feature” array of shape (N,1) whose only feature is the sample index
X_idx = np.arange(N, dtype=int).reshape(-1, 1)

# 2) Define a metric that maps back to your command strings
def idx_dist(a, b):
    # a and b are 1‑d numpy arrays of length 1, dtype float
    i = int(a[0])
    j = int(b[0])
    return dist_fn(abstracts[i], abstracts[j])

# 3) Call silhouette_score without precomputing the full D
sil = silhouette_score(
    X_idx,
    labels,
    metric=idx_dist,
    random_state=42,
    n_jobs=-1
)

print(f"Approximate silhouette all samples :) : {sil:.3f}")


In [25]:
sil = silhouette_score(D_clean, labels_clean, metric="precomputed")
print(f"Silhouette = {sil:.3f}")


IndexError: index 178 is out of bounds for axis 0 with size 178

In [18]:
D

array([[2.22044605e-16, 1.00000000e+00, 1.00000000e+00, ...,
        1.00000000e+00, 1.00000000e+00, 1.00000000e+00],
       [1.00000000e+00, 2.22044605e-16, 9.99999999e-01, ...,
        9.99999999e-01, 9.99999999e-01, 9.99999999e-01],
       [1.00000000e+00, 9.99999999e-01, 2.22044605e-16, ...,
        9.99999999e-01, 9.99999999e-01, 9.99999999e-01],
       ...,
       [1.00000000e+00, 9.99999999e-01, 9.99999999e-01, ...,
        2.22044605e-16, 2.22044605e-16, 2.22044605e-16],
       [1.00000000e+00, 9.99999999e-01, 9.99999999e-01, ...,
        2.22044605e-16, 2.22044605e-16, 2.22044605e-16],
       [1.00000000e+00, 9.99999999e-01, 9.99999999e-01, ...,
        2.22044605e-16, 2.22044605e-16, 2.22044605e-16]],
      shape=(9958, 9958))

In [19]:
# 7. Per‐sample silhouette and plot
sil_vals = silhouette_samples(D, labels, metric="precomputed")

import matplotlib.pyplot as plt

fig, ax = plt.subplots(figsize=(8, 5))
y_lower = 10

for cluster_id in np.unique(labels):
    ith_vals = np.sort(sil_vals[labels == cluster_id])
    size    = ith_vals.shape[0]
    y_upper = y_lower + size

    ax.fill_betweenx(np.arange(y_lower, y_upper), 0, ith_vals, alpha=0.7)
    ax.text(-0.05, y_lower + 0.5 * size, str(cluster_id))
    y_lower = y_upper + 10

ax.axvline(x=sil_avg, color='red', linestyle='--')
ax.set_title("Silhouette Plot for FISHDBC Clusters")
ax.set_xlabel("Silhouette Coefficient")
ax.set_ylabel("Cluster Label")
ax.set_yticks([])
ax.set_xlim([-0.1, 1])
plt.show()


ModuleNotFoundError: No module named 'matplotlib'