In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA, IncrementalPCA
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN
from sklearn.metrics import davies_bouldin_score
from sklearn.preprocessing import StandardScaler
import re

In [2]:
data_minhashing=pd.read_csv("/Users/madalena/Desktop/minhashing_results_subset.csv")
df=data_minhashing
df.shape


(500000, 2)

In [3]:
df.columns

Index(['signature', 'rating'], dtype='object')

## CLUSTER

DBSCAN


In [4]:
# == 1. Convert stringified signatures into numeric arrays
if isinstance(df['signature'].iloc[0], str):
    df['signature'] = df['signature'].apply(lambda s: np.fromstring(s.strip('[]'), sep=' '))

# Build feature matrix
X = np.vstack(df['signature'].values)
print("Feature matrix shape:", X.shape)


Feature matrix shape: (500000, 100)


In [5]:
# === 2. Dimensionality reduction with PCA ===
pca = PCA(n_components=50, random_state=42)  # keep 90% of variance
X_reduced = pca.fit_transform(X)
print(f"PCA reduced shape: {X_reduced.shape}")
print(f"Explained variance ratio sum: {np.sum(pca.explained_variance_ratio_):.2f}")


PCA reduced shape: (500000, 50)
Explained variance ratio sum: 0.76


In [6]:
# 3 - IF I WANT UMAP
from umap import UMAP

umap_model = umap.UMAP(
    n_neighbors=30,      # controls local vs global structure
    min_dist=0.0,        # tighter clusters
    n_components=5,      # 2–10 recommended
    metric='euclidean',
    random_state=42
)

X_umap = umap_model.fit_transform(X_pca)

NameError: name 'umap' is not defined

In [None]:
# Scale PCA features  AFTER UMAP
from sklearn.preprocessing import StandardScaler
X_umap_scaled = StandardScaler().fit_transform(X_umap)

In [9]:
# 3 - scale PCA features WITHOUT UMAP
from sklearn.preprocessing import StandardScaler
X_scaled = StandardScaler().fit_transform(X_reduced) #(might do X_reduced)


In [39]:
#WITHOUT PRE CLUSTER
# To choose Eps:
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import numpy as np

# Use k=2 because the first neighbor is the point itself
nbrs = NearestNeighbors(n_neighbors=2).fit(X_reduced)
distances, indices = nbrs.kneighbors(X_reduced)

# Take distance to the nearest neighbor (excluding itself)
nearest_dist = np.sort(distances[:,1])  # skip distance to self
#plt.plot(nearest_dist)
#plt.ylabel("Distance to nearest neighbor")
#plt.xlabel("Points sorted by distance")
#plt.show()

In [40]:
eps = np.percentile(nearest_dist, 95)
print(eps)

2736197532.5903726


In [None]:
eps = 2e10 # <-- adjust after checking the curve
min_samples = 15

# --- 3. Run DBSCAN on the post PCA Data ---
dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
clusters = dbscan.fit_predict(X_umap_scaled)
print("DBSCAN cluster labels distribution:")
unique, counts = np.unique(clusters, return_counts=True)
print(dict(zip(unique, counts)))

: 

In [35]:
# WITH SUBSAMPLE FIRST
# pick e.g. 10,000 points for DBSCAN (adjust if needed)
subset_size = 10000
indices_sub = np.random.choice(len(X_scaled), size=subset_size, replace=False)

X_sub = X_scaled[indices_sub]
print("Feature matrix shape:", X_sub.shape)

Feature matrix shape: (10000, 100)


In [None]:
#WITH SUBSAMPLE FIRST
nbrs = NearestNeighbors(n_neighbors=2).fit(X_sub)
distances, _ = nbrs.kneighbors(X_sub)
nearest_dist = np.sort(distances[:,1])

# Pick eps as the 95th percentile
eps = np.percentile(nearest_dist, 95)
print("Estimated eps:", eps)

Estimated eps: 16.775164718596564


In [None]:
#To check what is going on with the code - Seems like its aone big cluster and without UMAP, DBSCAN wont work
for p in [60, 70, 80, 85, 90, 95]:
    eps_test = np.percentile(nearest_dist, p)
    db_test = DBSCAN(eps=eps_test, min_samples=20).fit_predict(X_sub)
    unique, counts = np.unique(db_test, return_counts=True)
    print(f"Percentile {p}, eps={eps_test:.4f}", dict(zip(unique, counts)))

Percentile 60, eps=5.5899 {np.int64(-1): np.int64(4030), np.int64(0): np.int64(5970)}
Percentile 70, eps=7.1167 {np.int64(-1): np.int64(3029), np.int64(0): np.int64(6971)}
Percentile 80, eps=9.5547 {np.int64(-1): np.int64(2023), np.int64(0): np.int64(7977)}
Percentile 85, eps=11.4382 {np.int64(-1): np.int64(1525), np.int64(0): np.int64(8475)}
Percentile 90, eps=13.5394 {np.int64(-1): np.int64(1021), np.int64(0): np.int64(8979)}
Percentile 95, eps=16.7752 {np.int64(-1): np.int64(510), np.int64(0): np.int64(9490)}


In [None]:
#Running DBSCAN in the subsample
min_samples = 20
db = DBSCAN(eps=eps, min_samples=min_samples, metric='euclidean')
labels_sub = db.fit_predict(X_sub)

# Print cluster distribution
unique, counts = np.unique(labels_sub, return_counts=True)
print("Cluster distribution on subset:", dict(zip(unique, counts)))

Cluster distribution on subset: {np.int64(-1): np.int64(1019), np.int64(0): np.int64(8981)}


In [7]:
import hdbscan
from sklearn.preprocessing import StandardScaler

In [10]:
# Run HDBSCAN
clusterer = hdbscan.HDBSCAN(
    min_cluster_size=50,  # smallest allowed cluster
    min_samples=20,       # density threshold, optional
    cluster_selection_method='eom'
)

labels = clusterer.fit_predict(X_scaled)



In [11]:


# Check cluster distribution
import numpy as np
unique, counts = np.unique(labels, return_counts=True)
print("Cluster distribution:", dict(zip(unique, counts)))

Cluster distribution: {np.int64(-1): np.int64(500000)}


## EVALUATION

In [None]:
#EVALUATION using DAVIES-BOULDIN INDEX
from sklearn.metrics import davies_bouldin_score
# --- 3. Evaluate with Davies-Bouldin Index ---
db = davies_bouldin_score(X, clusters)
print(f"Davies-Bouldin Index (DB): {db:.4f}")

In [None]:
#PLOT in two dimensions using PCA and color by cluster
#prepare data 
X = np.array(df['signature'].tolist())  # shape (n_samples, 100)
# --- 4. Optional: visualize clusters ---
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

plt.figure(figsize=(8,6))
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=clusters, cmap='tab10', alpha=0.7)
plt.title("K-Means clustering of MinHash signatures")
plt.xlabel("PCA 1")
plt.ylabel("PCA 2")
plt.colorbar(label="Cluster ID")
plt.show()