In [13]:
# imports required
import os
from collections.abc import Iterable

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [14]:
# Sample dataset: eight short English sentences used as clustering input.
# They touch on a few unrelated topics (renewable energy, pets, finance) and
# are deliberately not grouped by topic in the list.
sentences = [
	"The new solar power plant has significantly reduced carbon emissions.",
	"Renewable energy sources are critical for a sustainable future.",
	"A cat chasing a laser pointer is a hilarious sight.",
	"My dog loves playing fetch with the tennis ball in the park.",
	"The latest financial reports show unexpected growth in the tech sector.",
	"Market trends indicate a strong recovery for digital services.",
	"The fluffy kittens curled up for a nap in the sunny window.",
	"Analyzing quarterly earnings data is a primary job function for an analyst.",
]

### Maximum number of clusters to test

As a rule of thumb, K is not tested above sqrt(N) or N/2, where N is the number of sentences.


In [15]:
# Lower bound on K: the Silhouette Score is undefined for fewer than 2 clusters.
MIN_K = 2
# Upper bound on K for the sample dataset: half the number of sentences
# (integer division).
MAX_K = len(sentences) // 2

In [16]:
def find_optimal_k(embeddings: np.ndarray, k_range: Iterable[int]) -> int:
	"""
	Pick the number of clusters (K) with the highest mean Silhouette Score.

	For every candidate K a KMeans model is fitted on ``embeddings`` and the
	resulting labels are scored with ``silhouette_score`` using the cosine
	metric.

	Args:
		embeddings: Array with one embedding vector per row.
		k_range: Candidate values of K. Any iterable of ints is accepted (a
			``range``, list or set). Candidates are tried in iteration order
			and, on a tie, the first one to reach the best score is kept.

	Returns:
		The candidate K with the highest Silhouette Score. If no candidate
		could be scored (empty ``k_range`` or every K rejected), ``MIN_K`` is
		returned as a fallback and a warning is printed instead of a score.
	"""
	# None means "nothing scored yet"; this avoids reporting a made-up score
	# when every candidate fails.
	best_score = None
	optimal_k = MIN_K

	for k in k_range:
		try:
			# Fixed seed and k-means++ initialisation keep runs reproducible.
			kmeans = KMeans(n_clusters=k, init="k-means++", n_init=10, random_state=42)
			cluster_labels = kmeans.fit_predict(embeddings)

			# Cosine distance is often better suited to sentence embeddings.
			score = silhouette_score(embeddings, cluster_labels, metric="cosine")
		except ValueError as e:
			# scikit-learn raises ValueError when K is not usable for the
			# given number of samples; skip that candidate and keep going.
			# Any other exception is a real bug and is allowed to propagate.
			print(f"Could not calculate score for K={k}: {e}")
			continue

		if best_score is None or score > best_score:
			best_score = score
			optimal_k = k

	if best_score is None:
		print(
			f"\n[WARN] No candidate K could be scored; falling back to K={optimal_k}."
		)
	else:
		print(
			f"\n[INFO] Optimal K found: {optimal_k} (Max Silhouette Score: {best_score:.4f})"
		)
	return optimal_k

In [17]:
def cluster_sentences(sentences: list[str]) -> None:
	"""
	Embed ``sentences``, choose K by Silhouette Score, cluster and print the groups.

	Steps: load the ``all-MiniLM-L6-v2`` SentenceTransformer model, encode the
	sentences, let ``find_optimal_k`` pick K, run a final KMeans fit with that
	K, and print the sentences grouped by cluster id.

	The range of K values tested is derived from the ``sentences`` argument
	itself (not from any module-level dataset): from ``MIN_K`` up to half the
	number of sentences, never exceeding ``len(sentences) - 1``.

	Args:
		sentences: The sentences to cluster.

	Returns:
		None. Progress and results are printed. If there are too few
		sentences to test any K, an error is printed and nothing else is done.
	"""
	n_sentences = len(sentences)

	# Validate before any expensive work (model load, encoding).
	# n // 2 is the rule-of-thumb ceiling; the Silhouette Score is only
	# defined for 2 <= K <= n - 1, so the ceiling is also capped at n - 1.
	max_k = min(max(MIN_K, n_sentences // 2), n_sentences - 1)
	if max_k < MIN_K:
		print(
			f"[ERROR] Not enough data to cluster (need at least {MIN_K + 1} samples)."
		)
		return

	# Silence the tokenizers parallelism warning. This must be set before the
	# SBERT model loads/uses the tokenizer.
	os.environ["TOKENIZERS_PARALLELISM"] = "false"

	print("[STEP 1/4] Loading SBERT model...")
	model = SentenceTransformer("all-MiniLM-L6-v2")

	print("[STEP 2/4] Generating sentence embeddings...")
	sentence_embeddings = model.encode(sentences)

	print("[STEP 3/4] Finding optimal K using Silhouette Score...")
	optimal_k = find_optimal_k(sentence_embeddings, range(MIN_K, max_k + 1))

	print(f"[STEP 4/4] Final clustering with K={optimal_k}...")
	# Same KMeans settings and seed as used during the search for K.
	final_kmeans = KMeans(
		n_clusters=optimal_k, init="k-means++", n_init=10, random_state=42
	)
	cluster_labels = final_kmeans.fit_predict(sentence_embeddings)

	# Group sentences by cluster id; int() turns the NumPy label into a plain
	# Python int so the dict keys sort and print as ordinary integers.
	clustered_sentences: dict[int, list[str]] = {}
	for sentence, label in zip(sentences, cluster_labels):
		clustered_sentences.setdefault(int(label), []).append(sentence)

	# Print the final results in a structured format
	print("\n=============================================")
	print(f"      Clustering Results (Optimal K = {optimal_k})")
	print("=============================================")

	for cluster_id, sentence_list in sorted(clustered_sentences.items()):
		print(f"\n--- Cluster {cluster_id} ({len(sentence_list)} Sentences) ---")
		for sentence in sentence_list:
			print(f"  - {sentence}")

In [18]:
# Run the full pipeline on the sample sentences; progress and the resulting
# clusters are printed to the cell output.
cluster_sentences(sentences)

[STEP 1/4] Loading SBERT model...
[STEP 2/4] Generating sentence embeddings...
[STEP 3/4] Finding optimal K using Silhouette Score...

[INFO] Optimal K found: 3 (Max Silhouette Score: 0.2754)
[STEP 4/4] Final clustering with K=3...

      Clustering Results (Optimal K = 3)

--- Cluster 0 (3 Sentences) ---
  - A cat chasing a laser pointer is a hilarious sight.
  - My dog loves playing fetch with the tennis ball in the park.
  - The fluffy kittens curled up for a nap in the sunny window.

--- Cluster 1 (3 Sentences) ---
  - The latest financial reports show unexpected growth in the tech sector.
  - Market trends indicate a strong recovery for digital services.
  - Analyzing quarterly earnings data is a primary job function for an analyst.

--- Cluster 2 (2 Sentences) ---
  - The new solar power plant has significantly reduced carbon emissions.
  - Renewable energy sources are critical for a sustainable future.
