In [4]:
!pip install -U datasets pandas "numpy<2" matplotlib seaborn sentence-transformers

Defaulting to user installation because normal site-packages is not writeable


Collecting pandas
  Downloading pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl.metadata (89 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (11 kB)
Collecting python-dateutil>=2.8.2 (from pandas)
  Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl.metadata (5.4 kB)
Downloading pandas-2.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl (66.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.5/66.5 MB[0m [31m367.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading matplotlib-3.10.1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━

In [None]:
from datasets import load_dataset
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
import matplotlib.pyplot as plt
import umap
import faiss
from functools import partial

# Function to generate embeddings in parallel
def embed_text(examples, model_name='all-MiniLM-L6-v2', text_column='question', device='cuda'):
    """Embed one batch of texts with a sentence-transformers model.

    Intended to be used as a batched ``datasets.map`` callback.

    Args:
        examples: Mapping of column name -> list of values for the batch.
        model_name: Name of the SentenceTransformer model to load.
        text_column: Key in ``examples`` that holds the sentences to embed.
        device: Device string handed to ``SentenceTransformer``.

    Returns:
        ``{'embeddings': ...}`` where the value is whatever ``model.encode``
        returns for the batch.
    """
    # Reuse an already-loaded model: constructing a SentenceTransformer for
    # every batch reloads the weights each time the callback is invoked.
    model = _get_embedding_model(model_name, device)
    embeddings = model.encode(examples[text_column])
    return {'embeddings': embeddings}


# Per-process cache of loaded models, keyed by (model_name, device).
_EMBEDDING_MODELS = {}


def _get_embedding_model(model_name, device):
    """Return the cached model for (model_name, device), loading it on first use."""
    key = (model_name, device)
    if key not in _EMBEDDING_MODELS:
        _EMBEDDING_MODELS[key] = SentenceTransformer(
            model_name, device=device, trust_remote_code=True
        )
    return _EMBEDDING_MODELS[key]

# Function to find optimal K
def find_optimal_k(embeddings, max_k=20, min_k=2, silhouette_sample_size=10000,
                   random_state=42):
    """Pick the number of clusters that scores best on three quality metrics.

    For every candidate ``k`` the data is clustered with ``faiss.Kmeans`` and
    scored with silhouette, Calinski-Harabasz and Davies-Bouldin. Each metric
    is min-max scaled across the candidates before being combined, so that no
    single metric dominates because of its raw magnitude.

    Args:
        embeddings: 2-D float32 array (rows are samples); it is passed
            directly to FAISS.
        max_k: Largest number of clusters to try (inclusive).
        min_k: Smallest number of clusters to try (values below 2 are raised to 2).
        silhouette_sample_size: Number of rows sampled for the silhouette
            score; ``None`` uses every row, which is quadratic in the number
            of samples.
        random_state: Seed for the silhouette subsample.

    Returns:
        The candidate ``k`` with the highest combined score; ties go to the
        smallest ``k``.

    Raises:
        ValueError: If no candidate ``k`` can be evaluated.
    """
    n_samples, dim = embeddings.shape

    # The metrics are only defined for 2 <= n_clusters <= n_samples - 1.
    lowest_k = max(min_k, 2)
    highest_k = min(max_k, n_samples - 1)

    candidates = []
    sil_scores, ch_scores, db_scores = [], [], []

    for k in range(lowest_k, highest_k + 1):
        # Use FAISS for faster clustering
        kmeans = faiss.Kmeans(d=dim, k=k, niter=20)
        kmeans.train(embeddings)
        _, labels = kmeans.index.search(embeddings, 1)
        labels = labels.reshape(-1)

        # Some centroids can end up with no members; the metrics raise when
        # fewer than two clusters are actually populated, so skip such a k.
        if np.unique(labels).size < 2:
            continue

        candidates.append(k)
        sil_scores.append(
            silhouette_score(
                embeddings, labels,
                sample_size=silhouette_sample_size,
                random_state=random_state,
            )
        )
        ch_scores.append(calinski_harabasz_score(embeddings, labels))
        db_scores.append(davies_bouldin_score(embeddings, labels))

    if not candidates:
        raise ValueError(
            f"No valid cluster count between {min_k} and {max_k} "
            f"for {n_samples} samples"
        )

    # Higher is better for silhouette and Calinski-Harabasz; lower is better
    # for Davies-Bouldin, hence the subtraction.
    combined = [
        sil + ch - db
        for sil, ch, db in zip(
            _min_max_scale(sil_scores),
            _min_max_scale(ch_scores),
            _min_max_scale(db_scores),
        )
    ]
    results = dict(zip(candidates, combined))
    return max(results, key=results.get)


def _min_max_scale(values):
    """Rescale values to [0, 1]; a constant list maps to all zeros."""
    low, high = min(values), max(values)
    span = high - low
    if span == 0:
        return [0.0 for _ in values]
    return [(value - low) / span for value in values]

# Main processing pipeline
def process_dataset(dataset_name, text_column='question', model_name='all-MiniLM-L6-v2', 
                   dimension_reduction=50, num_proc=40, batch_size=1000):
    """Load a dataset, embed one text column, and cluster the embeddings.

    Args:
        dataset_name: Identifier passed to ``load_dataset``.
        text_column: Column whose text is embedded.
        model_name: SentenceTransformer model name forwarded to ``embed_text``.
        dimension_reduction: Target dimensionality for UMAP; a falsy value, or
            one not smaller than the embedding width, skips the reduction.
        num_proc: Process count for ``dataset.map`` (also given to UMAP as ``n_jobs``).
        batch_size: Rows per batch handed to the embedding callback.

    Returns:
        ``(dataset, labels, centroids)``: the dataset with added ``embeddings``
        and ``cluster`` columns, the flat array of cluster ids, and the
        centroids of the final FAISS k-means.

    Raises:
        ValueError: If ``text_column`` is not a column of the loaded dataset.
    """
    # 1. Load the dataset
    dataset = load_dataset(dataset_name)
    if isinstance(dataset, dict):
        # If dataset has splits, use the largest one
        largest_split = max(dataset.keys(), key=lambda k: len(dataset[k]))
        dataset = dataset[largest_split]

    # Fail early with a clear message instead of a KeyError inside map().
    if text_column not in dataset.column_names:
        raise ValueError(
            f"Column {text_column!r} not found in dataset; "
            f"available columns: {dataset.column_names}"
        )

    # 2. Generate embeddings; text_column is honoured via _embed_column
    embed_function = partial(_embed_column, text_column=text_column, model_name=model_name)
    dataset = dataset.map(
        embed_function,
        batched=True,
        batch_size=batch_size,
        num_proc=num_proc
    )

    # 3. Convert to numpy array for further processing
    embeddings = np.array(dataset['embeddings'])

    # 4. Dimension reduction with UMAP if needed (for very large embeddings)
    if dimension_reduction and dimension_reduction < embeddings.shape[1]:
        # NOTE(review): UMAP may ignore n_jobs when random_state is set - confirm.
        reducer = umap.UMAP(n_components=dimension_reduction, n_neighbors=15, 
                           min_dist=0.1, random_state=42, n_jobs=num_proc)
        embeddings = reducer.fit_transform(embeddings)

    # 5. Convert to float32 for FAISS compatibility
    embeddings = embeddings.astype('float32')

    # 6. Find optimal K
    optimal_k = find_optimal_k(embeddings)
    print(f"Optimal number of clusters: {optimal_k}")

    # 7. Final clustering with optimal K
    kmeans = faiss.Kmeans(d=embeddings.shape[1], k=optimal_k, niter=20)
    kmeans.train(embeddings)
    _, labels = kmeans.index.search(embeddings, 1)
    labels = labels.reshape(-1)

    # 8. Add cluster labels back to dataset
    dataset = dataset.add_column("cluster", labels.tolist())

    return dataset, labels, kmeans.centroids


def _embed_column(batch, text_column, model_name):
    """Embed ``batch[text_column]`` by presenting it to embed_text under the 'question' key."""
    return embed_text({'question': batch[text_column]}, model_name=model_name)

# Visualization function
def visualize_clusters(embeddings, labels, method='umap'):
    """Draw a 2-D scatter plot of the embeddings coloured by cluster label.

    The embeddings are projected to two components with UMAP (seeded with 42).
    ``method`` is accepted but not consulted; UMAP is always used.

    Args:
        embeddings: Array of vectors to project.
        labels: Cluster id per row, used as the point colour.
        method: Unused.

    Returns:
        The 2-D projection produced by UMAP.
    """
    projection = umap.UMAP(n_components=2, random_state=42).fit_transform(embeddings)
    xs, ys = projection[:, 0], projection[:, 1]
    n_clusters = len(np.unique(labels))

    plt.figure(figsize=(12, 10))
    plt.scatter(xs, ys, c=labels, cmap='tab20', s=10, alpha=0.7)
    plt.colorbar()
    plt.title(f'Cluster Visualization ({n_clusters} clusters)')
    plt.show()

    return projection

# Example usage:
if __name__ == "__main__":
    # Replace with your actual dataset name
    dataset_name = "vinhpx/math_natural_reasoning"
    # Column holding the text to embed; reused below when printing examples
    text_column = 'question'

    # Embed and cluster the dataset
    clustered_dataset, labels, centroids = process_dataset(
        dataset_name=dataset_name,
        text_column=text_column,
        model_name='all-mpnet-base-v2',  # Sentence transformer model
        dimension_reduction=None,  # None skips the UMAP reduction step
        num_proc=1,  # Single process for the embedding map
        batch_size=100000  # Adjust batch size based on memory availability
    )

    # Get original embeddings for visualization
    embeddings = np.array(clustered_dataset['embeddings'])

    # Visualize results
    visualize_clusters(embeddings, labels)

    # Print some stats about clusters. Iterate over the ids that actually
    # occur rather than assuming they are exactly 0..n-1.
    cluster_ids, cluster_sizes = np.unique(labels, return_counts=True)
    for cluster_id, count in zip(cluster_ids, cluster_sizes):
        percentage = count / len(labels) * 100
        print(f"Cluster {cluster_id}: {count} items ({percentage:.2f}%)")

    # Get examples from each cluster
    for cluster_id in cluster_ids[:3].tolist():  # Show first 3 clusters
        members = clustered_dataset.filter(lambda x: x['cluster'] == cluster_id)
        # A cluster may hold fewer than 5 rows; selecting past the end raises.
        examples = members.select(range(min(5, len(members))))
        print(f"\nCluster {cluster_id} examples:")
        for ex in examples:
            print(f"- {ex[text_column][:100]}...")  # Show first 100 chars

  from .autonotebook import tqdm as notebook_tqdm


[2025-04-06 16:14:22,849] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -lcufile: No such file or directory
collect2: error: ld returned 1 exit status
Using the latest cached version of the dataset since vinhpx/math_natural_reasoning couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'default' at /home/ubuntu/.cache/huggingface/datasets/vinhpx___math_natural_reasoning/default/0.0.0/80ca8b426eae79f62551418bf77ed94abb6184bd (last modified on Sun Apr  6 16:13:48 2025).
Map:  12%|█▏        | 300000/2559963 [04:51<35:14, 1068.87 examples/s]