In [None]:
# ===== conda env: (rapid-23.12) =====
# Restrict this process to the GPU at index 2. The variable is assigned before
# the GPU-backed imports below so it is already in effect when they load.
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '2'  # MUST be first!


import grapheno
import cudf
import pandas as pd
import glob
import subprocess
import numpy as np
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial.distance import cdist


print(f"cuDF version: {cudf.__version__}")

# Generate the synthetic dataset. NOTE(review): this presumably writes
# 'dummy_data.csv' to the working directory, since that file is read back below.
grapheno.generate_dummy_data(n_samples = 5000000,
                             n_features = 20,
                             centers = 30,
                             cluster_std=4.0)

# Clean all cache files.
# The previous `subprocess.run(['rm', '-f', '*.parquet'], shell=True)` deleted
# nothing: with shell=True and a list, only the first item ('rm') is the shell
# command and the remaining items become arguments to the shell itself.
# Expanding the patterns with glob and removing each match does the real work
# and does not depend on a POSIX shell being available.
print("Cleaning up ALL cache files...")
for stale_pattern in ('*.parquet', 'chunk_*.csv'):
    for stale_path in glob.glob(stale_pattern):
        try:
            os.remove(stale_path)
        except OSError as cleanup_err:
            # Best effort: a leftover file should not abort the notebook.
            print(f"Could not remove {stale_path}: {cleanup_err}")

print("\nLoading full dataset...")
full_data = pd.read_csv('dummy_data.csv')
print(f"Total samples: {len(full_data)}")

# Process in chunks
feature_cols = [f'feature{i+1}' for i in range(20)]  # 'feature1' .. 'feature20'
chunk_size = 10000  # Small chunks to fit in GPU memory
n_neighbors = 30   # Reduced to save memory
# ==================================================
# STEP 1: Cluster each chunk with grapheno
# ==================================================
# Each chunk of `full_data` is written to a temporary CSV, clustered
# independently with grapheno, and summarised as one centroid (mean feature
# vector) per non-noise cluster. A failing chunk is reported and skipped so
# that the remaining chunks are still processed.
import traceback  # only needed to report per-chunk failures below

print("\n" + "="*60)
print("STEP 1: Clustering each chunk with grapheno")
print("="*60)

chunk_results = []  # One dict per successfully clustered chunk (data + centroids + row range)
all_centroids = []  # All centroids from all chunks, in discovery order
centroid_metadata = []  # Parallel to all_centroids: which chunk and cluster each centroid came from

total_chunks = (len(full_data) + chunk_size - 1) // chunk_size  # ceiling division

for i in range(0, len(full_data), chunk_size):
    chunk_num = i // chunk_size + 1
    end_idx = min(i + chunk_size, len(full_data))  # exclusive end row of this chunk
    print(f"\nChunk {chunk_num}/{total_chunks}")

    # Extract chunk and hand it to grapheno through a temporary CSV file
    chunk = full_data.iloc[i:end_idx].copy()
    chunk_file = f'chunk_{chunk_num}.csv'
    chunk.to_csv(chunk_file, index=False)

    try:
        # Cluster this chunk
        df_chunk = grapheno.cluster(chunk_file,
                                     feature_cols,
                                     n_neighbors=n_neighbors,
                                     distributed_knn=False,
                                     distributed_graphs=False,
                                     min_size=5)

        # Downstream steps work on pandas; convert only if a cuDF frame came back
        df_chunk_pd = df_chunk.to_pandas() if isinstance(df_chunk, cudf.DataFrame) else df_chunk

        # Compute centroids for this chunk
        chunk_centroids = {}
        for cluster_id in df_chunk_pd['cluster'].unique():
            if cluster_id >= 0:  # Ignore noise (negative labels)
                cluster_points = df_chunk_pd[df_chunk_pd['cluster'] == cluster_id][feature_cols]
                centroid = cluster_points.mean().values
                chunk_centroids[cluster_id] = centroid

                # Store centroid with metadata
                all_centroids.append(centroid)
                centroid_metadata.append({
                    'chunk_num': chunk_num,
                    'original_cluster': cluster_id,
                    'start_idx': i,
                    'end_idx': end_idx
                })

        chunk_results.append({
            'chunk_num': chunk_num,
            'data': df_chunk_pd,
            'centroids': chunk_centroids,
            'start_idx': i,
            'end_idx': end_idx
        })

        print(f"✅ Found {len(chunk_centroids)} clusters")

    except Exception as e:
        # Deliberate best effort: report the failure and continue with the next chunk
        print(f"❌ Failed: {e}")
        traceback.print_exc()

    finally:
        # Cleanup. Each file is removed independently so that one failed removal
        # neither hides the others nor swallows unrelated exceptions such as
        # KeyboardInterrupt (which the previous bare `except: pass` did).
        for temp_path in [chunk_file, *glob.glob('*.parquet')]:
            try:
                os.remove(temp_path)
            except OSError as cleanup_err:
                print(f"Could not remove {temp_path}: {cleanup_err}")

# ==================================================
# STEP 2: Merge similar clusters across chunks
# ==================================================
# Per-chunk cluster ids are only meaningful inside their own chunk. Centroids
# from all chunks are grouped here so that nearby centroids share one global id.
print("\n" + "="*60)
print("STEP 2: Merging similar clusters")
print("="*60)

all_centroids = np.array(all_centroids)
print(f"Total centroids from all chunks: {len(all_centroids)}")

# Use hierarchical clustering to group similar centroids
# Distance threshold determines how similar centroids must be to merge
distance_threshold = 2.0  # Adjust based on your data scale

if len(all_centroids) >= 2:
    hierarchical = AgglomerativeClustering(
        n_clusters=None,
        distance_threshold=distance_threshold,
        linkage='average',
        metric='euclidean'
    )
    merged_cluster_labels = hierarchical.fit_predict(all_centroids)
else:
    # AgglomerativeClustering cannot be fitted on fewer than two samples
    # (e.g. when every chunk failed or only one cluster was found). Give the
    # zero or one centroid(s) label 0 instead of crashing in scikit-learn.
    print("Fewer than two centroids available - skipping hierarchical merge")
    merged_cluster_labels = np.zeros(len(all_centroids), dtype=int)

n_merged_clusters = len(np.unique(merged_cluster_labels))

print(f"Merged into {n_merged_clusters} global clusters")

# Create mapping: (chunk_num, original_cluster) -> merged_cluster
# centroid_metadata and merged_cluster_labels are parallel sequences.
cluster_mapping = {
    (meta['chunk_num'], meta['original_cluster']): merged_label
    for meta, merged_label in zip(centroid_metadata, merged_cluster_labels)
}

# ==================================================
# STEP 3: Apply merged cluster labels to all data
# ==================================================
print("\n" + "="*60)
print("STEP 3: Applying merged cluster labels")
print("="*60)

# Start from -1 (noise / unassigned) rather than 0: rows belonging to chunks
# that failed in STEP 1 are never visited below, and 0 is a valid cluster id.
final_clusters = np.full(len(full_data), -1, dtype=int)

for chunk_info in chunk_results:
    chunk_num = chunk_info['chunk_num']
    chunk_data = chunk_info['data']
    start_idx = chunk_info['start_idx']

    # Translate the whole chunk at once instead of iterating row by row.
    chunk_labels = chunk_data['cluster'].to_numpy()
    merged_labels = np.full(len(chunk_labels), -1, dtype=int)  # noise stays -1

    # The keys of 'centroids' are exactly the non-noise cluster ids of this chunk.
    for original_cluster in chunk_info['centroids']:
        key = (chunk_num, original_cluster)
        merged_labels[chunk_labels == original_cluster] = cluster_mapping.get(key, -1)

    # NOTE(review): like the original per-row loop, this assumes the index
    # labels of the clustered frame are 0-based row offsets within the chunk -
    # confirm against what grapheno.cluster returns.
    final_clusters[start_idx + chunk_data.index.to_numpy()] = merged_labels

# Build the labelled result on a copy so `full_data` itself is left unchanged
final_df = full_data.assign(cluster=final_clusters)

# Summary of the run
summary_lines = (
    "\n✅ Complete!",
    f"Total samples: {len(final_df)}",
    f"Original clusters (before merge): {len(all_centroids)}",
    f"Final clusters (after merge): {n_merged_clusters}",
    "\nCluster distribution:",
)
for summary_line in summary_lines:
    print(summary_line)

# Sizes of the first ten cluster ids, in ascending id order
cluster_sizes = final_df['cluster'].value_counts().sort_index()
print(cluster_sizes.head(10))

# Persist the labelled dataset
final_df.to_csv('clustered_results.csv', index=False)
print("\nSaved to clustered_results.csv")

# Move the result onto the GPU for the visualisation cells
df = cudf.from_pandas(final_df)
df.head()

In [2]:
import cuml
import holoviews as hv
hv.extension('bokeh')

# Using t-SNE
# Embed a random subset of the feature columns. The subsets are seeded so that
# re-running the cell shows the same points (the model itself is seeded too).
X = df[[i for i in df.columns if 'feature' in i]].sample(50000, random_state=42)

tsne = cuml.TSNE(random_state=42, n_neighbors=300)
tsne_coords = tsne.fit_transform(X).to_pandas().rename(columns={0:'TSNE1',1:'TSNE2'})
# Attach ground-truth labels and merged cluster ids of the sampled rows;
# to_numpy() aligns them by position with the embedding rows.
tsne_coords['label'] = df['label'][X.index].to_numpy()
tsne_coords['cluster'] = df['cluster'][X.index].to_numpy()
sample = tsne_coords.sample(5000, random_state=42)  # thin out further for plotting

# One scatter per colouring: ground-truth label vs. merged cluster id
plots = [hv.Scatter(sample,
                    'TSNE1',
                   ['TSNE2',i]).opts(cmap='glasbey',
                                          color=hv.dim(i),
                                          padding=0.05,
                                          height=500,
                                          title=i,
                                          width=500,
                                          xaxis=None,
                                          yaxis=None)

    for i in ['label','cluster']]

hv.Layout(plots)


  return func(**kwargs)


[W] [14:44:43.573697] # of Nearest Neighbors should be at least 3 * perplexity. Your results might be a bit strange...


In [3]:
import cuml
import holoviews as hv
hv.extension('bokeh')

# Using UMAP
# Embed a random subset of the feature columns. The subsets are seeded so that
# re-running the cell shows the same points (the model itself is seeded too).
X = df[[i for i in df.columns if 'feature' in i]].sample(50000, random_state=42)

umap = cuml.UMAP(random_state=42, n_neighbors=20)
umap_coords = umap.fit_transform(X).to_pandas().rename(columns={0:'UMAP1',1:'UMAP2'})
# Attach ground-truth labels and merged cluster ids of the sampled rows;
# to_numpy() aligns them by position with the embedding rows.
umap_coords['label'] = df['label'][X.index].to_numpy()
umap_coords['cluster'] = df['cluster'][X.index].to_numpy()
sample = umap_coords.sample(5000, random_state=42)  # thin out further for plotting

# One scatter per colouring: ground-truth label vs. merged cluster id
plots = [hv.Scatter(sample,
                    'UMAP1',
                   ['UMAP2',i]).opts(cmap='glasbey',
                                          color=hv.dim(i),
                                          padding=0.05,
                                          height=500,
                                          title=i,
                                          width=500,
                                          xaxis=None,
                                          yaxis=None)

    for i in ['label','cluster']]

hv.Layout(plots)