In [6]:
import findspark
# Must run before the pyspark imports below: findspark locates the Spark installation and makes pyspark importable.
findspark.init()

from pyspark import SparkContext
from pyspark import SparkConf
# Reuse an already-running SparkContext if one exists; otherwise start one in local mode ("local[*]").
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

import numpy as np
import itertools


In [7]:
from sklearn.datasets import make_blobs

# Generate a random, labelled dataset of point blobs with 3 dimensions
n_samples = 100  # Number of samples
n_features = 3  # Number of dimensions
centers = 3  # Number of clusters
random_state = 42  # Random state for reproducibility

# X: point coordinates, one row per sample and one column per feature; y: the blob each sample was drawn from.
X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers, random_state=random_state)

In [8]:
# set parameters
min_pts = 18  # NOTE(review): presumably the DBSCAN minimum-neighbour count; not used in the cells visible here
eps = 2  # NOTE(review): presumably the DBSCAN neighbourhood radius; not used in the cells visible here
dataset = X
# Broadcast the data and the parameters through the SparkContext so that Spark tasks can read them.
b_dataset = sc.broadcast(dataset)
b_eps = sc.broadcast(eps)
b_min_pts = sc.broadcast(min_pts)
# Number of grid cells along each dimension; the later cells expect one entry per column of `dataset`.
partition_each_dim = (2, 2, 2)

In [9]:
def create_all_partition_id(partition_each_dim):
    """
    Enumerate the ID of every cell of the partition grid.

    Args:
        partition_each_dim (tuple): How many partitions each dimension is split into,
            one entry per dimension.

    Returns:
        list: Every partition ID as a tuple of per-dimension indices, ordered so that
            the last dimension varies fastest.
    """
    # One index range per dimension; their Cartesian product is the complete grid.
    index_ranges = map(range, partition_each_dim)
    return list(itertools.product(*index_ranges))

# Enumerate the id of every cell in the grid described by partition_each_dim and display them as a sanity check.
par_ids = create_all_partition_id(partition_each_dim)
par_ids

[(0, 0, 0),
 (0, 0, 1),
 (0, 1, 0),
 (0, 1, 1),
 (1, 0, 0),
 (1, 0, 1),
 (1, 1, 0),
 (1, 1, 1)]

In [10]:
# Total number of grid cells: the product of the per-dimension partition counts
num_par = np.prod(partition_each_dim)

# Extent of the data: row i holds [min, max] of the i-th coordinate
bounds = np.array([np.min(dataset, axis=0), np.max(dataset, axis=0)]).T
bounds

array([[-10.83328933,   4.5123324 ],
       [ -8.24949825,  10.41824487],
       [ -8.02990217,   8.01217509]])

In [11]:
# Build the cell edges along every dimension.
# The data extent is widened slightly at both ends so that no point lies exactly on an outermost edge:
# np.digitize would place a point sitting on the last edge *past* the last bin.
PAD_FRACTION = 0.01  # relative padding applied to each end of the data extent

bin_bounds = []
for i in range(len(partition_each_dim)):
    data_min, data_max = bounds[i][0], bounds[i][-1]
    # Padding relative to an endpoint vanishes when that endpoint is exactly 0; fall back to a
    # fraction of the extent, or to a fixed amount when the extent itself is 0.
    fallback_pad = (data_max - data_min) * PAD_FRACTION or PAD_FRACTION
    lower_bound = data_min - (abs(data_min) * PAD_FRACTION or fallback_pad)
    upper_bound = data_max + (abs(data_max) * PAD_FRACTION or fallback_pad)
    # partition_each_dim[i] cells need partition_each_dim[i] + 1 equally spaced edges.
    dim_bins = np.linspace(lower_bound, upper_bound, partition_each_dim[i] + 1, endpoint=True)
    bin_bounds.append(dim_bins)
bin_bounds

[array([-10.94162222,  -3.19208325,   4.55745572]),
 array([-8.33199323,  1.09521704, 10.52242732]),
 array([-8.11020119, -0.00895218,  8.09229684])]

In [12]:
# Assign every point to the grid cell that contains it.
# indexed_data holds one [cell_id, point_index] pair per point, where cell_id is a tuple
# with one bin index per dimension (the same form as the ids in par_ids).
cell_ids = np.empty(dataset.shape, dtype=np.intp)
for i in range(dataset.shape[1]):
    # One vectorised call bins the whole column; digitize counts from 1 at the first edge, hence the -1.
    raw_idx = np.digitize(dataset[:, i], bin_bounds[i]) - 1
    # A value on or beyond an outermost edge would get -1 or the bin count, which is not a valid
    # partition id; keep such values in the first / last cell instead.
    cell_ids[:, i] = np.clip(raw_idx, 0, len(bin_bounds[i]) - 2)

# tolist() turns the indices into plain Python ints rather than NumPy scalars.
indexed_data = [[tuple(pos_list), id_pts] for id_pts, pos_list in enumerate(cell_ids.tolist())]

indexed_data

[[(1, 1, 1), 0],
 [(0, 1, 1), 1],
 [(0, 1, 1), 2],
 [(1, 1, 1), 3],
 [(1, 0, 0), 4],
 [(1, 0, 0), 5],
 [(1, 0, 0), 6],
 [(0, 1, 1), 7],
 [(1, 1, 1), 8],
 [(1, 0, 0), 9],
 [(1, 0, 0), 10],
 [(1, 0, 0), 11],
 [(1, 1, 1), 12],
 [(1, 1, 1), 13],
 [(1, 0, 0), 14],
 [(0, 1, 1), 15],
 [(1, 0, 0), 16],
 [(1, 1, 1), 17],
 [(1, 0, 0), 18],
 [(0, 1, 1), 19],
 [(0, 1, 1), 20],
 [(1, 1, 1), 21],
 [(0, 1, 1), 22],
 [(1, 0, 0), 23],
 [(0, 1, 1), 24],
 [(1, 1, 1), 25],
 [(0, 1, 1), 26],
 [(0, 1, 1), 27],
 [(1, 0, 0), 28],
 [(0, 1, 1), 29],
 [(1, 0, 0), 30],
 [(1, 0, 0), 31],
 [(1, 0, 0), 32],
 [(0, 1, 1), 33],
 [(1, 1, 1), 34],
 [(1, 1, 1), 35],
 [(0, 1, 1), 36],
 [(1, 0, 0), 37],
 [(0, 1, 1), 38],
 [(0, 1, 1), 39],
 [(0, 1, 1), 40],
 [(1, 1, 1), 41],
 [(0, 1, 1), 42],
 [(0, 1, 1), 43],
 [(1, 0, 0), 44],
 [(0, 1, 1), 45],
 [(1, 0, 0), 46],
 [(1, 1, 1), 47],
 [(1, 0, 0), 48],
 [(0, 1, 1), 49],
 [(1, 0, 0), 50],
 [(1, 1, 1), 51],
 [(0, 1, 1), 52],
 [(0, 1, 1), 53],
 [(1, 0, 0), 54],
 [(1, 1, 1), 55],
 [

In [13]:
# Group the point indices by grid cell on Spark: each record becomes [cell_id, [point indices in that cell]].
res = (
    sc.parallelize(indexed_data)
    .groupByKey()
    .map(lambda cell: [cell[0], list(cell[1])])
)

In [14]:
# Runs the Spark job and returns the ids of the cells that actually contain points.
res.keys().collect()

                                                                                

[(1, 0, 0), (0, 1, 1), (1, 1, 1)]

In [15]:
# Sort in place by cell id. list.sort is stable, so entries sharing a cell id keep their existing relative order.
# Note: this mutates the list that an earlier cell displayed and that `res` was built from.
indexed_data.sort(key=lambda x: x[0])

In [None]:
# Display the list again, now ordered by cell id.
indexed_data

[[(0, 1, 1), 1],
 [(0, 1, 1), 2],
 [(0, 1, 1), 7],
 [(0, 1, 1), 15],
 [(0, 1, 1), 19],
 [(0, 1, 1), 20],
 [(0, 1, 1), 22],
 [(0, 1, 1), 24],
 [(0, 1, 1), 26],
 [(0, 1, 1), 27],
 [(0, 1, 1), 29],
 [(0, 1, 1), 33],
 [(0, 1, 1), 36],
 [(0, 1, 1), 38],
 [(0, 1, 1), 39],
 [(0, 1, 1), 40],
 [(0, 1, 1), 42],
 [(0, 1, 1), 43],
 [(0, 1, 1), 45],
 [(0, 1, 1), 49],
 [(0, 1, 1), 52],
 [(0, 1, 1), 53],
 [(0, 1, 1), 59],
 [(0, 1, 1), 64],
 [(0, 1, 1), 65],
 [(0, 1, 1), 68],
 [(0, 1, 1), 71],
 [(0, 1, 1), 72],
 [(0, 1, 1), 74],
 [(0, 1, 1), 75],
 [(0, 1, 1), 76],
 [(0, 1, 1), 77],
 [(0, 1, 1), 80],
 [(0, 1, 1), 82],
 [(0, 1, 1), 84],
 [(0, 1, 1), 85],
 [(0, 1, 1), 92],
 [(0, 1, 1), 93],
 [(0, 1, 1), 94],
 [(0, 1, 1), 96],
 [(0, 1, 1), 98],
 [(0, 1, 1), 99],
 [(1, 0, 0), 4],
 [(1, 0, 0), 5],
 [(1, 0, 0), 6],
 [(1, 0, 0), 9],
 [(1, 0, 0), 10],
 [(1, 0, 0), 11],
 [(1, 0, 0), 14],
 [(1, 0, 0), 16],
 [(1, 0, 0), 18],
 [(1, 0, 0), 23],
 [(1, 0, 0), 28],
 [(1, 0, 0), 30],
 [(1, 0, 0), 31],
 [(1, 0, 0), 32],

24/05/17 08:16:50 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 541314 ms exceeds timeout 120000 ms
24/05/17 08:16:50 WARN SparkContext: Killing executors is not supported by current scheduler.
24/05/17 08:16:52 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$