In [3]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark import SparkConf
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[*]"))

import numpy as np
import itertools


In [4]:
from sklearn.datasets import make_blobs

# Generate random dataset with 3 dimensions
n_samples = 100  # Number of samples
n_features = 3  # Number of dimensions
centers = 10  # Number of clusters
random_state = 42  # Random state for reproducibility

X, y = make_blobs(n_samples=n_samples, n_features=n_features, centers=centers, random_state=random_state)

In [5]:
X = X.tolist()

# 1. Construct Grid


In [6]:
# set parameters
min_pts = 5
eps = 2
dataset = X
partition_each_dim = [3 for i in range(n_features)]

# b_dataset = sc.broadcast(dataset)
# b_eps = sc.broadcast(eps)
# b_min_pts = sc.broadcast(min_pts)


In [9]:
# get the max/min bounds of each dimension
def get_minmax_by_data(dataset):
    # get the max/min bounds of each dimension
    min_max_bounds = np.concatenate(([np.min(dataset, axis=0)], [np.max(dataset, axis=0)]), axis=0).T   # (d, 2)
    return min_max_bounds

def get_minmax_by_bins(bin_ind, bin_bounds):
    min_max_bounds = []
    for i in range(len(bin_ind)):
        min_max_bounds.append((bin_bounds[i][bin_ind[i]], bin_bounds[i][bin_ind[i]+1]))
    return min_max_bounds

# get the bound of each bin
def get_bin_bounds(min_max_bounds, partition_each_dim):
    # get the bound of each bin
    # bin_bounds: list of np.array, each array contains the bounds of the bins in that dimension
    bin_bounds = []
    for i in range(len(partition_each_dim)):
        Lower_bound = min_max_bounds[i][0]
        Upper_bound = min_max_bounds[i][-1]
        dim_bins = np.linspace(Lower_bound, Upper_bound, partition_each_dim[i]+1, endpoint=True)
        bin_bounds.append(dim_bins)
    return bin_bounds

# locate the point in the partitioned space, return the index of space
def locate_point(x, bin_bounds, partition_each_dim):
    pos_list = []
    for i in range(n_features):
        # pos: the index of the bin in that dimension
        pos = np.digitize(x[i], bin_bounds[i]) - 1
        pos = min(partition_each_dim[i]-1, pos) # if the value is the max value, it should be in the last bin
        pos = max(0, pos) # if the value is the min value, it should be in the first bin
        pos_list.append(pos)
    return tuple(pos_list)


In [10]:
num_par = np.prod(partition_each_dim)
bin_bounds = get_bin_bounds(get_minmax_by_data(dataset), partition_each_dim)

rdd = sc.parallelize(dataset, num_par)
rdd = rdd.map(lambda x: (locate_point(x, bin_bounds, partition_each_dim), x))
rdd = rdd.sortByKey()
# rdd = rdd.groupByKey().map(lambda x: (x[0], list(x[1])))
rdd.glom().take(10)

                                                                                

[[((0, 0, 0), [-8.137053258537247, -4.216632385357437, -5.914030474195238]),
  ((0, 0, 0), [-8.234510428293452, -4.409675180688797, -3.9205463160910154]),
  ((0, 0, 1), [-7.178703520999728, -5.43000236549511, 0.048613680577736185]),
  ((0, 0, 1), [-5.278107750896421, -5.292824508766337, -0.4426964072703655]),
  ((0, 0, 1), [-8.729492752913176, -4.641341102161888, -1.4058519849395432]),
  ((0, 0, 1), [-6.357689452162939, -4.949627767728337, -2.787499575593065]),
  ((0, 0, 1), [-7.544624022800111, -4.632052340456593, -3.3260923666998776])],
 [((0, 1, 0), [-6.705135507978706, -3.2913518351255155, -3.8730595411819424]),
  ((0, 1, 0), [-5.444668546678066, -3.752125318334681, -3.9336470884612114])],
 [((0, 1, 1), [-6.318907911053417, -2.4616210636519287, 0.23047179940680118]),
  ((0, 1, 1), [-5.475511008607851, -3.7010613966790418, -0.7506101460672308]),
  ((0, 1, 1), [-7.917792252577944, -3.7132876011494087, -1.8981290806968292]),
  ((0, 1, 1), [-9.23526537361677, -3.970652714526209, -3.334

In [11]:
# get data with key (0, 0, 0)

test_key = (0, 0, 0)
test_value = rdd.filter(lambda x: x[0] == (0, 0, 0)).map(lambda x: x[1]).collect()
test_value

                                                                                

[[-8.137053258537247, -4.216632385357437, -5.914030474195238],
 [-8.234510428293452, -4.409675180688797, -3.9205463160910154]]

In [12]:
# calculate grid number for each dimension in one partition
def cal_grid_num(min_max_bounds, eps):
    global n_features
    grid_side_len = eps/np.sqrt(n_features)
    gridnum_each_dim = []
    for i in range(n_features):
        gridnum_each_dim.append(int((min_max_bounds[i][1] - min_max_bounds[i][0]) / grid_side_len))
    return gridnum_each_dim

def get_grid_each_partition(pids, bin_bounds, eps):
    grid_bin_bounds = {}
    gridnum_each_dim = {}
    for pid in pids:
        gridnum_each_dim[pid] = cal_grid_num(get_minmax_by_bins(pid, bin_bounds), eps)
        grid_bin_bounds[pid] = get_bin_bounds(get_minmax_by_bins(pid, bin_bounds), gridnum_each_dim[pid])
    return grid_bin_bounds, gridnum_each_dim

pids = rdd.keys().distinct().collect()
grid_bin_bounds, gridnum_each_dim = get_grid_each_partition(pids, bin_bounds, eps)
print(grid_bin_bounds[(0,0,0)])



[array([-10.75709897,  -9.47024932,  -8.18339967,  -6.89655002,
        -5.60970037,  -4.32285072]), array([-10.99016118,  -9.82152524,  -8.65288931,  -7.48425337,
        -6.31561744,  -5.1469815 ,  -3.97834557]), array([-10.28518036,  -9.12816924,  -7.97115811,  -6.81414699,
        -5.65713587,  -4.50012474,  -3.34311362])]


                                                                                

In [13]:
def find_grid(pa):
    l = []
    for pid, value in pa:
        loc = locate_point(value, grid_bin_bounds[pid], gridnum_each_dim[pid])
        l.append((pid, loc, value))
    return l

rdd_with_grid = rdd.mapPartitions(find_grid)
rdd_with_grid.collect()

[((0, 0, 0),
  (2, 5, 3),
  [-8.137053258537247, -4.216632385357437, -5.914030474195238]),
 ((0, 0, 0),
  (1, 5, 5),
  [-8.234510428293452, -4.409675180688797, -3.9205463160910154]),
 ((0, 0, 1),
  (2, 4, 2),
  [-7.178703520999728, -5.43000236549511, 0.048613680577736185]),
 ((0, 0, 1),
  (4, 4, 2),
  [-5.278107750896421, -5.292824508766337, -0.4426964072703655]),
 ((0, 0, 1),
  (1, 5, 1),
  [-8.729492752913176, -4.641341102161888, -1.4058519849395432]),
 ((0, 0, 1),
  (3, 5, 0),
  [-6.357689452162939, -4.949627767728337, -2.787499575593065]),
 ((0, 0, 1),
  (2, 5, 0),
  [-7.544624022800111, -4.632052340456593, -3.3260923666998776]),
 ((0, 1, 0),
  (3, 0, 5),
  [-6.705135507978706, -3.2913518351255155, -3.8730595411819424]),
 ((0, 1, 0),
  (4, 0, 5),
  [-5.444668546678066, -3.752125318334681, -3.9336470884612114]),
 ((0, 1, 1),
  (3, 1, 3),
  [-6.318907911053417, -2.4616210636519287, 0.23047179940680118]),
 ((0, 1, 1),
  (4, 0, 2),
  [-5.475511008607851, -3.7010613966790418, -0.7506101

# 2. Identify Core Point and Core cell

In [40]:
# find core points in each partition
def find_core_points(pa, eps, min_pts):
    for pid, gid, value in pa:
        count = 0
        for _, _, value2 in pa:
            if np.linalg.norm(np.array(value) - np.array(value2)) <= eps:
                count += 1
            if count >= min_pts:
                yield (pid, gid, value, 1)
                break
        yield (pid, gid, value, 0)

core_points = rdd_with_grid.mapPartitions(lambda x: find_core_points(x, eps, min_pts))
core_points.collect()

[((0, 0, 0),
  (2, 5, 3),
  [-8.137053258537247, -4.216632385357437, -5.914030474195238],
  0),
 ((0, 0, 0),
  (1, 5, 5),
  [-8.234510428293452, -4.409675180688797, -3.9205463160910154],
  0),
 ((0, 0, 1),
  (2, 4, 2),
  [-7.178703520999728, -5.43000236549511, 0.048613680577736185],
  0),
 ((0, 0, 1),
  (4, 4, 2),
  [-5.278107750896421, -5.292824508766337, -0.4426964072703655],
  0),
 ((0, 0, 1),
  (1, 5, 1),
  [-8.729492752913176, -4.641341102161888, -1.4058519849395432],
  0),
 ((0, 0, 1),
  (3, 5, 0),
  [-6.357689452162939, -4.949627767728337, -2.787499575593065],
  0),
 ((0, 0, 1),
  (2, 5, 0),
  [-7.544624022800111, -4.632052340456593, -3.3260923666998776],
  0),
 ((0, 1, 0),
  (3, 0, 5),
  [-6.705135507978706, -3.2913518351255155, -3.8730595411819424],
  0),
 ((0, 1, 0),
  (4, 0, 5),
  [-5.444668546678066, -3.752125318334681, -3.9336470884612114],
  0),
 ((0, 1, 1),
  (3, 1, 3),
  [-6.318907911053417, -2.4616210636519287, 0.23047179940680118],
  1),
 ((0, 1, 1),
  (3, 1, 3),
  [-

In [46]:
# get the core cells

core_cells = core_points.map(lambda x: ((x[0], x[1]), x[3])).reduceByKey(lambda x, y: x+y).filter(lambda x: x[1] > 0)
core_cells.collect()

                                                                                

[(((0, 1, 1), (4, 0, 2)), 1),
 (((0, 2, 1), (1, 3, 3)), 2),
 (((1, 2, 2), (1, 5, 0)), 1),
 (((2, 0, 0), (2, 3, 3)), 1),
 (((2, 0, 2), (2, 1, 4)), 1),
 (((1, 1, 0), (4, 4, 0)), 2),
 (((1, 2, 2), (0, 4, 0)), 1),
 (((1, 2, 2), (0, 5, 0)), 1),
 (((2, 0, 0), (2, 4, 3)), 1),
 (((0, 1, 1), (2, 0, 3)), 1),
 (((0, 1, 1), (3, 0, 2)), 2),
 (((1, 2, 0), (3, 2, 2)), 1),
 (((1, 2, 2), (1, 4, 0)), 1),
 (((2, 0, 2), (1, 2, 3)), 1),
 (((1, 2, 2), (0, 5, 1)), 1),
 (((1, 1, 0), (4, 5, 1)), 4),
 (((2, 0, 0), (3, 4, 4)), 1),
 (((0, 1, 1), (3, 0, 3)), 1),
 (((2, 0, 0), (1, 3, 4)), 1),
 (((2, 0, 0), (3, 4, 3)), 1),
 (((0, 2, 1), (1, 3, 4)), 1),
 (((0, 1, 1), (3, 1, 3)), 1),
 (((0, 2, 1), (2, 3, 4)), 1),
 (((1, 2, 0), (3, 3, 2)), 1),
 (((1, 1, 0), (3, 5, 0)), 2),
 (((2, 0, 2), (1, 1, 3)), 1),
 (((1, 1, 1), (1, 0, 4)), 1),
 (((1, 2, 0), (3, 2, 3)), 3),
 (((1, 2, 0), (3, 1, 3)), 1),
 (((1, 2, 2), (1, 4, 1)), 1),
 (((2, 0, 0), (4, 5, 2)), 1),
 (((0, 1, 1), (3, 0, 4)), 1),
 (((1, 2, 0), (2, 1, 3)), 2),
 (((2, 0, 

# 3. Find eps-neighbor cell pair

# 4. Create Graph

# 5. Compute Connect Component

# 6. Assign the border points to clusters

In [5]:
# set parameters
min_pts = 18
eps = 2
dataset = X
b_dataset = sc.broadcast(dataset)
b_eps = sc.broadcast(eps)
b_min_pts = sc.broadcast(min_pts)
partition_each_dim = (2, 2, 2)


In [6]:
# partition data points
partitioned_rdd = spatial_partition(X, partition_each_dim, eps)


spatial_partition time cost: 413.2580757141113ms


In [7]:
# in each partition, create grid cells

# calculate grid number for each dimension in one partition
def cal_grid_num(data, eps):
    grid_side_len = eps/np.sqrt(data.shape[1])
    gridnum_each_dim = [int((np.max(data[:, i]) - np.min(data[:, i])) / grid_side_len) for i in range(data.shape[1])]
    return gridnum_each_dim

# test for one partition
p_ind = partitioned_rdd.take(1)[0][1]
p_data = X[p_ind]
gridnum_each_dim = cal_grid_num(p_data, eps)
grid_rdd = spatial_partition(p_data, gridnum_each_dim, eps) # key: grid index, value: data points in this grid

                                                                                

spatial_partition time cost: 124.79901313781738ms


In [8]:
# find core points in each partition

dist_m_par1 = _get_distance_matrix(p_data)
neighbors = [_get_neighbors(i, dist_m_par1, eps) for i in range(p_data.shape[0])]
is_corep = [1 if (len(p) >= min_pts) else 0 for p in neighbors]


In [11]:
# find core grid cells in each partition
def find_core_grid(grid_rdd, eps, min_pts):
    core_grid = []
    for grid, data_ind in grid_rdd.collect():
        data = X[data_ind]
        dist_m = _get_distance_matrix(data)
        neighbors = [_get_neighbors(i, dist_m, eps) for i in range(data.shape[0])]
        is_core = [1 if (len(p) >= min_pts) else 0 for p in neighbors]
        if sum(is_core) > 0:
            core_grid.append(grid)
    return core_grid

core_grid = find_core_grid(grid_rdd, eps, min_pts)

                                                                                

In [13]:
core_grid = []
for grid, data_ind in grid_rdd.collect():
    print(grid, data_ind)
    data = X[data_ind]
    dist_m = _get_distance_matrix(data)
    neighbors = [_get_neighbors(i, dist_m, eps) for i in range(data.shape[0])]
    is_core = [1 if (len(p) >= min_pts) else 0 for p in neighbors]
    if sum(is_core) > 0:
        core_grid.append(grid)


0 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32]
8 [1, 4, 5, 7, 8, 10, 11, 12, 14, 15, 17, 19, 20, 21, 23, 24, 25, 26, 29, 32]
16 [1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 30, 31, 32]
1 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32]
9 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32]
17 [1, 4, 5, 7, 8, 10, 11, 12, 14, 15, 17, 19, 20, 21, 23, 24, 25, 26, 29, 32]
2 [0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 14, 15, 17, 19, 20, 21, 24, 25, 26, 27, 29, 32]
10 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 24, 25, 26, 27, 28, 29, 30, 31, 32]
3 [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32]
11 [0, 1, 2, 4, 5, 7, 8, 10, 11, 12, 14, 15, 

                                                                                

In [7]:
@staticmethod
def local_dbscan(partioned_rdd, method='matrix', metric='euclidian'):

    dataset = np.array([b_dataset.value[idp] for idp in partioned_rdd])
    if method == 'matrix':
        dbscan_obj = MatrixDBSCAN(dataset, b_eps.value, b_min_pts.value, metric) 
    else:
        dbscan_obj = NaiveDBSCAN(dataset, b_eps.value, b_min_pts.value, metric) 
    dbscan_obj.predict()
    is_core_list = dbscan_obj._find_core_pts()
    
    return list(zip(partioned_rdd, is_core_list, dbscan_obj.tags))


local_tags = rdd.mapValues(lambda x: local_dbscan(x, method='matrix', metric='euclidian')).collect()

predict time cost: 0.9989738464355469ms                             (0 + 8) / 8]
predict time cost: 0.16808509826660156ms
predict time cost: 2.4399757385253906ms
predict time cost: 11.595964431762695ms
predict time cost: 14.004945755004883ms
predict time cost: 3.5741329193115234ms
predict time cost: 0.10275840759277344ms
predict time cost: 8.532047271728516ms
predict time cost: 0.09703636169433594ms
predict time cost: 65.52505493164062ms>                             (4 + 4) / 8]
predict time cost: 34.03902053833008ms
predict time cost: 0.8440017700195312ms
predict time cost: 108.0329418182373ms
predict time cost: 109.83109474182129ms
predict time cost: 0.06985664367675781ms
predict time cost: 0.19621849060058594ms
predict time cost: 92.9727554321289ms
predict time cost: 11.018037796020508ms
predict time cost: 5.685091018676758ms
predict time cost: 33.27608108520508ms
                                                                                

In [8]:
local_tags

[(24,
  [(0, 0, 2),
   (1, 1, 1),
   (2, 1, 1),
   (7, 0, 1),
   (15, 0, 2),
   (19, 1, 1),
   (20, 1, 1),
   (22, 1, 2),
   (24, 1, 1),
   (26, 1, 1),
   (27, 0, 2),
   (29, 1, 1),
   (33, 1, 2),
   (34, 0, 2),
   (35, 0, 2),
   (36, 1, 1),
   (38, 1, 1),
   (39, 1, 1),
   (40, 0, 2),
   (42, 1, 1),
   (43, 1, 1),
   (45, 0, 1),
   (47, 0, 2),
   (49, 1, 1),
   (52, 1, 1),
   (53, 0, 2),
   (55, 0, 2),
   (58, 0, 2),
   (59, 1, 1),
   (61, 0, 2),
   (63, 1, 2),
   (64, 1, 2),
   (65, 1, 1),
   (68, 1, 1),
   (71, 1, 1),
   (72, 0, 2),
   (74, 0, 1),
   (75, 0, 2),
   (76, 1, 1),
   (77, 1, 1),
   (80, 1, 1),
   (82, 1, 1),
   (83, 1, 2),
   (84, 1, 1),
   (85, 1, 1),
   (92, 1, 1),
   (93, 1, 1),
   (94, 0, 1),
   (96, 0, 2),
   (97, 0, 2),
   (98, 0, 1),
   (99, 1, 1)]),
 (16,
  [(2, 0, -2),
   (3, 1, 1),
   (8, 1, 1),
   (13, 1, 1),
   (15, 1, 1),
   (17, 1, 1),
   (19, 0, -2),
   (21, 1, 1),
   (22, 1, 1),
   (25, 1, 1),
   (27, 1, 1),
   (33, 1, 1),
   (34, 1, 1),
   (35, 1, 1),
 

In [9]:
@timeit
def merge(local_tags, dataset):
    global_tags = [UNKNOWN] * len(dataset)
    is_tagged = [0] * len(dataset)
    last_max_label = 0
    for local in local_tags:
        np_local = np.array(local[-1])
        np_local[:, -1] += last_max_label

        last_max_label = np.max(np_local[:, -1])
        
        # check and merge overlapped points
        tagged_indices = np.nonzero(is_tagged)[0]
        for tmp_i in range(len(np_local)):
            # should do tag check
            p_id, is_core, label = np_local[tmp_i]
            if p_id in tagged_indices and is_core==1:
                np_local[-1][np_local[-1]==label] = global_tags[p_id]
        
        # update global tags
        for p_id, is_core, label in np_local:
            if is_tagged[p_id]==1:
                continue
            global_tags[p_id] = label
            is_tagged[p_id] = 1
    return global_tags

result_tags = merge(local_tags, dataset)


merge time cost: 7.142066955566406ms


In [10]:
len(result_tags)

100

In [11]:
rdd.collect()

[[24,
  [0,
   1,
   2,
   7,
   15,
   19,
   20,
   22,
   24,
   26,
   27,
   29,
   33,
   34,
   35,
   36,
   38,
   39,
   40,
   42,
   43,
   45,
   47,
   49,
   52,
   53,
   55,
   58,
   59,
   61,
   63,
   64,
   65,
   68,
   71,
   72,
   74,
   75,
   76,
   77,
   80,
   82,
   83,
   84,
   85,
   92,
   93,
   94,
   96,
   97,
   98,
   99]],
 [16,
  [2,
   3,
   8,
   13,
   15,
   17,
   19,
   21,
   22,
   25,
   27,
   33,
   34,
   35,
   36,
   38,
   40,
   45,
   47,
   49,
   52,
   53,
   55,
   56,
   63,
   64,
   68,
   71,
   72,
   75,
   76,
   78,
   80,
   81,
   82,
   83,
   84,
   85,
   87,
   89,
   91,
   93,
   94,
   96,
   97]],
 [25,
  [0,
   2,
   3,
   8,
   12,
   13,
   15,
   17,
   19,
   21,
   22,
   25,
   27,
   33,
   34,
   35,
   36,
   38,
   40,
   41,
   45,
   47,
   49,
   51,
   52,
   53,
   55,
   56,
   58,
   61,
   63,
   64,
   68,
   71,
   72,
   75,
   76,
   78,
   80,
   81,
   82,
   83,
   84,
   85,
  

In [16]:
from sklearn.cluster import DBSCAN

# Create an instance of the DBSCAN algorithm
dbscan = DBSCAN(eps=eps, min_samples=min_pts, metric='euclidean')

# Fit the DBSCAN algorithm to the data
labels = dbscan.fit_predict(X)


In [18]:
from sklearn.metrics import adjusted_mutual_info_score, adjusted_rand_score


# Calculate the Mutual Information (MI) score
mi_score = adjusted_mutual_info_score(result_tags, labels)

# Calculate the Adjusted Rand Index (ARI)
ari_score = adjusted_rand_score(result_tags, labels)

print("Mutual Information (MI) score:", mi_score)
print("Adjusted Rand Index (ARI):", ari_score)


Mutual Information (MI) score: 0.8014438842685137
Adjusted Rand Index (ARI): 0.7146010924553108


In [19]:
labels

array([0, 1, 1, 0, 2, 2, 2, 1, 0, 2, 2, 2, 0, 0, 2, 0, 2, 0, 2, 1, 1, 0,
       0, 2, 1, 0, 1, 0, 2, 1, 2, 2, 2, 0, 0, 0, 1, 2, 1, 1, 0, 0, 1, 1,
       2, 1, 2, 0, 2, 1, 2, 0, 1, 0, 2, 0, 0, 2, 0, 1, 2, 0, 2, 0, 0, 1,
       2, 2, 1, 2, 2, 1, 0, 2, 1, 0, 1, 1, 0, 2, 1, 0, 1, 0, 1, 1, 2, 0,
       2, 0, 2, 0, 1, 1, 1, 2, 0, 0, 1, 1])

In [21]:
np.array(result_tags)

array([2, 1, 1, 3, 6, 8, 6, 1, 3, 6, 6, 6, 4, 3, 6, 2, 6, 3, 8, 1, 1, 3,
       2, 6, 1, 3, 1, 2, 8, 1, 8, 6, 6, 2, 2, 2, 1, 6, 1, 1, 2, 4, 1, 1,
       8, 1, 6, 2, 6, 1, 6, 4, 1, 2, 8, 2, 3, 6, 2, 1, 6, 2, 6, 2, 2, 1,
       8, 8, 1, 6, 8, 1, 2, 8, 1, 2, 1, 1, 3, 6, 1, 3, 1, 2, 1, 1, 6, 3,
       6, 3, 6, 3, 1, 1, 1, 6, 2, 2, 1, 1])