In [1]:
import numpy as np
import pandas as pd
import numba
import sklearn.datasets
import pynndescent
import hdbscan
import matplotlib.pyplot as plt
import seaborn as sns
import math
import scipy.sparse
import sklearn.metrics
from matplotlib import pyplot
from sklearn.inspection import DecisionBoundaryDisplay
from matplotlib.gridspec import GridSpec
from joblib import Parallel, delayed
from time import time, ctime


from collections import namedtuple
from sklearn.preprocessing import normalize
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedShuffleSplit

sns.set(rc={"figure.figsize":(12, 12)})

# Load HighDimClusterer class

In [2]:
execfile('00-HighDimClusterer.py')

  warn(


In [3]:
mnist = sklearn.datasets.fetch_openml("mnist_784")

  warn(


In [4]:
parameters = {
    'min_cluster_size' : 800,
    'n_neighbors' : 15,
    'max_iter' : 50,
    'label_prior' : 0.99,
    'noise_prior' : 0.01,
    'k' : 1,
    'min_prob' : 1e-4,
    'model_prior_strength' : 0.0,
    'n_iter' : 3
}

In [6]:
clusterer = HighDimClusterer(**parameters)
cluster_labels = clusterer.fit_predict(mnist.data)
sklearn.metrics.adjusted_rand_score(cluster_labels, mnist.target)

0.9013990863532596

## Experiments

We'll run this algorithm (with tuned parameters) and compare it with a combination of UMAP and HDBSCAN (with tuned parameters). This algorithm was developed largely against MNIST, so it is perhaps somewhat overtuned to MNIST. You may want to substitute in your own favourite dataset instead; alhtough it should be "high dimensional" enough the not require Gamma distribution models (above about 10 should probably be enough).

In [13]:
mnist = sklearn.datasets.fetch_openml("mnist_784")

  warn(


## Parallel hyperparam search

In [14]:
def test_harness(param_dict, outer_loop_dict, train_size, replications=5):
    results_list = []
    parameters = param_dict | outer_loop_dict
    ss = StratifiedShuffleSplit(n_splits=replications, train_size=train_size, random_state=42)
    for i, (train_index, test_index) in enumerate(ss.split(X, y)): 
        clusterer = HighDimClusterer(**parameters)
        start = time()
        cluster_labels = clusterer.fit_predict(X[train_index,:])
        results = parameters.copy()
        results['iteration'] = i
        results['train_size'] = train_size
        results['duration'] = time()-start
        results['adj_rand'] = sklearn.metrics.adjusted_rand_score(cluster_labels, y[train_index])
        non_noise_index = cluster_labels!= -1
        results['adj_rand_no_noise'] = sklearn.metrics.adjusted_rand_score(cluster_labels[non_noise_index], y[train_index][non_noise_index])
        results['number_of_clusters'] = len(np.unique(cluster_labels))
        results['proportion_of_noise'] = np.sum(cluster_labels==-1)/len(cluster_labels)
        results_list.append(results)
    return results_list

In [15]:
from sklearn.model_selection import ParameterGrid
from joblib import Parallel, delayed

In [16]:
param_dict = {
    'n_neighbors': [5,15,50],
    #'min_cluster_size': [600, 800, 1000, 1500],
    'min_prob': [1e-16, 1e-8, 1e-4, 1e-2],
    'k': [1,2],
    'model_prior_strength':[0, 0.1, 0.5],
    'n_iter':[2,3, 5],
#    'label_prior': [0.8, 0.9, 0.97],
#    'noise_prior': [0.1, 0.2, 0.3],
#    'max_iter':[50, 100, 150],
#    'tolerance':[1e-5, 5e-4, 1e-4],
}
param_list = ParameterGrid(param_dict)
len(param_list)

216

In [17]:
outer_loop = {
    'min_cluster_size': [600, 800, 1000, 1500],
}
outer_loop_list = ParameterGrid(outer_loop)
len(outer_loop_list)

4

In [18]:
import warnings
with warnings.catch_warnings():
    warnings.filterwarnings('ignore', r'Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.')

In [29]:
%%time
numba.set_num_threads(1)
cores = 8
reps= 5
X = np.array(mnist.data)
y = np.array(mnist.target)
train_sizes=[0.9, 0.75, 0.5] 

results_list = []
for train_size in train_sizes:
    for i,outer_loop_params in enumerate(outer_loop_list):
        total_runs = len(param_list)*reps
        print(f"{ctime()} starting {outer_loop_params} with sample size = {train_size}")
        start_time = time()
        results_list.append(list(np.concatenate(Parallel(n_jobs=cores)(delayed(test_harness)(param,  outer_loop_params, train_size, replications=reps) for param in param_list))))
        result_df = pd.DataFrame(list(np.concatenate(results_list)))
        result_df.to_csv(f"HyperParameterSearch_test_train_size{train_size}_params_{i}.csv", index=False)
        wall_time = time()-start_time
        total_runs = len(param_list)*reps
        print(f'train size = {train_size}, n_runs = {total_runs} done in {wall_time:.3f} seconds on {cores} cores, time per run={(total_runs/wall_time):0.3f}')

Fri Sep  8 20:28:00 2023 starting {'min_cluster_size': 600} with sample size = 0.9


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.9, n_runs = 1080 done in 1911.275 seconds on 8 cores, time per run=0.565
Fri Sep  8 20:59:52 2023 starting {'min_cluster_size': 800} with sample size = 0.9


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.9, n_runs = 1080 done in 1670.704 seconds on 8 cores, time per run=0.646
Fri Sep  8 21:27:42 2023 starting {'min_cluster_size': 1000} with sample size = 0.9


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.9, n_runs = 1080 done in 1749.450 seconds on 8 cores, time per run=0.617
Fri Sep  8 21:56:52 2023 starting {'min_cluster_size': 1500} with sample size = 0.9


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.9, n_runs = 1080 done in 2173.660 seconds on 8 cores, time per run=0.497
Fri Sep  8 22:33:05 2023 starting {'min_cluster_size': 600} with sample size = 0.75


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.75, n_runs = 1080 done in 1722.770 seconds on 8 cores, time per run=0.627
Fri Sep  8 23:01:48 2023 starting {'min_cluster_size': 800} with sample size = 0.75


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.75, n_runs = 1080 done in 1446.287 seconds on 8 cores, time per run=0.747
Fri Sep  8 23:25:54 2023 starting {'min_cluster_size': 1000} with sample size = 0.75


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.75, n_runs = 1080 done in 1362.016 seconds on 8 cores, time per run=0.793
Fri Sep  8 23:48:36 2023 starting {'min_cluster_size': 1500} with sample size = 0.75


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.75, n_runs = 1080 done in 1279.903 seconds on 8 cores, time per run=0.844
Sat Sep  9 00:09:56 2023 starting {'min_cluster_size': 600} with sample size = 0.5


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.5, n_runs = 1080 done in 1718.471 seconds on 8 cores, time per run=0.628
Sat Sep  9 00:38:35 2023 starting {'min_cluster_size': 800} with sample size = 0.5


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.5, n_runs = 1080 done in 1569.752 seconds on 8 cores, time per run=0.688
Sat Sep  9 01:04:45 2023 starting {'min_cluster_size': 1000} with sample size = 0.5


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.5, n_runs = 1080 done in 1557.043 seconds on 8 cores, time per run=0.694
Sat Sep  9 01:30:42 2023 starting {'min_cluster_size': 1500} with sample size = 0.5


Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid state in the child process.
Numba: Attempted to fork from a non-main thread, the TBB library may be in an invalid st

train size = 0.5, n_runs = 1080 done in 1603.953 seconds on 8 cores, time per run=0.673
CPU times: user 1min 11s, sys: 28.6 s, total: 1min 39s
Wall time: 5h 29min 25s


In [110]:
type(wall_time)

float

In [30]:
print(f'train size = {train_size}, n_runs = {total_runs} done in {wall_time:.3f}')

train size = 0.5, n_runs = 1080 done in 1603.953


In [32]:
result_df.shape

(12960, 13)

In [31]:
result_df = pd.DataFrame(list(np.concatenate(results_list)))
result_df.to_csv("HyperParameterSearch_September11_mark2.csv", index=False)

In [33]:
print(f'runs = {len(result_df)} total time = {wall_time} average time = {wall_time/len(result_df)}')

runs = 12960 total time = 1603.9532499313354 average time = 0.12376182484038083


In [37]:
12960/(5.5*3600)

5.236363636363636

In [45]:
result_df.train_size.value_counts()

train_size
0.90    4320
0.75    4320
0.50    4320
Name: count, dtype: int64