In [5]:
# Prepend the directory two levels up to the module search path
# (presumably so the local `rock` module imported below resolves — confirm).
import sys

sys.path.insert(0, '../..')

In [None]:
# Enable IPython's autoreload extension in mode 2: imported modules are
# reloaded before each cell runs, so edits to local source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [8]:
import optuna
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs, make_moons
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_mutual_info_score as AMI
from rock import ROCK

import logging
import sys

In [9]:
def objective(trial):
    """Optuna objective: mean AMI of DBSCAN over ten two-moons datasets.

    Hyperparameters sampled from ``trial``:
      * ``min_pts`` -- integer in [2, 100], passed to DBSCAN as ``min_samples``.
      * ``eps``     -- integer in [1, 10]; the value handed to DBSCAN is that
        integer multiplied by 0.1 (the study itself stores the integer).

    For each ``random_state`` in 0..9 a ``make_moons`` dataset (1000 samples,
    noise 0.15) is generated, standardized, clustered with DBSCAN, and the
    labelling is scored against the ground truth with adjusted mutual
    information (AMI).

    Side effects:
        Appends one record (dict) per dataset to the module-level list
        ``experiment`` and reads the module-level ``run_id``. Both names must
        exist before the study is optimized.

    Returns:
        The mean AMI over the ten datasets.
    """
    seed = 0
    n_samples = 1000
    jitter = 0.15

    min_pts = trial.suggest_int('min_pts', 2, 100)
    # Sampled as an integer and scaled, so the search grid is 0.1, 0.2, ..., 1.0.
    eps = trial.suggest_int('eps', 1, 10) * 0.1

    scores = []
    for step, random_state in enumerate(range(seed, seed + 10)):
        points, gt = make_moons(n_samples=n_samples, noise=jitter,
                                shuffle=True, random_state=random_state)
        data = StandardScaler().fit_transform(points)
        labels = DBSCAN(eps=eps, min_samples=min_pts).fit(data).labels_

        # Score once and reuse the value for both the objective and the log.
        score = AMI(labels, gt)
        scores.append(score)

        # Key order is kept stable because it defines the column order of the
        # DataFrame later built from `experiment`.
        experiment.append({
            'run_id': run_id,
            'step': step,
            'n_samples': n_samples,
            'jitter': jitter,
            'eps': eps,
            'min_pts': min_pts,
            'DBSCAN': score,
        })

    return np.mean(scores)

In [10]:
# Module-level state used by `objective`:
#   run_id     -- stored verbatim in every record (never changed by `objective`)
#   experiment -- list that `objective` appends one record per dataset to
run_id = 0
experiment = []

In [11]:
# Name of the Optuna study; also used to build the SQLite storage file name.
study_name = "two_moons_opt"

In [12]:
# Persist the study in a local SQLite file named after the study. With
# load_if_exists=True an existing study of the same name is reused rather
# than raising an error.
storage_name = f"sqlite:///{study_name}.db"
sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    sampler=sampler,
    load_if_exists=True,
    direction="maximize",
)

[32m[I 2021-05-31 15:54:25,554][0m A new study created in RDB with name: two_moons_opt[0m


In [13]:
# Run 100 trials; each call to `objective` also appends ten records to
# `experiment`.
study.optimize(func=objective, n_trials=100)

[32m[I 2021-05-31 15:54:48,652][0m Trial 0 finished with value: 0.0 and parameters: {'min_pts': 56, 'eps': 8}. Best is trial 0 with value: 0.0.[0m
[32m[I 2021-05-31 15:54:48,847][0m Trial 1 finished with value: 0.0001820904250210742 and parameters: {'min_pts': 61, 'eps': 6}. Best is trial 1 with value: 0.0001820904250210742.[0m
[32m[I 2021-05-31 15:54:49,067][0m Trial 2 finished with value: 7.672985006701068e-16 and parameters: {'min_pts': 43, 'eps': 7}. Best is trial 1 with value: 0.0001820904250210742.[0m
[32m[I 2021-05-31 15:54:49,288][0m Trial 3 finished with value: 0.0 and parameters: {'min_pts': 45, 'eps': 9}. Best is trial 1 with value: 0.0001820904250210742.[0m
[32m[I 2021-05-31 15:54:49,466][0m Trial 4 finished with value: 0.0 and parameters: {'min_pts': 97, 'eps': 4}. Best is trial 1 with value: 0.0001820904250210742.[0m
[32m[I 2021-05-31 15:54:49,658][0m Trial 5 finished with value: 0.0009439327152330999 and parameters: {'min_pts': 80, 'eps': 6}. Best is tri

[32m[I 2021-05-31 15:54:57,135][0m Trial 46 finished with value: 0.5842809606119299 and parameters: {'min_pts': 39, 'eps': 4}. Best is trial 14 with value: 0.8223943608040875.[0m
[32m[I 2021-05-31 15:54:57,297][0m Trial 47 finished with value: -0.00041245973435900183 and parameters: {'min_pts': 2, 'eps': 3}. Best is trial 14 with value: 0.8223943608040875.[0m
[32m[I 2021-05-31 15:54:57,478][0m Trial 48 finished with value: 0.2843963826403294 and parameters: {'min_pts': 24, 'eps': 2}. Best is trial 14 with value: 0.8223943608040875.[0m
[32m[I 2021-05-31 15:54:57,629][0m Trial 49 finished with value: 0.0 and parameters: {'min_pts': 100, 'eps': 1}. Best is trial 14 with value: 0.8223943608040875.[0m
[32m[I 2021-05-31 15:54:57,799][0m Trial 50 finished with value: 0.7039805622386408 and parameters: {'min_pts': 35, 'eps': 3}. Best is trial 14 with value: 0.8223943608040875.[0m
[32m[I 2021-05-31 15:54:57,968][0m Trial 51 finished with value: 0.7881525424337006 and parameters

[32m[I 2021-05-31 15:55:05,403][0m Trial 92 finished with value: 0.829386229002961 and parameters: {'min_pts': 42, 'eps': 4}. Best is trial 80 with value: 0.8300476666603241.[0m
[32m[I 2021-05-31 15:55:05,603][0m Trial 93 finished with value: 0.8300476666603241 and parameters: {'min_pts': 41, 'eps': 4}. Best is trial 80 with value: 0.8300476666603241.[0m
[32m[I 2021-05-31 15:55:05,803][0m Trial 94 finished with value: -0.0006429245958157786 and parameters: {'min_pts': 42, 'eps': 5}. Best is trial 80 with value: 0.8300476666603241.[0m
[32m[I 2021-05-31 15:55:05,987][0m Trial 95 finished with value: 0.8300476666603241 and parameters: {'min_pts': 41, 'eps': 4}. Best is trial 80 with value: 0.8300476666603241.[0m
[32m[I 2021-05-31 15:55:06,187][0m Trial 96 finished with value: 0.8298539801615614 and parameters: {'min_pts': 40, 'eps': 4}. Best is trial 80 with value: 0.8300476666603241.[0m
[32m[I 2021-05-31 15:55:06,387][0m Trial 97 finished with value: 0.8300476666603241 a

In [14]:
# One row per (trial, dataset) record collected by `objective`.
results = pd.DataFrame(data=experiment)

In [15]:
# Write the per-dataset records to disk (the index is written as the first column).
results.to_csv(path_or_buf='two_moons_opt.csv')

In [17]:
# `objective` appends ten rows per trial, so rows 800-809 are the rows of
# trial 80 (reported as the best trial in the optimization log above),
# provided `experiment` was filled by a single optimize run.
# Result: standard deviation of that trial's AMI across its ten datasets.
results.iloc[slice(80 * 10, 81 * 10)].std().loc['DBSCAN']

0.024435737741039746