In [1]:
import sys

# Put the project folder on the import path so that modules imported from it
# later in the notebook can be resolved (presumably `rock` and `util` live
# there - confirm). The membership check keeps the cell idempotent:
# re-running it does not stack duplicate entries on sys.path.
PROJECT_DIR = '/content/drive/MyDrive/overoptimism'
if PROJECT_DIR not in sys.path:
    sys.path.insert(0, PROJECT_DIR)

In [2]:
!pip install optuna

Collecting optuna
[?25l  Downloading https://files.pythonhosted.org/packages/2b/21/d13081805e1e1afc71f5bb743ece324c8bd576237c51b899ecb38a717502/optuna-2.7.0-py3-none-any.whl (293kB)
[K     |█▏                              | 10kB 14.1MB/s eta 0:00:01[K     |██▎                             | 20kB 18.2MB/s eta 0:00:01[K     |███▍                            | 30kB 10.6MB/s eta 0:00:01[K     |████▌                           | 40kB 9.3MB/s eta 0:00:01[K     |█████▋                          | 51kB 5.2MB/s eta 0:00:01[K     |██████▊                         | 61kB 5.8MB/s eta 0:00:01[K     |███████▉                        | 71kB 6.1MB/s eta 0:00:01[K     |█████████                       | 81kB 6.4MB/s eta 0:00:01[K     |██████████                      | 92kB 6.5MB/s eta 0:00:01[K     |███████████▏                    | 102kB 5.0MB/s eta 0:00:01[K     |████████████▎                   | 112kB 5.0MB/s eta 0:00:01[K     |█████████████▍                  | 122kB 5.0MB/s eta 0:

In [19]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [20]:
import optuna
from optuna.samplers import TPESampler
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs, make_moons
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_mutual_info_score as AMI
from rock import ROCK

from util import dbscan_init

import logging
import sys

In [21]:
def objective(trial):
    """Optuna objective: mean AMI advantage of ROCK over its best competitor.

    The trial samples the data-set size (``n_samples``: 100..1600 in steps
    of 100) and the noise level (``jitter``: 0.01..0.20 in steps of 0.01).
    Ten two-moons data sets are generated with random states
    ``seed .. seed + 9``, standardized, and clustered with k-means, DBSCAN,
    spectral clustering, mean shift and ROCK. Each clustering is scored
    against the ground-truth labels with the adjusted mutual information.

    Side effects:
        Appends one record (a dict) per data set to the module-level list
        ``experiment``. Each record stores the module-level ``run_id``.
        Both names must exist before the study is optimized.

    Args:
        trial: optuna trial used to sample ``n_samples`` and ``jitter``.

    Returns:
        Mean AMI of ROCK over the ten data sets minus the mean, over the
        same data sets, of the best AMI reached by the four competitors.
    """
    seed = 0
    n_datasets = 10

    # DBSCAN runs with these fixed parameters; the values previously obtained
    # from dbscan_init(data) were overwritten before use, so that call was
    # dead and is no longer made.
    eps, min_pts = (0.2, 4)

    n_samples = trial.suggest_int('n_samples', 1, 16) * 100
    jitter = trial.suggest_int('jitter', 1, 20) * 0.01

    datasets = [
        make_moons(n_samples=n_samples, noise=jitter,
                   shuffle=True, random_state=s)
        for s in range(seed, seed + n_datasets)
    ]

    rock_results = []
    other_results = []

    for step, (points, gt) in enumerate(datasets):
        data = StandardScaler().fit_transform(points)

        kmeans = KMeans(n_clusters=2, random_state=seed).fit(data).labels_
        dbscan = DBSCAN(eps=eps, min_samples=min_pts).fit(data).labels_
        # Seeded like k-means so repeated runs give the same labels.
        spectral = SpectralClustering(n_clusters=2, random_state=seed).fit(data).labels_
        bandwidth = estimate_bandwidth(data)
        mean_shift = MeanShift(bandwidth=bandwidth).fit(data).labels_
        rock = ROCK(tmax=15).fit(data).labels_

        # Score every algorithm exactly once and reuse the values below.
        rock_ami = AMI(rock, gt)
        kmeans_ami = AMI(kmeans, gt)
        dbscan_ami = AMI(dbscan, gt)
        spectral_ami = AMI(spectral, gt)
        mean_shift_ami = AMI(mean_shift, gt)

        rock_results.append(rock_ami)
        other_results.append(
            np.max([kmeans_ami, dbscan_ami, spectral_ami, mean_shift_ami]))

        experiment.append({
            'run_id': run_id,
            'step': step,
            'n_samples': n_samples,
            'jitter': jitter,
            'k': 2,
            'eps': eps,
            'min_pts': min_pts,
            'bandwidth': bandwidth,
            'ROCK': rock_ami,
            'K_MEANS': kmeans_ami,
            'DBSCAN': dbscan_ami,
            'SPECTRAL': spectral_ami,
            'MEAN_SHIFT': mean_shift_ami,
            # run_id is a module-level value shared by all trials; the trial
            # number lets records from different trials be told apart.
            'trial': trial.number,
        })

    return np.mean(rock_results) - np.mean(other_results)

In [32]:
# Module-level state used by `objective`: `experiment` collects one record
# per evaluated data set, `run_id` is stored in every record.
experiment, run_id = [], 0

In [31]:
# Name of the optuna study; also used to build the storage URL.
study_name = "two_moons"

In [30]:
# Keep the study in a SQLite file named after the study. With
# load_if_exists=True an existing study of that name is reused rather than
# raising an error.
storage_name = "sqlite:///{}.db".format(study_name)
# TPE sampler with a fixed seed.
sampler = TPESampler(seed=42)
study = optuna.create_study(
    direction="maximize",
    sampler=sampler,
    storage=storage_name,
    study_name=study_name,
    load_if_exists=True,
)

[32m[I 2021-05-15 13:42:32,067][0m A new study created in RDB with name: two_moons[0m


In [27]:
# Evaluate `objective` for 100 trials; every trial also appends its
# per-data-set records to `experiment`.
study.optimize(
    objective,
    n_trials=100,
)

[32m[I 2021-05-15 11:03:06,189][0m Trial 0 finished with value: 0.17165110355273971 and parameters: {'n_samples': 6, 'jitter': 20}. Best is trial 0 with value: 0.17165110355273971.[0m
[32m[I 2021-05-15 11:04:46,428][0m Trial 1 finished with value: 0.17069982531873318 and parameters: {'n_samples': 12, 'jitter': 12}. Best is trial 0 with value: 0.17165110355273971.[0m
[32m[I 2021-05-15 11:05:17,829][0m Trial 2 finished with value: -0.1775758381667769 and parameters: {'n_samples': 3, 'jitter': 4}. Best is trial 0 with value: 0.17165110355273971.[0m
[32m[I 2021-05-15 11:05:41,551][0m Trial 3 finished with value: 0.10051999218148261 and parameters: {'n_samples': 1, 'jitter': 18}. Best is trial 0 with value: 0.17165110355273971.[0m
[32m[I 2021-05-15 11:07:10,314][0m Trial 4 finished with value: 0.3580777415846453 and parameters: {'n_samples': 10, 'jitter': 15}. Best is trial 4 with value: 0.3580777415846453.[0m
[32m[I 2021-05-15 11:07:34,089][0m Trial 5 finished with value: 

In [33]:
# An earlier cell already requests 100 trials. Running this cell
# unconditionally would double the budget on "Restart & Run All", so it only
# tops the study up to 100 completed trials. After an interrupted run it
# resumes; after a complete one it does nothing.
target_trials = 100
completed_trials = sum(
    1 for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE
)
if completed_trials < target_trials:
    study.optimize(objective, n_trials=target_trials - completed_trials)

[32m[I 2021-05-15 13:43:38,923][0m Trial 0 finished with value: 0.17165110355273971 and parameters: {'n_samples': 6, 'jitter': 20}. Best is trial 0 with value: 0.17165110355273971.[0m
[32m[I 2021-05-15 13:45:19,665][0m Trial 1 finished with value: 0.17069982531873318 and parameters: {'n_samples': 12, 'jitter': 12}. Best is trial 0 with value: 0.17165110355273971.[0m
[32m[I 2021-05-15 13:45:50,983][0m Trial 2 finished with value: -0.1775758381667769 and parameters: {'n_samples': 3, 'jitter': 4}. Best is trial 0 with value: 0.17165110355273971.[0m
[32m[I 2021-05-15 13:46:14,766][0m Trial 3 finished with value: 0.10051999218148261 and parameters: {'n_samples': 1, 'jitter': 18}. Best is trial 0 with value: 0.17165110355273971.[0m
[32m[I 2021-05-15 13:47:43,818][0m Trial 4 finished with value: 0.3580777415846453 and parameters: {'n_samples': 10, 'jitter': 15}. Best is trial 4 with value: 0.3580777415846453.[0m
[32m[I 2021-05-15 13:48:07,560][0m Trial 5 finished with value: 

KeyboardInterrupt: ignored

In [28]:
# One row per record that `objective` appended to `experiment`.
results = pd.DataFrame(data=experiment)

In [29]:
# Write the collected records to a CSV file in the working directory
# (the DataFrame index is written as the first column).
results.to_csv(path_or_buf='two_moons.csv')