In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
import sys  
sys.path.insert(0, '/content/drive/MyDrive/overoptimism')

In [13]:
!pip install optuna



In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
import optuna
from optuna.samplers import TPESampler
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs, make_moons, make_circles
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_mutual_info_score as AMI
from rock import ROCK

import logging
import sys

In [16]:
experiment = []
run_id = 0

In [17]:
def objective(trial):
    seed = 0

    rock_results = []
    other_results = []

    n_samples = trial.suggest_int('n_samples', 1, 16) * 100
    jitter = trial.suggest_int('jitter', 1, 20) * 0.01
    factor = trial.suggest_int('factor', 1, 9) * 0.1

    datasets = []

    for s in range(seed, seed+10):
        datasets.append(make_circles(n_samples=n_samples,
                                     noise=jitter, 
                                     random_state=s))
        
    step = 0
    for dataset in datasets: 
        run = {}
        run['run_id'] = run_id
        run['step'] = step
        run['n_samples'] = n_samples
        run['jitter'] = jitter
        run['factor'] = factor
        scaler = StandardScaler()
        data = scaler.fit_transform(dataset[0])
        kmeans = KMeans(n_clusters=2, random_state=seed).fit(data).labels_
        run['k'] = 2
        eps, min_pts = (0.2, 4)
        dbscan = DBSCAN(eps=eps, min_samples=min_pts).fit(data).labels_
        run['eps'] = eps
        run['min_pts'] = min_pts
        spectral = SpectralClustering(n_clusters=2).fit(data).labels_
        bandwidth = estimate_bandwidth(data)
        run['bandwidth'] = bandwidth
        mean_shift = MeanShift(bandwidth=bandwidth).fit(data).labels_

        rock = ROCK(tmax=15).fit(data).labels_

        gt = dataset[1]
        rock_results.append(AMI(rock, gt))
        other_results.append(np.max([AMI(kmeans, gt), AMI(dbscan, gt), AMI(spectral, gt), AMI(mean_shift, gt)]))

        run['ROCK'] = AMI(rock, gt)
        run['K_MEANS'] = AMI(kmeans, gt)
        run['DBSCAN'] = AMI(dbscan, gt)
        run['SPECTRAL'] = AMI(spectral, gt)
        run['MEAN_SHIFT'] = AMI(mean_shift, gt)

        step += 1

        experiment.append(run)

    return np.mean(rock_results) - np.mean(other_results)

In [18]:
study_name = 'rings'

In [19]:
storage_name = "sqlite:///{}.db".format(study_name)
sampler = TPESampler(seed=42)
study = optuna.create_study(study_name=study_name, storage=storage_name, sampler=sampler, load_if_exists=True,  direction="maximize")

[32m[I 2021-05-15 11:21:24,668][0m A new study created in RDB with name: rings[0m


In [20]:
study.optimize(objective, n_trials=100)

[32m[I 2021-05-15 11:22:17,697][0m Trial 0 finished with value: -0.021426565842398167 and parameters: {'n_samples': 6, 'jitter': 20, 'factor': 7}. Best is trial 0 with value: -0.021426565842398167.[0m
[32m[I 2021-05-15 11:23:06,179][0m Trial 1 finished with value: 0.015410564858046288 and parameters: {'n_samples': 10, 'jitter': 4, 'factor': 2}. Best is trial 1 with value: 0.015410564858046288.[0m
[32m[I 2021-05-15 11:23:26,039][0m Trial 2 finished with value: -0.00574496923838384 and parameters: {'n_samples': 1, 'jitter': 18, 'factor': 6}. Best is trial 1 with value: 0.015410564858046288.[0m
[32m[I 2021-05-15 11:24:08,467][0m Trial 3 finished with value: -0.8263173157086038 and parameters: {'n_samples': 12, 'jitter': 1, 'factor': 9}. Best is trial 1 with value: 0.015410564858046288.[0m
[32m[I 2021-05-15 11:25:25,024][0m Trial 4 finished with value: 0.006960351575107408 and parameters: {'n_samples': 14, 'jitter': 5, 'factor': 2}. Best is trial 1 with value: 0.0154105648580

In [21]:
pd.DataFrame(experiment).to_csv('rings.csv')