In [1]:
import sys
# Prepend the directory two levels above this notebook to the module search
# path so that imports are resolved there first.
# NOTE(review): presumably this is what makes the local `rock` package
# importable -- confirm against the repository layout.
sys.path.insert(0, '../..')

In [2]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import optuna
from optuna.samplers import TPESampler
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs, make_moons
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_mutual_info_score as AMI
from rock import ROCK

import logging
import sys

In [4]:
def getBlobDensities(n_centers):
    """Return ``n_centers`` evenly spaced values ending at 3, rounded to 2 decimals.

    The i-th value (1-based) is ``3 / n_centers * i``. In this notebook the
    result is passed to ``make_blobs`` as ``cluster_std``, so each blob gets a
    different spread.

    Parameters
    ----------
    n_centers : int
        Number of values to produce.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_centers`` (empty when ``n_centers`` is 0).
    """
    spreads = []
    for rank in range(1, n_centers + 1):
        spreads.append(3 / n_centers * rank)
    return np.round(spreads, decimals=2)

In [5]:
def objective(trial):
    """Score one Optuna trial: mean AMI advantage of ROCK over the best baseline.

    The trial samples a dataset shape (sample count, dimensionality, number of
    centers). Ten blob datasets are generated for that shape (random states
    0..9), standardised, and clustered with K-Means, DBSCAN, spectral
    clustering, mean shift and ROCK. Every clustering is scored with adjusted
    mutual information (AMI) against the generated labels.

    Side effect: one record per dataset is appended to the module-level
    ``experiment`` list (parameters used plus the five AMI scores).

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``n_samples`` (1..16, multiplied by 100),
        ``n_features`` (2..20) and ``n_centers`` (2..10).

    Returns
    -------
    float
        Mean ROCK AMI minus the mean of the per-dataset best baseline AMI.
        Positive values mean ROCK outperformed the best baseline on average.
    """
    seed = 0
    n_repeats = 10

    n_samples = trial.suggest_int('n_samples', 1, 16) * 100
    n_features = trial.suggest_int('n_features', 2, 20)
    n_centers = trial.suggest_int('n_centers', 2, 10)

    # Settings that do not change between repetitions are computed once.
    cluster_std = getBlobDensities(n_centers)
    eps, min_pts = 0.2, 2 * n_features

    rock_results = []
    other_results = []

    # Datasets are generated one at a time instead of all up front; each one
    # is fully determined by its own random_state, so results are unchanged.
    for step, dataset_seed in enumerate(range(seed, seed + n_repeats)):
        points, gt = make_blobs(
            n_samples=n_samples,
            centers=n_centers,
            n_features=n_features,
            cluster_std=cluster_std,
            random_state=dataset_seed,
        )
        data = StandardScaler().fit_transform(points)

        kmeans = KMeans(n_clusters=n_centers, random_state=seed).fit(data).labels_
        dbscan = DBSCAN(eps=eps, min_samples=min_pts).fit(data).labels_
        spectral = SpectralClustering(n_clusters=n_centers, random_state=seed).fit(data).labels_
        bandwidth = estimate_bandwidth(data)
        mean_shift = MeanShift(bandwidth=bandwidth).fit(data).labels_
        rock = ROCK(tmax=15).fit(data).labels_

        # Score every clustering exactly once and reuse the values for both
        # the objective and the exported record.
        rock_score = AMI(rock, gt)
        baseline_scores = {
            'K_MEANS': AMI(kmeans, gt),
            'DBSCAN': AMI(dbscan, gt),
            'SPECTRAL': AMI(spectral, gt),
            'MEAN_SHIFT': AMI(mean_shift, gt),
        }

        rock_results.append(rock_score)
        other_results.append(np.max(list(baseline_scores.values())))

        experiment.append({
            # The trial's own number identifies the run; the module-level
            # `run_id` was never incremented, so every row used to carry 0.
            'run_id': trial.number,
            'step': step,
            'n_samples': n_samples,
            'n_centers': n_centers,
            'n_features': n_features,
            'k': n_centers,
            'eps': eps,
            # Record the value actually passed to DBSCAN.
            'min_pts': min_pts,
            'bandwidth': bandwidth,
            'ROCK': rock_score,
            **baseline_scores,
        })

    return np.mean(rock_results) - np.mean(other_results)

In [6]:
# Module-level state for this notebook run. `experiment` collects one dict per
# clustering run (appended inside `objective`); re-running this cell discards
# everything collected so far.
run_id = 0
experiment = list()

In [7]:
# Name of the Optuna study; also used to build the SQLite storage file name.
study_name = 'den_blobs'

In [8]:
# Seeded TPE sampler so the sequence of suggested parameters is repeatable.
sampler = TPESampler(seed=42)

# The study is persisted in a SQLite file named after the study. With
# load_if_exists=True an existing study of that name is reused instead of
# raising an error.
storage_name = f'sqlite:///../../results/optimization/{study_name}.db'

study = optuna.create_study(
    study_name=study_name,
    storage=storage_name,
    sampler=sampler,
    direction="maximize",
    load_if_exists=True,
)

[32m[I 2021-05-30 19:04:22,442][0m A new study created in RDB with name: den_blobs[0m


In [9]:
# Run 100 trials of `objective`. Each trial clusters ten datasets with five
# algorithms, so this cell is slow: the captured log below starts at 19:04 and
# is still running at 20:40.
study.optimize(objective, n_trials=100)

[32m[I 2021-05-30 19:05:55,631][0m Trial 0 finished with value: -0.6082313190701353 and parameters: {'n_samples': 6, 'n_features': 20, 'n_centers': 8}. Best is trial 0 with value: -0.6082313190701353.[0m
[32m[I 2021-05-30 19:06:51,797][0m Trial 1 finished with value: -0.03773364939895252 and parameters: {'n_samples': 10, 'n_features': 4, 'n_centers': 3}. Best is trial 1 with value: -0.03773364939895252.[0m
[32m[I 2021-05-30 19:06:58,664][0m Trial 2 finished with value: -0.8526323451498112 and parameters: {'n_samples': 1, 'n_features': 18, 'n_centers': 7}. Best is trial 1 with value: -0.03773364939895252.[0m
[32m[I 2021-05-30 19:08:33,000][0m Trial 3 finished with value: -0.2194917520058074 and parameters: {'n_samples': 12, 'n_features': 2, 'n_centers': 10}. Best is trial 1 with value: -0.03773364939895252.[0m
[32m[I 2021-05-30 19:09:44,174][0m Trial 4 finished with value: -0.16328197707720904 and parameters: {'n_samples': 14, 'n_features': 6, 'n_centers': 3}. Best is tria

[32m[I 2021-05-30 20:18:25,229][0m Trial 39 finished with value: -0.2601294109595216 and parameters: {'n_samples': 5, 'n_features': 8, 'n_centers': 3}. Best is trial 37 with value: 0.0017796584262947945.[0m
[32m[I 2021-05-30 20:19:31,135][0m Trial 40 finished with value: -0.3380548439321752 and parameters: {'n_samples': 12, 'n_features': 6, 'n_centers': 7}. Best is trial 37 with value: 0.0017796584262947945.[0m
[32m[I 2021-05-30 20:20:45,384][0m Trial 41 finished with value: 0.0009214663206327245 and parameters: {'n_samples': 14, 'n_features': 10, 'n_centers': 2}. Best is trial 37 with value: 0.0017796584262947945.[0m
[32m[I 2021-05-30 20:21:50,254][0m Trial 42 finished with value: 0.0009214663206327245 and parameters: {'n_samples': 14, 'n_features': 10, 'n_centers': 2}. Best is trial 37 with value: 0.0017796584262947945.[0m
[32m[I 2021-05-30 20:22:35,367][0m Trial 43 finished with value: -0.2620591450294322 and parameters: {'n_samples': 14, 'n_features': 9, 'n_centers': 

[32m[I 2021-05-30 20:39:35,216][0m Trial 79 finished with value: -0.2653784512690106 and parameters: {'n_samples': 4, 'n_features': 3, 'n_centers': 4}. Best is trial 73 with value: 0.033407548327088366.[0m
[32m[I 2021-05-30 20:39:41,405][0m Trial 80 finished with value: 0.022570992101012877 and parameters: {'n_samples': 2, 'n_features': 2, 'n_centers': 3}. Best is trial 73 with value: 0.033407548327088366.[0m
[32m[I 2021-05-30 20:39:47,769][0m Trial 81 finished with value: 0.022570992101012877 and parameters: {'n_samples': 2, 'n_features': 2, 'n_centers': 3}. Best is trial 73 with value: 0.033407548327088366.[0m
[32m[I 2021-05-30 20:39:59,660][0m Trial 82 finished with value: -0.0312123413451616 and parameters: {'n_samples': 4, 'n_features': 3, 'n_centers': 3}. Best is trial 73 with value: 0.033407548327088366.[0m
[32m[I 2021-05-30 20:40:06,404][0m Trial 83 finished with value: -0.5126161090141399 and parameters: {'n_samples': 2, 'n_features': 4, 'n_centers': 4}. Best is 

In [10]:
# Export every per-dataset record collected in `experiment` to a CSV file.
# The file name is derived from `study_name` so it always matches the study's
# SQLite file instead of repeating the literal name.
pd.DataFrame(experiment).to_csv(f'../../results/optimization/{study_name}.csv')