In [35]:
import optuna
import numpy as np
import pandas as pd

from sklearn.datasets import make_blobs, make_moons
from sklearn.cluster import KMeans, DBSCAN, SpectralClustering, MeanShift, estimate_bandwidth
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import adjusted_mutual_info_score as AMI
from rock import ROCK

from util import dbscan_init

import logging
import sys

In [36]:
seed = 0
n_samples = 1000
jitter = 15 * 0.01
datasets = []

jitter_grid = [1, 5, 10, 15, 20, 25, 30]
n_samples_grid = [50, 100, 200, 400, 800, 1600]

In [37]:
experiment = []
n_centers = 2
rock_results = []
other_results = []

In [38]:
for j in jitter_grid:
    step = 0
    for s in range(seed, seed+10):
        #print(f'jitter: {j}, seed: {s}')
        dataset = make_moons(n_samples=n_samples, noise=j * 0.01, shuffle=True, random_state=s) 
        run = {}
        run['step'] = step
        run['n_samples'] = n_samples
        run['n_centers'] = n_centers
        run['jitter'] = j * 0.01
        scaler = StandardScaler()
        data = scaler.fit_transform(dataset[0])
        if n_centers:
            kmeans = KMeans(n_clusters=n_centers, random_state=seed).fit(data).labels_
        elif dataset_name == 'moons':
            kmeans = KMeans(n_clusters=2, random_state=seed).fit(data).labels_
            run['k'] = 2
        eps, min_pts = dbscan_init(data)
        eps, min_pts = (0.2, 4)
        dbscan = DBSCAN(eps=eps, min_samples=min_pts).fit(data).labels_
        run['eps'] = eps
        run['min_pts'] = min_pts
        spectral = SpectralClustering(n_clusters=2).fit(data).labels_
        bandwidth = estimate_bandwidth(data)

        run['bandwidth'] = bandwidth
        mean_shift = MeanShift(bandwidth=bandwidth).fit(data).labels_

        rock = ROCK(tmax=15).fit(data).labels_

        gt = dataset[1]
        rock_results.append(AMI(rock, gt))
        other_results.append(np.max([AMI(kmeans, gt), AMI(dbscan, gt), AMI(spectral, gt), AMI(mean_shift, gt)]))

        run['ROCK'] = AMI(rock, gt)
        run['K_MEANS'] = AMI(kmeans, gt)
        run['DBSCAN'] = AMI(dbscan, gt)
        run['SPECTRAL'] = AMI(spectral, gt)
        run['MEAN_SHIFT'] = AMI(mean_shift, gt)

        step += 1

        experiment.append(run)

In [45]:
pd.DataFrame(experiment).to_csv('two_moons_analysis_jitter.csv')

In [46]:
mean_std = pd.DataFrame(experiment).groupby('jitter')[['ROCK', 'DBSCAN', 'K_MEANS', 'SPECTRAL', 'MEAN_SHIFT']].agg([np.mean, np.std])

In [2]:
plt.errorbar(mean_std.index, mean_std['ROCK']['mean'], yerr=mean_std['ROCK']['std'])
plt.errorbar(mean_std.index, mean_std['DBSCAN']['mean'], yerr=mean_std['DBSCAN']['std'])
plt.errorbar(mean_std.index, mean_std['K_MEANS']['mean'], yerr=mean_std['K_MEANS']['std'])
plt.errorbar(mean_std.index, mean_std['SPECTRAL']['mean'], yerr=mean_std['SPECTRAL']['std'])
plt.errorbar(mean_std.index, mean_std['MEAN_SHIFT']['mean'], yerr=mean_std['MEAN_SHIFT']['std'])

plt.legend(['ROCK', 'DBSCAN', 'K_MEANS', 'SPECTRAL', 'MEAN_SHIFT'])
plt.xlabel('Jitter')
plt.ylabel('Mean AMI')
plt.ylim(-0.02, 1)
plt.tight_layout()
plt.savefig('two_moons_num_jitter.png')

NameError: name 'plt' is not defined

In [49]:
from matplotlib import pyplot as plt

In [42]:
colors = ['red', 'green']

In [43]:
pyplot.scatter(x=datasets[3][0][:, 0], y=datasets[3][0][:, 1], marker='.', c=datasets[3][1])

IndexError: list index out of range

In [None]:
np.mean(np.array(rock_results) - np.array(other_results))

In [None]:
np.mean(rock_results) - np.mean(other_results)