In [None]:
import sys
# Insert the parent directory into the module search path at position 1,
# i.e. right after the first entry -- presumably so that packages living one
# level above this notebook can be imported without installation (confirm).
sys.path.insert(1, '../')

In [None]:
import matplotlib.pyplot as pl
import numpy as np
import pandas as pd
from reprclust import sim, stability
from scipy.spatial.distance import squareform
from sklearn.feature_extraction.image import grid_to_graph
from sklearn.cross_validation import ShuffleSplit
from time import time

In [None]:
%pylab inline

In [None]:
# Target RDMs over 6 conditions.  Every RDM is stored in condensed form: the
# 15 upper-triangle entries of the symmetric 6x6 matrix, as returned by
# squareform.

# subordinate: every pair of distinct conditions is at distance 2
subordinate = squareform(2. * (1 - np.eye(6)))

# superordinate: three groups of two conditions -- distance 1 inside a group,
# distance 2 across groups (multiplying by 1 - eye zeroes the diagonal)
superordinate = squareform(
    (2. - np.kron(np.eye(3), np.ones((2, 2)))) * (1 - np.eye(6)))

# animate: two groups of three conditions -- distance 1 inside a group,
# distance 2 across groups
animate = squareform(
    (2. - np.kron(np.eye(2), np.ones((3, 3)))) * (1 - np.eye(6)))

# continuum: the distance between conditions i and j is the |i - j|-th entry
# of the level table below (looked up, not computed, so the values stay
# exactly the literals listed here)
continuum = squareform(
    np.array([0, .4, .8, 1.2, 1.6, 2])[
        np.abs(np.subtract.outer(np.arange(6), np.arange(6)))])

# faces: same as subordinate except that the first two conditions are at
# distance 1; that pair is entry 0 of the condensed vector
faces = subordinate.copy()
faces[0] = 1.

# random: 15 condensed distances drawn uniformly from [0, 2) with a fixed seed
np.random.seed(42)
random_rsa = np.random.uniform(low=0, high=2, size=15)

In [None]:
# Show every target RDM as a square image, one panel per RDM (2 x 3 grid).
rsas = ['subordinate', 'superordinate', 'animate',
        'continuum', 'faces', 'random_rsa']
# Pair each title with its condensed vector explicitly instead of looking the
# variable up by name with eval().
rsa_vectors = [subordinate, superordinate, animate,
               continuum, faces, random_rsa]

fig = pl.figure(figsize=(12, 10))
for i, (rsa, rsa_vector) in enumerate(zip(rsas, rsa_vectors)):
    ax = fig.add_subplot(2, 3, i+1)
    # squareform expands the condensed vector back into the full square matrix
    ax.imshow(squareform(rsa_vector), interpolation='nearest')
    ax.set_title(rsa)
    ax.axis('off')
# The layout only needs to be adjusted once, after all panels exist.
pl.tight_layout()

In [None]:
# Radius handed to sim.Sphere to define the ROI neighborhood of the
# simulation; the value is also embedded in the name of the output CSV file.
radius_sphere = 6.5

In [None]:
# Generate the simulated data sets.  The global NumPy RNG is seeded right
# before the call.
np.random.seed(101)
a_clean, cluster_truth, dss = sim.simple_sim1(
    (50, 50),
    [subordinate, superordinate, animate, continuum, faces, random_rsa],
    roi_neighborhood=sim.Sphere(radius_sphere),
    nruns=1,
    nsubjects=10,
    # Noise components: for now only normal noise is added, with spatial
    # smoothing so that the different kinds of noise may differ in their
    # characteristics.
    #
    # "Instrumental noise": generic nuisance.
    noise_independent_std=5,
    noise_independent_smooth=1.5,
    # "Intrinsic signal" specific to each subject (motion, etc.): could be
    # interesting to cluster for someone, but irrelevant here.
    noise_subject_n=1,
    noise_subject_std=2,
    noise_subject_smooth=2.,
    # "Intrinsic common signal": probably generalizes across subjects (e.g.
    # reproducible clusters for someone studying veins) and is also mixed in
    # with different weights per run.  Not of interest here, since it would
    # not be representative of the original signal.
    # NOTE(review): its std is 0 in this call, which presumably switches the
    # component off -- confirm against simple_sim1.
    noise_common_n=1,
    noise_common_std=0,
    noise_common_smooth=2.)

In [None]:
# Display the ground-truth cluster map returned by the simulation.
# The explicitly imported `pl` (matplotlib.pyplot) is used throughout, rather
# than the bare `figure` / `plt` names that only exist because `%pylab inline`
# injected them into the namespace.
fig = pl.figure()
ax = fig.add_subplot(111)
img = ax.imshow(cluster_truth, interpolation='nearest')
# Trailing semicolons suppress the echoed return values in the notebook.
ax.set_title('Cluster Ground Truth');
ax.axis('off');
pl.colorbar(img);

In [None]:
# Pull the sample array out of every simulated dataset and apply the Fisher
# transform (arctanh) to it, keeping one transformed array per dataset.
samples = []
for ds in dss:
    samples.append(np.arctanh(ds.samples))

In [None]:
# Plot average across DSM: for every subject, the mean of its samples over
# the first axis, reshaped to the shape of cluster_truth and shown as an image.
# The number of panels follows len(samples) instead of a hard-coded 10, and
# the grid grows in rows of 5 (2 x 5 for ten subjects).
n_cols = 5
n_rows = (len(samples) + n_cols - 1) // n_cols  # ceiling division
fig = pl.figure(figsize=(12, 10))
for i, subject_samples in enumerate(samples):
    ax = fig.add_subplot(n_rows, n_cols, i+1)
    ax.imshow(np.mean(subject_samples, axis=0).reshape(*cluster_truth.shape))
    ax.axis('off')
    ax.set_title('Subject {0}'.format(i+1))
# The layout only needs to be adjusted once, after all panels exist.
pl.tight_layout()

In [None]:
# create 'connectivity' matrix for this 2D image
# The grid dimensions are taken from cluster_truth.shape; the result is later
# passed as the `connectivity` option of the 'ward-str' algorithm.
connectivity = grid_to_graph(*cluster_truth.shape)

In [None]:
# Settings of the stability analysis
n_splits = 20  # number of random splits (passed to ShuffleSplit as n_iter)
max_k = 10  # forwarded to compute_stability as max_k (presumably the largest number of clusters tried -- confirm)
corr_score_type = 'pearson'  # forwarded as corr_score; also part of the output filename

# run simulation for all methods -- takes around 10m on a quad-core macbook air
# The splitter shuffles the indices 0 .. len(samples)-1 and holds out half of
# them in each split; the fixed random_state makes the splits repeatable.
splitter = ShuffleSplit(len(samples), n_iter=n_splits, test_size=.5,
                        random_state=42)
# Filled by the loop that runs the algorithms: label -> compute_stability output
result = {}
# label -> (method name, extra keyword arguments); each pair is unpacked and
# forwarded to compute_stability.  'ward-str' differs from 'ward-unstr' only
# by the connectivity argument, and the four gmm entries only by
# covariance_type.
algorithms = {'complete': ('complete', {}),
              'ward-str': ('ward', {'connectivity': connectivity}), 
              'ward-unstr': ('ward', {}), 
              'kmeans': ('kmeans', {}), 
              'gmm-tied': ('gmm', {'covariance_type': 'tied'}),
              'gmm-diag': ('gmm', {'covariance_type': 'diag'}), 
              'gmm-sph': ('gmm', {'covariance_type': 'spherical'}), 
              'gmm-full': ('gmm', {'covariance_type': 'full'})}

# Run every configured algorithm in turn, store its output under its label in
# `result`, and report the wall-clock time it took.
# dict.items() and sys.stdout.write() are used instead of iteritems() and the
# print statement so the cell runs unchanged on Python 2 and Python 3.
for key, value in algorithms.items():
    # Progress message without a newline, flushed so it shows up before the
    # (long) computation starts; the trailing space separates it from the
    # timing message written on the same line afterwards.
    sys.stdout.write('Running {0} '.format(key))
    sys.stdout.flush()
    method, kwargs = value
    tstart = time()
    # The flattened ground-truth map is passed alongside the samples;
    # n_jobs=-1 is forwarded as-is (presumably "use all cores" -- confirm).
    result[key] = stability.compute_stability(splitter, samples, n_jobs=-1, max_k=max_k,
                                              method=method, ground_truth=cluster_truth.reshape(-1,),
                                              corr_score=corr_score_type, verbose=0,
                                              **kwargs)
    sys.stdout.write('done in {0:.2f}s\n'.format(time()-tstart))

In [None]:
# save data for plotting with R
# Flatten the per-algorithm results into one 2-D table `d`: one row per entry
# of res[0], the metric columns first and the algorithm label last.
# dict.items() (rather than the Python 2-only iteritems()) keeps the cell
# runnable on both Python 2 and Python 3.
algos = []
out = []
for algo, res in result.items():
    # one copy of the algorithm label for every row contributed by this result
    algos.append([algo]*len(res[0]))
    # Stack all entries of the result except res[4] as columns.
    # NOTE(review): why res[4] is left out is not visible here -- confirm
    # against the return value of compute_stability.
    out.append(np.vstack(res[:4] + res[5:]).T)
out = np.vstack(out)
algos = np.hstack(algos).reshape(-1, 1)
# Stacking numbers next to strings makes NumPy coerce the whole table to a
# string dtype.
d = np.hstack((out, algos))

In [None]:
# Wrap the flattened results in a DataFrame: nine metric columns followed by
# the algorithm label.
df = pd.DataFrame(
    d,
    columns=[
        'k',
        'ARI', 'AMI', 'Instability',
        'ARI_GT', 'AMI_GT', 'Instability_GT',
        'Correlation', 'Correlation_GT',
        'Algorithm',
    ])

In [None]:
# Name the output file after the sphere radius and the correlation score
# type, then write the results table to it as CSV.
fnout = 'simulation_results_rsph{0}_{1}.csv'.format(
    radius_sphere, corr_score_type
)
df.to_csv(fnout)

In [None]:
# Assuming R and the R packages reshape2, plyr, and ggplot2 are installed,
# run the following line to generate the image.
!Rscript plot_simulated_results.R $fnout