In [2]:
print(__doc__)

import time

import numpy as np
import matplotlib.pyplot as plt

from sklearn import cluster, datasets
from sklearn.neighbors import kneighbors_graph
from sklearn.preprocessing import StandardScaler

np.random.seed(0)

# Generate datasets. We choose the size big enough to see the scalability
# of the algorithms, but not too big to avoid too long running times
n_samples = 1500
noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5,
                                      noise=.05)
noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
no_structure = np.random.rand(n_samples, 2), None

colors = np.array([x for x in 'bgrcmykbgrcmykbgrcmykbgrcmyk'])
colors = np.hstack([colors] * 20)

clustering_names = [
    'MiniBatchKMeans', 'AffinityPropagation', 'MeanShift',
    'SpectralClustering', 'Ward', 'AgglomerativeClustering',
    'DBSCAN', 'Birch']

plt.figure(figsize=(len(clustering_names) * 2 + 3, 9.5))
plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                    hspace=.01)

plot_num = 1

datasets = [noisy_circles, noisy_moons, blobs, no_structure]
for i_dataset, dataset in enumerate(datasets):
    X, y = dataset
    # normalize dataset for easier parameter selection
    X = StandardScaler().fit_transform(X)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X, quantile=0.3)

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(X, n_neighbors=10, include_self=False)
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # create clustering estimators
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=2)
    ward = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=2,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=.2)
    affinity_propagation = cluster.AffinityPropagation(damping=.9,
                                                       preference=-200)

    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock", n_clusters=2,
        connectivity=connectivity)

    birch = cluster.Birch(n_clusters=2)
    clustering_algorithms = [
        two_means, affinity_propagation, ms, spectral, ward, average_linkage,
        dbscan, birch]

    for name, algorithm in zip(clustering_names, clustering_algorithms):
        # predict cluster memberships
        t0 = time.time()
        algorithm.fit(X)
        t1 = time.time()
        if hasattr(algorithm, 'labels_'):
            y_pred = algorithm.labels_.astype(np.int)
        else:
            y_pred = algorithm.predict(X)

        # plot
        plt.subplot(4, len(clustering_algorithms), plot_num)
        if i_dataset == 0:
            plt.title(name, size=18)
        plt.scatter(X[:, 0], X[:, 1], color=colors[y_pred].tolist(), s=10)

        if hasattr(algorithm, 'cluster_centers_'):
            centers = algorithm.cluster_centers_
            center_colors = colors[:len(centers)]
            plt.scatter(centers[:, 0], centers[:, 1], s=100, c=center_colors)
        plt.xlim(-2, 2)
        plt.ylim(-2, 2)
        plt.xticks(())
        plt.yticks(())
        plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                 transform=plt.gca().transAxes, size=15,
                 horizontalalignment='right')
        plot_num += 1

plt.show()

Automatically created module for IPython interactive environment


In [9]:
datasets
XX,yy = noisy_circles
#plt.scatter(noisy_circles[:, 0], noisy_circles[:, 1])

In [13]:
plt.scatter(XX[:,0], XX[:,1], color=colors[yy].tolist())

<matplotlib.collections.PathCollection at 0x10dac6110>

In [16]:
XX = StandardScaler().fit_transform(XX)

In [17]:
XX

array([[-1.21167878, -1.2468857 ],
       [ 1.6635849 ,  0.34300942],
       [ 0.97909317, -0.00958214],
       ..., 
       [-0.61711055, -0.63834488],
       [ 0.03029209, -1.68694721],
       [ 1.6320376 , -1.06842677]])

In [19]:
#from sklearn.cluster import spectural
#spectral = cluster.SpectralClustering(affinity='precomputed')

ImportError: cannot import name spectural

In [20]:
  spectral = cluster.SpectralClustering(n_clusters=2,
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")

In [21]:
spectral.fit(XX)

SpectralClustering(affinity='nearest_neighbors', assign_labels='kmeans',
          coef0=1, degree=3, eigen_solver='arpack', eigen_tol=0.0,
          gamma=1.0, kernel_params=None, n_clusters=2, n_init=10,
          n_neighbors=10, random_state=None)

In [22]:
from sklearn.metrics import pairwise

In [25]:
paired_dist_file = '/Users/Patrick/TMP/company_euc_dist'
import pandas as pd

In [26]:
pairs = pd.read_csv(paired_dist_file, sep='\t', header=None)



In [113]:
spect = cluster.SpectralClustering(n_clusters=3, 
                                   eigen_solver='arpack',
                                   affinity='precomputed')

In [38]:
a = np.array([[1,0,0,0],[0,1,0,0],[0,0,0,1],[1,0,1,0]])
spect.fit(a)              



SpectralClustering(affinity='rbf', assign_labels='kmeans', coef0=1, degree=3,
          eigen_solver='arpack', eigen_tol=0.0, gamma=1.0,
          kernel_params=None, n_clusters=3, n_init=10, n_neighbors=10,
          random_state=None)

In [39]:
spect.labels_

array([0, 1, 2, 0], dtype=int32)

In [58]:
def distance_to_affinity(distance):
#    similarity = np.exp(-beta * distance / distance.std())
    similarity = 1 - distance
    return(1-distance)

In [None]:
def merge_two_dict(dict_a, dict_b):
    dict_new = dict_a.copy()
    dict_new.update(dict_b)
    return dict_new

In [87]:
def get_pairs_dict(pairs):
    key_list_a = pairs.loc[:,0].map(str) + "_" + pairs.loc[:,1].map(str)
    key_list_b = pairs.loc[:,1].map(str) + "_" + pairs.loc[:,0].map(str)
    value_list = pairs.loc[:,2].map(float)
    dictionary_a = dict(zip(key_list_a, value_list))
    dictionary_b = dict(zip(key_list_b, value_list))
    
    return merge_two_dict(dictionary_a, dictionary_b)

In [88]:
get_pairs_dict(pairs)

{'1209_1232': 0.64006520066600003,
 '1209_1494': 0.59620809187599999,
 '1209_1514': 0.64002957765099999,
 '1209_1705': 0.63627670778599998,
 '1209_1774': 0.56941633638,
 '1209_1783': 0.60900179736000004,
 '1209_1948': 0.67190697153400003,
 '1209_2150': 0.74137651906800006,
 '1209_2248': 0.64306912732300003,
 '1209_2527': 0.63501977904100004,
 '1209_2600': 0.58737271901400001,
 '1209_2772': 0.62679711568700003,
 '1209_3122': 0.64598525671899998,
 '1209_3215': 0.69236615520300004,
 '1209_3295': 0.73391587791199997,
 '1209_3350': 0.72538169006600006,
 '1209_3354': 0.82078378393099993,
 '1209_3694': 0.76627839001800002,
 '1209_3764': 0.79494554314799992,
 '1209_3807': 0.49080520115299997,
 '1209_3986': 0.50792005391299999,
 '1209_4101': 0.74684127387300003,
 '1209_4120': 0.63294663899199999,
 '1209_4561': 0.52184551119199996,
 '1209_5061': 0.80343838204500007,
 '1209_5165': 0.72390943394899998,
 '1209_5634': 0.61823395394400005,
 '1209_5881': 0.51551734445800002,
 '1232_1209': 0.6400652006

In [166]:
def paired_to_matrix(pairs):
    unique_key_list = np.unique(pairs.loc[:,0:1])
    matrix_dim = len(unique_key_list)
    matrix = np.zeros((matrix_dim, matrix_dim))
    
    pairs_dict = get_pairs_dict(pairs)
    
    for i, key_a in enumerate(unique_key_list):
        for j, key_b in enumerate(unique_key_list):
            
            if key_a == key_b:
                value = 1
                matrix[i][j] = value
                
            else:
                key = str(key_a) + "_" + str(key_b)
                value = distance_to_affinity(pairs_dict[key])
                matrix[i][j] = value
                matrix[j][i] = value
                
    return matrix, unique_key_list
        
        

In [167]:
mm, key_list = paired_to_matrix(pairs)

In [116]:
spect.fit(mm)

SpectralClustering(affinity='precomputed', assign_labels='kmeans', coef0=1,
          degree=3, eigen_solver='arpack', eigen_tol=0.0, gamma=1.0,
          kernel_params=None, n_clusters=3, n_init=10, n_neighbors=10,
          random_state=None)

In [129]:
def eval_clustering_res(X, labels):
    from sklearn import metrics
    return metrics.silhouette_score(X, labels, metric='euclidean')
    

In [151]:
eval_clustering_res(mm, spect.labels_)


-0.037967419898167896

In [146]:
mm.shape

(29, 29)

In [187]:
def try_clustering_num(mm):
    label_dict = {}
    for i in range(2,29):
        
#        spect = cluster.SpectralClustering(n_clusters=i, 
#                                       eigen_solver='arpack',
#                                       affinity='precomputed')
        
        label = cluster.spectral_clustering(mm, eigen_solver='arpack', n_init=100)
            #spect.fit(mm)
        score = eval_clustering_res(mm, label)
        print i, score
        label_dict[str(i)] = label
        
    return label_dict

In [163]:
from sklearn import metrics
help(metrics.silhouette_score)

Help on function silhouette_score in module sklearn.metrics.cluster.unsupervised:

silhouette_score(X, labels, metric='euclidean', sample_size=None, random_state=None, **kwds)
    Compute the mean Silhouette Coefficient of all samples.
    
    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (``a``) and the mean nearest-cluster distance (``b``) for each
    sample.  The Silhouette Coefficient for a sample is ``(b - a) / max(a,
    b)``.  To clarify, ``b`` is the distance between a sample and the nearest
    cluster that the sample is not a part of.
    Note that Silhouette Coefficent is only defined if number of labels
    is 2 <= n_labels <= n_samples - 1.
    
    This function returns the mean Silhouette Coefficient over all samples.
    To obtain the values for each sample, use :func:`silhouette_samples`.
    
    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters. Negative values generally indicate that

In [188]:
label_dict = try_clustering_num(mm)

2 0.0939143628207
3 0.0844289494861
4 0.0885352001652
5 0.090713440243
6 0.097851330635
7 0.0956791734446
8 0.0830857380115
9 0.104956947316
10 0.0956791734446
11 0.110954867352
12 0.115275002835
13 0.0844289494861
14 0.0956791734446
15 0.0956791734446
16 0.0960340310832
17 0.133190870697
18 0.0956791734446
19 0.0956791734446
20 0.0978246550662
21 0.0978246550662
22 0.0960340310832
23 0.0960340310832
24 0.0844289494861
25 0.0830857380115
26 0.0830857380115
27 0.0960340310832
28 0.0819567844123


In [190]:
label_dict['17']

array([1, 5, 2, 2, 2, 1, 2, 2, 6, 4, 7, 2, 2, 7, 6, 0, 7, 3, 3, 3, 2, 1, 5,
       0, 4, 3, 6, 0, 4], dtype=int32)

In [191]:
dict(zip(key_list, label_dict['17']))

{1209: 1,
 1232: 5,
 1494: 2,
 1514: 2,
 1705: 2,
 1774: 1,
 1783: 2,
 1948: 2,
 2150: 6,
 2248: 4,
 2527: 7,
 2600: 2,
 2772: 2,
 3122: 7,
 3215: 6,
 3295: 0,
 3350: 7,
 3354: 3,
 3694: 3,
 3764: 3,
 3807: 2,
 3986: 1,
 4101: 5,
 4120: 0,
 4561: 4,
 5061: 3,
 5165: 6,
 5634: 0,
 5881: 4}

In [171]:
#pairwise.pairwise_distances(p)
aa = np.unique(pairs.loc[:,0:1])

In [172]:
aa[0:]

array([1209, 1232, 1494, 1514, 1705, 1774, 1783, 1948, 2150, 2248, 2527,
       2600, 2772, 3122, 3215, 3295, 3350, 3354, 3694, 3764, 3807, 3986,
       4101, 4120, 4561, 5061, 5165, 5634, 5881])

In [173]:
get_pairs_dict(pairs)

{'1209_1232': 0.64006520066600003,
 '1209_1494': 0.59620809187599999,
 '1209_1514': 0.64002957765099999,
 '1209_1705': 0.63627670778599998,
 '1209_1774': 0.56941633638,
 '1209_1783': 0.60900179736000004,
 '1209_1948': 0.67190697153400003,
 '1209_2150': 0.74137651906800006,
 '1209_2248': 0.64306912732300003,
 '1209_2527': 0.63501977904100004,
 '1209_2600': 0.58737271901400001,
 '1209_2772': 0.62679711568700003,
 '1209_3122': 0.64598525671899998,
 '1209_3215': 0.69236615520300004,
 '1209_3295': 0.73391587791199997,
 '1209_3350': 0.72538169006600006,
 '1209_3354': 0.82078378393099993,
 '1209_3694': 0.76627839001800002,
 '1209_3764': 0.79494554314799992,
 '1209_3807': 0.49080520115299997,
 '1209_3986': 0.50792005391299999,
 '1209_4101': 0.74684127387300003,
 '1209_4120': 0.63294663899199999,
 '1209_4561': 0.52184551119199996,
 '1209_5061': 0.80343838204500007,
 '1209_5165': 0.72390943394899998,
 '1209_5634': 0.61823395394400005,
 '1209_5881': 0.51551734445800002,
 '1232_1209': 0.6400652006

In [174]:
(pairs.loc[:,0].map(str) + "_" + pairs.loc[:,1].map(str), pairs.loc[:,2].map(float))

(0     1209_2772
 1     1209_1783
 2     1209_4101
 3     1209_5881
 4     1209_3295
 5     1209_3764
 6     1209_1774
 7     1209_1948
 8     1209_3122
 9     1209_4120
 10    1209_3694
 11    1209_2150
 12    1209_2248
 13    1209_3807
 14    1209_2600
 ...
 391    3350_5061
 392    3350_1705
 393    3350_5634
 394    3350_1514
 395    3350_3986
 396    5061_1705
 397    5061_5634
 398    5061_1514
 399    5061_3986
 400    1705_5634
 401    1705_1514
 402    1705_3986
 403    5634_1514
 404    5634_3986
 405    1514_3986
 Length: 406, dtype: object, 0     0.626797
 1     0.609002
 2     0.746841
 3     0.515517
 4     0.733916
 5     0.794946
 6     0.569416
 7     0.671907
 8     0.645985
 9     0.632947
 10    0.766278
 11    0.741377
 12    0.643069
 13    0.490805
 14    0.587373
 ...
 391    0.513481
 392    0.580103
 393    0.739395
 394    0.606033
 395    0.728010
 396    0.628866
 397    0.807144
 398    0.666364
 399    0.791433
 400    0.713607
 401    0.491297
 402    0.