In [1]:
import numpy as np
import sklearn.cluster
import sklearn.feature_extraction

In [2]:
# Generate a random data matrix. n = number of data points; m = length of each data point; num_types = number of types
def gen_data(n, m, num_types, stddev=1.0, seed=None):
    """Generate a synthetic clustered data matrix and print each point's type.

    Each of the ``num_types`` types gets a random integer mean vector of
    length ``m`` with entries in [20, 40). Every data point is assigned a
    type at random and is drawn from a normal distribution centred on that
    type's mean.

    Parameters
    ----------
    n : int
        Number of data points (rows of the result).
    m : int
        Length of each data point (columns of the result).
    num_types : int
        Number of distinct types to draw from.
    stddev : float or array broadcastable to (num_types, m), optional
        Standard deviation of the noise around each type's mean. Defaults
        to 1.0, the value that was previously hard-coded.
    seed : int or None, optional
        If given, all draws come from a private
        ``np.random.RandomState(seed)`` so the result is reproducible and
        the global NumPy random state is left untouched. If None (the
        default), the global ``np.random`` generator is used.

    Returns
    -------
    numpy.ndarray
        Array of shape (n, m), rounded to the nearest thousandth.

    Side effects
    ------------
    Prints the array of type indices, one per data point.
    """
    # np.random (module) and RandomState (instance) expose the same
    # random_sample / randint / randn functions used below.
    rng = np.random if seed is None else np.random.RandomState(seed)
    # compute and print type for each data point (each from 0 to num_types-1)
    types = np.trunc(rng.random_sample(n) * num_types).astype(int)
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(types)
    # one random vector of m integers in [20, 40) per type, to serve as that type's mean
    means = rng.randint(20, 40, size=(num_types, m))
    # matrix of standard deviations, one row per type
    stddevs = stddev * np.ones([num_types, m])
    # Index the per-type rows by each point's type so the whole matrix is
    # built in one vectorised step, then round to the nearest thousandth.
    data = rng.randn(n, m) * stddevs[types] + means[types]
    return np.round(data, 3)

In [3]:
# Sanity check on synthetic data: gen_data prints the type it drew for each
# of the 40 points, then we print the cluster MeanShift assigns to each point.
# Cluster ids are arbitrary labels, so compare the groupings, not the numbers.
M = gen_data(40, 4, 5)
fit = sklearn.cluster.MeanShift().fit(M)
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print(fit.predict(M))

[3 4 2 1 4 3 3 1 3 3 2 2 2 2 2 4 4 0 0 3 2 1 0 3 1 0 2 0 1 3 1 2 4 0 3 1 3
 0 3 0]
[1 0 2 1 0 1 1 1 1 1 2 2 2 2 2 0 0 0 0 1 2 1 0 1 1 0 2 0 1 1 1 2 0 0 1 1 1
 0 1 0]


In [4]:
%%time
noun_train_mat = np.loadtxt("noun_train_mat.csv", delimiter = ",")
# use TF-IDF to scale each document's vector to have norm 1 and place a lower weight on very common words
tf_idf_fit = sklearn.feature_extraction.text.TfidfTransformer().fit(noun_train_mat)
noun_train_mat = tf_idf_fit.transform(noun_train_mat).toarray()

Wall time: 47.2 s


In [6]:
%%time
# Disabled experiment: a single MeanShift fit on noun_train_mat with a very
# large bandwidth (10000). Left commented out, so this cell does no work.
#fit = sklearn.cluster.MeanShift(bandwidth=10000).fit(noun_train_mat)

Wall time: 0 ns


In [12]:
%%time
bandwidths = [0.1, 0.25, 0.5, 0.75, 0.9, 0.95, 1, 10, 100]
fits = map(lambda bandwidth : sklearn.cluster.MeanShift(bandwidth=bandwidth).fit(noun_train_mat), bandwidths)

Wall time: 28min 6s


In [15]:
%%time
for bandwidth,fit in zip(bandwidths, fits):
    print 'BANDWIDTH:', bandwidth
    predictions = fit.predict(noun_train_mat)
    num_clusters = max(predictions) + 1
    print 'Number of clusters:', num_clusters
    print 'Number of documents per cluster:', [sum([x==cluster for x in predictions]) for cluster in range(num_clusters)]
    print

BANDWIDTH: 0.1
Number of clusters: 1110
Number of documents per cluster: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

In [6]:
%%time
# WARNING: runs for a long time (0.5-1 hour for me)
fit = sklearn.cluster.MeanShift().fit(M_tfidf)

In [11]:
# Summarise the default-bandwidth fit: number of clusters and their sizes.
# Predict once and reuse the result; the original re-ran fit.predict for
# every cluster while counting.
predictions = fit.predict(noun_train_mat)
# Cluster ids are 0-based, so the largest predicted id + 1 is the count.
num_clusters = max(predictions) + 1
# '%s' formatting with parenthesised print gives the same text on
# Python 2 and Python 3.
print('Number of clusters: %s' % num_clusters)
# bincount tallies every cluster id in a single pass.
cluster_sizes = np.bincount(predictions, minlength=num_clusters).tolist()
print('Number of documents per cluster: %s' % (cluster_sizes,))

Number of clusters: 1
Number of documents per cluster: [1110]
