In [4]:
from sklearn.datasets import fetch_20newsgroups

categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]


dataset = fetch_20newsgroups(subset='all', categories=categories,
                             shuffle=True, random_state=42)

print("%d documents" % len(dataset.data))

print("%d categories" % len(dataset.target_names))

3387 documents
4 categories


In [9]:
import numpy as np

labels = dataset.target
true_k = np.unique(labels).shape[0]


4

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_df=0.5, min_df=2, stop_words='english', use_idf=True)

X = vectorizer.fit_transform(dataset.data)

In [17]:
X

<3387x24546 sparse matrix of type '<type 'numpy.float64'>'
	with 361973 stored elements in Compressed Sparse Row format>

In [19]:
from sklearn.cluster import KMeans

km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1, verbose=True)
km.fit(X)

Initialization complete
Iteration  0, inertia 6458.244
Iteration  1, inertia 3309.427
Iteration  2, inertia 3298.916
Iteration  3, inertia 3294.767
Iteration  4, inertia 3292.893
Iteration  5, inertia 3291.302
Iteration  6, inertia 3289.662
Iteration  7, inertia 3288.239
Iteration  8, inertia 3287.437
Iteration  9, inertia 3286.692
Iteration 10, inertia 3285.774
Iteration 11, inertia 3284.295
Iteration 12, inertia 3283.191
Iteration 13, inertia 3282.891
Iteration 14, inertia 3282.623
Iteration 15, inertia 3282.413
Iteration 16, inertia 3282.353
Iteration 17, inertia 3282.209
Iteration 18, inertia 3282.014
Iteration 19, inertia 3281.989
Iteration 20, inertia 3281.981
Iteration 21, inertia 3281.967
Iteration 22, inertia 3281.955
Iteration 23, inertia 3281.952
Converged at iteration 23


KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=4, n_init=1,
    n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001,
    verbose=True)

In [22]:
from sklearn import metrics

print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))


Homogeneity: 0.428
Completeness: 0.508
V-measure: 0.465
Adjusted Rand-Index: 0.363
Silhouette Coefficient: 0.006


In [31]:
from __future__ import print_function

print("Top terms per cluster:")

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

terms = vectorizer.get_feature_names()
for i in range(true_k):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :30]:
        print(' %s' % terms[ind], end = '')
    print()


Top terms per cluster:
Cluster 0:
 god people com jesus say don bible christian article believe religion think just know atheism does evidence morality koresh mathew like way cs said university good atheists did islam jim
Cluster 1:
 sandvik sgi livesey kent com keith apple newton solntze wpd jon caltech morality article moral schneider allan objective alink ksand cco people cruel don cookamunga tourist private activities atheists bureau
Cluster 2:
 graphics com university space posting host nntp image thanks know computer like ac ca program cs article just uk file files software help 3d does images mail need distribution bit
Cluster 3:
 henry space nasa access toronto digex pat gov shuttle zoo hst spencer net mission orbit prb launch com jpl zoology article station baalke jsc like communications express dseg online ti
