#### This example uses a scipy.sparse matrix to store the features instead of standard numpy arrays
#### Two feature extraction methods can be used in this tutorial:
#### Tf-idf Vectorizer uses an in-memory vocabulary (a Python dict) to map the most frequent words to feature indices and hence compute a word occurrence frequency (sparse) matrix. The word frequencies are then re-weighted using the Inverse Document Frequency (IDF) vector collected feature-wise over the corpus.
#### Hashing Vectorizer hashes word occurrences to a fixed-dimensional space, possibly with collisions. The word count vectors are then normalized to each have l2-norm equal to one (projected onto the Euclidean unit sphere), which seems to be important for k-means to work in high-dimensional space.
#### HashingVectorizer does not provide IDF weighting as this is a stateless model (the fit method does nothing). When IDF weighting is needed it can be added by pipelining its output to a TfidfTransformer instance.

In [1]:
from __future__ import print_function

In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

In [3]:
from sklearn.cluster import KMeans, MiniBatchKMeans

In [4]:
import logging
from optparse import OptionParser
import sys
from time import time
import numpy as np

In [5]:
# configure the root logger: INFO level, timestamp + level name + message
logging.basicConfig(
    format='%(asctime)s %(levelname)s %(message)s',
    level=logging.INFO,
)

In [6]:
# Build the command-line option parser. Every option is optional; the
# defaults select MiniBatchKMeans on IDF-weighted tf-idf features without LSA.
op = OptionParser()
op.add_option("--lsa", dest="n_components", type="int",
              help="Preprocessing documents with latent semantic analysis")
op.add_option("--no-minibatch", action="store_false", dest="minibatch", default=True,
              help="Use ordinary k-means algorithm (in batch mode)")
op.add_option("--no-idf", action="store_false", dest="use_idf", default=True,
              help="Disable Inverse Document Frequency feature weighting")
op.add_option("--use-hashing", action="store_true", default=False,
              help="use a hashing feature vectorizer")
# The help text used to be two adjacent literals with no space between them
# and said "test" where "text" was meant.
op.add_option("--n-features", type="int", default=10000,
              help="Maximum number of features (dimensions) to extract from text")
# "--vervise" was a misspelling of "--verbose"; the old spelling is kept as an
# alias so existing invocations keep working.
op.add_option("--verbose", "--vervise", action="store_true", dest="verbose", default=False,
              help="Print progress reports inside k-means algorithm")

print(__doc__)
op.print_help()

Automatically created module for IPython interactive environment
Usage: ipykernel_launcher.py [options]

Options:
  -h, --help            show this help message and exit
  --lsa=N_COMPONENTS    Preprocessing documents with latent semantic analysis
  --no-minibatch        Use ordinary k-means algorithm (in batch mode)
  --no-idf              Disable Inverse Document Frequency feature weighting
  --use-hashing         use a hashing feature vectorizer
  --n-features=N_FEATURES
                        Maximum number of features (dimensions)to extract from
                        test
  --vervise             Print progress reports inside k-means algorithm


In [7]:
def is_interactive():
    """Return True when the ``__main__`` module has no ``__file__`` attribute."""
    main_module = sys.modules['__main__']
    has_file = hasattr(main_module, '__file__')
    return not has_file

In [8]:
# work-around for jupyter notebook and IPython console: interactive sessions
# parse an empty argument list instead of sys.argv[1:]
argv = [] if is_interactive() else sys.argv[1:]
(opts, args) = op.parse_args(argv)
if args:
    # OptionParser.error() prints the usage message and exits with status 2
    # on its own, so no explicit sys.exit() is needed after it.
    op.error("this script takes no arguments")

In [9]:
# newsgroups to keep; documents are fetched with subset='all'
categories = [
    'alt.atheism',
    'talk.religion.misc',
    'comp.graphics',
    'sci.space',
]

# uncomment the following to do the analysis on all the categories
# categories = None

print("loading 20 newsgroups dataset for categories:")
print(categories)

# shuffled with a fixed random_state so the document order is repeatable
dataset = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
)

print("%d documents" % len(dataset.data))
print("%d categories" % len(dataset.target_names))

loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
3387 documents
4 categories


In [10]:
# ground-truth targets of the loaded documents; the number of distinct
# target values is later used as the number of clusters
labels = dataset.target
true_k = len(np.unique(labels))

In [12]:
print("Extracting features from the training dataset using a sparse vectorizer")
t0 = time()

# Pick the feature extractor from the parsed options:
#   default              -> TfidfVectorizer (IDF weighting follows opts.use_idf)
#   hashing with IDF     -> un-normalized HashingVectorizer piped into TfidfTransformer
#   hashing without IDF  -> l2-normalized HashingVectorizer on its own
if not opts.use_hashing:
    vectorizer = TfidfVectorizer(
        max_df=0.5,
        max_features=opts.n_features,
        min_df=2,
        stop_words='english',
        use_idf=opts.use_idf,
    )
elif opts.use_idf:
    # perform an idf normalization on the output of HashingVectorizer
    hasher = HashingVectorizer(
        n_features=opts.n_features,
        stop_words='english',
        alternate_sign=False,
        norm=None,
        binary=False,
    )
    vectorizer = make_pipeline(hasher, TfidfTransformer())
else:
    vectorizer = HashingVectorizer(
        n_features=opts.n_features,
        stop_words='english',
        alternate_sign=False,
        norm='l2',
        binary=False,
    )

X = vectorizer.fit_transform(dataset.data)

print("done in %fs" % (time() - t0))
print("n_samples: %d, n_features: %d" % X.shape)

Extracting features from the training dataset using a sparse vectorizer
done in 1.104972s
n_samples: 3387, n_features: 10000


In [14]:
# Optional LSA step: only runs when --lsa gave a number of components.
if opts.n_components:
    print("Performing dimensionality reduction using LSA")
    t0 = time()
    # Vectorizer results are normalized, which makes KMeans behave as
    # spherical k-means for better results. Since LSA/SVD results are not
    # normalized, we have to redo the normalization.
    svd = TruncatedSVD(opts.n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)

    # NOTE: X is rebound to the reduced matrix, so re-running this cell
    # without re-running the vectorizer cell reduces already-reduced data.
    X = lsa.fit_transform(X)

    print("done in %fs" % (time() - t0))

    # explained_variance_ratio_ is an array attribute of the fitted SVD
    # (one entry per component); its sum is the total explained share.
    explained_variance = svd.explained_variance_ratio_.sum()
    print("explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

In [17]:
# Do the actual clustering: MiniBatchKMeans by default, plain KMeans when
# --no-minibatch was given. Both use a single k-means++ initialization.
if opts.minibatch:
    km = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1,
                         init_size=1000, batch_size=1000, verbose=opts.verbose)
else:
    # the keyword is n_clusters (plural); "n_cluster" raises TypeError
    km = KMeans(n_clusters=true_k, init='k-means++', max_iter=100, n_init=1,
                verbose=opts.verbose)

print("Clustering sparse data with %s" % km)
t0 = time()
km.fit(X)
print("done in %0.3fs" % (time() - t0))

# Compare the cluster assignment with the ground-truth newsgroup labels.
print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(labels, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(labels, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(labels, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

# Top terms are only reported for the non-hashing (TfidfVectorizer) path.
if not opts.use_hashing:
    print("Top terms per cluster")

    if opts.n_components:
        # centroids live in the reduced LSA space; map them back to term space
        original_space_centroids = svd.inverse_transform(km.cluster_centers_)
        order_centroids = original_space_centroids.argsort()[:, ::-1]
    else:
        order_centroids = km.cluster_centers_.argsort()[:, ::-1]

    # get_feature_names() is gone from current scikit-learn; prefer the
    # replacement and fall back to the old name on older installations.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    for i in range(true_k):
        # one line per cluster (previously every cluster ran together on a
        # single line because no newline was ever printed)
        top_terms = ' '.join(terms[ind] for ind in order_centroids[i, :10])
        print("cluster %d: %s" % (i, top_terms))

Clustering sparse data with MiniBatchKMeans(batch_size=1000, compute_labels=True, init='k-means++',
        init_size=1000, max_iter=100, max_no_improvement=10, n_clusters=4,
        n_init=1, random_state=None, reassignment_ratio=0.01, tol=0.0,
        verbose=False)
done in 0.130s
Homogeneity: 0.495
Completeness: 0.551
V-measure: 0.522
Adjusted Rand-Index: 0.450
Silhouette Coefficient: 0.008
Top terms per cluster
cluster 0: god com people sandvik jesus keith don say article thinkcluster 1: space nasa gov shuttle com launch moon like earth orbitcluster 2: henry access digex toronto pat alaska zoo net spencer prbcluster 3: graphics com university posting host nntp image thanks computer file