In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

In [2]:
from time import time
from sklearn.datasets import load_files

In [4]:
# Load the raw corpus from disk and report how many documents/categories
# were found, together with the wall-clock time the load took.
print('loading documents ...')
t = time()
docs = load_files('../../data/clustering/data')

# Pull the two summary figures out first so the report line stays short.
n_docs, n_categories = len(docs.data), len(docs.target_names)
print('summary: {0} documents in {1} categories.'.format(n_docs, n_categories))

elapsed = time() - t
print('done in {0} seconds'.format(elapsed))

loading documents ...
summary: 3949 documents in 4 categories.
done in 0.1471869945526123 seconds


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [6]:
# Turn the raw documents into a sparse TF-IDF matrix `X` (one row per document).
max_feature = 20000  # upper bound on the vocabulary size kept by the vectorizer
print('vectorizing documents ...')
t = time()

# max_df / min_df drop terms that are too common (> 40% of documents) or
# too rare (fewer than 2 documents); input is decoded as latin-1.
vectorizer = TfidfVectorizer(
    max_df=0.4,
    min_df=2,
    max_features=max_feature,
    encoding='latin-1',
)
X = vectorizer.fit_transform(docs.data)

n_samples, n_features = X.shape
print('n_samples: %d, n_features: %d' % (n_samples, n_features))

# Sanity check: how many vocabulary terms actually occur in the first document.
first_doc_nnz = X[0].getnnz()
print('number of non-zero features in sample [{0}]: {1}'.format(
    docs.filenames[0], first_doc_nnz
))

elapsed = time() - t
print('done in {0} seconds'.format(elapsed))

vectorizing documents ...
n_samples: 3949, n_features: 20000
number of non-zero features in sample [../../data/clustering/data/sci.electronics/11902-54322]: 56
done in 1.003140926361084 seconds


In [8]:
from sklearn.cluster import KMeans

In [9]:
# Cluster the TF-IDF matrix `X` with k-means and report the final cost.
print('clustering documents ...')
t = time()
n_clusters = 4
# random_state pins the centroid initialisation so that a fresh
# "Restart & Run All" reproduces the same labels, inertia and top terms;
# without it every run of this cell can produce a different clustering.
# n_init=3 runs three initialisations; max_iter / tol bound each run.
# verbose=1 prints the per-iteration inertia.
kmean = KMeans(
    n_clusters=n_clusters,
    max_iter=100,
    tol=0.01,
    verbose=1,
    n_init=3,
    random_state=42,
)
kmean.fit(X)
# The fitted inertia_ is reported as the clustering cost, truncated to an integer.
print('kmean: k={}, cost={}'.format(n_clusters, int(kmean.inertia_)))
print('done in {0} seconds'.format(time() - t))

clustering documents ...
Initialization complete
Iteration  0, inertia 7585.179
Iteration  1, inertia 3842.962
Iteration  2, inertia 3833.521
Iteration  3, inertia 3829.528
Iteration  4, inertia 3826.735
Iteration  5, inertia 3825.316
Iteration  6, inertia 3824.575
Iteration  7, inertia 3824.155
Iteration  8, inertia 3823.712
Iteration  9, inertia 3823.208
Iteration 10, inertia 3822.461
Iteration 11, inertia 3821.598
Iteration 12, inertia 3821.221
Iteration 13, inertia 3820.839
Iteration 14, inertia 3820.168
Iteration 15, inertia 3819.122
Iteration 16, inertia 3818.816
Iteration 17, inertia 3818.596
Iteration 18, inertia 3818.441
Iteration 19, inertia 3818.340
Iteration 20, inertia 3818.234
Iteration 21, inertia 3818.145
Iteration 22, inertia 3818.075
Iteration 23, inertia 3818.028
Iteration 24, inertia 3818.006
Iteration 25, inertia 3817.955
Iteration 26, inertia 3817.913
Iteration 27, inertia 3817.892
Iteration 28, inertia 3817.881
Iteration 29, inertia 3817.874
Iteration 30, inertia

In [10]:
# Peek at the cluster index assigned to each of the first 100 samples.
kmean.labels_[:100]

array([2, 2, 2, 2, 0, 2, 1, 2, 0, 2, 2, 2, 2, 0, 0, 2, 1, 2, 0, 2, 0, 2,
       0, 0, 0, 2, 2, 0, 1, 0, 2, 2, 2, 0, 2, 0, 2, 1, 3, 2, 0, 1, 0, 1,
       3, 3, 0, 2, 2, 2, 1, 2, 0, 0, 2, 2, 2, 2, 2, 0, 0, 1, 2, 2, 2, 0,
       2, 2, 2, 2, 2, 0, 2, 1, 0, 1, 2, 2, 2, 1, 2, 1, 0, 1, 1, 1, 2, 3,
       2, 2, 0, 0, 2, 1, 1, 2, 2, 2, 0, 0], dtype=int32)

In [11]:
# Inspect a slice of the loaded file paths (entries 1000-1009).
docs.filenames[1000:1010]

array(['../../data/clustering/data/sci.crypt/10888-15289',
       '../../data/clustering/data/sci.crypt/11490-15880',
       '../../data/clustering/data/sci.crypt/11270-15346',
       '../../data/clustering/data/sci.electronics/12383-53525',
       '../../data/clustering/data/sci.space/13826-60862',
       '../../data/clustering/data/sci.electronics/11631-54106',
       '../../data/clustering/data/sci.space/14235-61437',
       '../../data/clustering/data/sci.crypt/11508-15928',
       '../../data/clustering/data/sci.space/13593-60824',
       '../../data/clustering/data/sci.electronics/12304-52801'],
      dtype='<U54')

In [13]:
# Print the 10 highest-weighted vocabulary terms of every cluster centroid.
print('Top term per cluster:')

# For each centroid, feature indices sorted by descending weight.
order_centroids = kmean.cluster_centers_.argsort()[:, ::-1]

# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back to the legacy method so the
# cell runs on both old and current scikit-learn versions.
feature_names_fn = getattr(vectorizer, 'get_feature_names_out', None)
if feature_names_fn is None:
    feature_names_fn = vectorizer.get_feature_names
term = feature_names_fn()

for i in range(n_clusters):
    top_terms = ''.join(' %s' % term[ind] for ind in order_centroids[i, :10])
    print('Cluster %d:' % i + top_terms)

Top term per cluster:
Cluster 0: space my we some by he who she do msg
Cluster 1: key clipper encryption chip government keys will escrow we nsa
Cluster 2: any know by my me ca anyone thanks will your
Cluster 3: henry toronto zoo spencer hst zoology mission utzoo orbit space
