In [2]:
%matplotlib inline
import numpy as np
from matplotlib.pyplot import *
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline

In [3]:
# --- Configuration ---------------------------------------------------------
n_samples = None      # number of documents to keep; None slices the whole split
max_features = 10000  # vocabulary size cap passed to the TF-IDF vectorizer
n_components = 32     # dimensionality of the SVD embedding
svd_seed = 0          # TruncatedSVD's default solver is randomized; fix the seed
                      # so Restart & Run All reproduces the same embedding

# --- Data --------------------------------------------------------------------
# Training split of the 20 Newsgroups corpus (documents in .data, labels in .target).
corpus = fetch_20newsgroups(subset="train")

# --- Feature pipeline --------------------------------------------------------
# Raw text -> TF-IDF -> per-document normalisation -> per-term scaling ->
# per-document normalisation -> low-rank SVD projection -> final normalisation.
pipeline = make_pipeline(
    # norm=None: leave the TF-IDF rows unnormalised; the next step handles it.
    TfidfVectorizer(max_features=max_features, norm=None),
    Normalizer(),
    # with_mean=False: scale only, no centering, so the sparse TF-IDF matrix
    # is not densified; copy=False scales in place to avoid an extra copy.
    StandardScaler(copy=False, with_mean=False),
    Normalizer(),
    TruncatedSVD(n_components=n_components, random_state=svd_seed),
    Normalizer()
)

# NOTE: despite the "_test" suffix these are built from the *train* subset
# fetched above; the names are kept because the following cells use them.
X_test = pipeline.fit_transform(corpus.data[:n_samples])
y_test = np.array(corpus.target[:n_samples])
# scatter(X_test[:,0], X_test[:,1])

In [4]:
from sklearn.metrics.cluster import v_measure_score, homogeneity_score, completeness_score
from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering, DBSCAN, KMeans, Birch, MeanShift

# Number of clusters requested from every algorithm: the 20 Newsgroups corpus
# has 20 categories, so this matches the number of classes in the labels.
n_clusters = 20
# k-means style algorithms start from random centroids; without a fixed seed
# the scores printed below change from run to run.
cluster_seed = 0

# Candidate clustering algorithms to compare on the embedded documents.
# Birch exposes no random_state parameter, so it is left unseeded.
algos = [
    KMeans(n_init=10, n_clusters=n_clusters, random_state=cluster_seed),
    MiniBatchKMeans(n_init=10, n_clusters=n_clusters, random_state=cluster_seed),
    Birch(n_clusters=n_clusters),
    #MeanShift()
    #AgglomerativeClustering(n_clusters=20),
    #DBSCAN()
]

# Fit each estimator in turn and score its cluster assignment against the
# true newsgroup labels. Estimators are popped off the list and deleted after
# scoring, so no reference to a fitted model outlives its iteration and only
# one fitted model is held at a time. `algos` is empty once the loop finishes.
while algos:
    algo = algos.pop(0)
    y_predict = algo.fit_predict(X_test)
    print("algorithm: %s" % algo.__class__.__name__)
    print("  v-measure:    %0.2f" % v_measure_score(y_test, y_predict))
    print("  homogeneity:  %0.2f" % homogeneity_score(y_test, y_predict))
    print("  completeness: %0.2f" % completeness_score(y_test, y_predict))
    print("")
    del algo

algorithm: KMeans
  v-measure:    0.56
  homogeneity:  0.55
  completeness: 0.58

algorithm: MiniBatchKMeans
  v-measure:    0.55
  homogeneity:  0.54
  completeness: 0.57

algorithm: Birch
  v-measure:    0.47
  homogeneity:  0.44
  completeness: 0.51

