spherecluster　実装による解釈

In [1]:
from __future__ import print_function

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn import metrics

import numpy as np
from tabulate import tabulate

import logging
from sklearn.cluster import KMeans

import os
import sys

# spherecluster is imported from a local checkout that is appended to sys.path.
# The checkout location can be overridden with the SPHERECLUSTER_PATH
# environment variable; the previously hardcoded path is kept as the fallback
# so existing setups behave exactly as before.
sys.path.append(os.environ.get(
    'SPHERECLUSTER_PATH', 'C:/Users/SHIO-160412-4/Desktop/spherecluster'))
import spherecluster
from spherecluster import SphericalKMeans
from spherecluster import VonMisesFisherMixture

In [2]:
# Configure the root logger: INFO level, each record rendered as
# "<timestamp> <level> <message>".
LOG_FORMAT = '%(asctime)s %(levelname)s %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

In [3]:
# ---------------------------------------------------------------------------
# Optional parameters
use_LSA = False      # when True, the TF-IDF matrix is reduced with LSA first
n_components = 500   # number of SVD components used by the LSA step

# ---------------------------------------------------------------------------
# Newsgroup categories to load
categories = ['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']
# To run the analysis on all the categories, set `categories = None` instead.

print("Loading 20 newsgroups dataset for categories:", categories, sep="\n")

Loading 20 newsgroups dataset for categories:
['alt.atheism', 'talk.religion.misc', 'comp.graphics', 'sci.space']


In [4]:
# Fetch every post (subset='all') of the selected categories, shuffled with a
# fixed seed.
dataset = fetch_20newsgroups(
    subset='all',
    categories=categories,
    shuffle=True,
    random_state=42,
)

In [5]:
# Report how many documents and categories were loaded.
n_documents = len(dataset.data)
n_categories = len(dataset.target_names)
print("%d documents" % n_documents)
print("%d categories" % n_categories)
print()

3387 documents
4 categories



In [6]:
# Ground-truth labels; the number of distinct labels is used as the cluster
# count by the clustering cells.
labels = dataset.target
true_k = len(np.unique(labels))

# Turn the raw documents into a sparse TF-IDF matrix (English stop words removed).
print("Extracting features from the training dataset using a sparse vectorizer")
vectorizer = TfidfVectorizer(stop_words='english', use_idf=True)
X = vectorizer.fit_transform(dataset.data)

n_samples, n_features = X.shape
print("n_samples: %d, n_features: %d" % (n_samples, n_features))
print()

# Rows of evaluation scores (one row per clustering method) for results display
table = []

Extracting features from the training dataset using a sparse vectorizer
n_samples: 3387, n_features: 43255



In [7]:
# Optional LSA (latent semantic analysis) step: truncated SVD followed by
# row normalization, replacing X with dense, lower-dimensional vectors.
if use_LSA:
    print("Performing dimensionality reduction using LSA")
    svd = TruncatedSVD(n_components)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    X = lsa.fit_transform(X)

    # Total explained-variance ratio of the SVD components, shown as a whole percent.
    explained_variance = svd.explained_variance_ratio_.sum()
    variance_pct = int(explained_variance * 100)
    print("Explained variance of the SVD step: {}%".format(variance_pct))

    print()

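### クラスタリングの評価指標一覧
- Homogeneity（均質性）: 各クラスタが単一の正解クラスのデータだけを含んでいる度合い
- Completeness（完全性）: 同じ正解クラスのデータが同一のクラスタにまとまっている度合い
- V-Measure: Homogeneity と Completeness の調和平均
- Adj Rand（調整ランド指数）: 偶然の一致を補正した、正解ラベルとクラスタ割り当ての一致度
- Adj MI（調整相互情報量）: 偶然の一致を補正した、正解ラベルとクラスタ割り当ての相互情報量
- Silhouette（シルエット係数）: 正解ラベルを使わず、クラスタ内の凝集度とクラスタ間の分離度から求める指標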

In [8]:
# K-Means clustering
# NOTE(review): no random_state is passed, so the clustering (and therefore
# every score below) can differ from run to run.
km = KMeans(n_clusters=true_k, init='k-means++', n_init=20)

print("Clustering with %s" % km)
km.fit(X)
print()

# Compute every score exactly once and reuse it for both the printed report
# and the results table. Previously each metric was evaluated twice, which
# doubled the cost of the pairwise-distance-based silhouette computation.
km_homogeneity = metrics.homogeneity_score(labels, km.labels_)
km_completeness = metrics.completeness_score(labels, km.labels_)
km_v_measure = metrics.v_measure_score(labels, km.labels_)
km_adj_rand = metrics.adjusted_rand_score(labels, km.labels_)
km_adj_mi = metrics.adjusted_mutual_info_score(labels, km.labels_)
km_silhouette_euclidean = metrics.silhouette_score(
    X, km.labels_, metric='euclidean')
km_silhouette_cosine = metrics.silhouette_score(
    X, km.labels_, metric='cosine')

print("Homogeneity: %0.3f" % km_homogeneity)
print("Completeness: %0.3f" % km_completeness)
print("V-measure: %0.3f" % km_v_measure)
print("Adjusted Rand-Index: %.3f" % km_adj_rand)
print("Adjusted Mututal Information: %.3f" % km_adj_mi)
print("Silhouette Coefficient (euclidean): %0.3f" % km_silhouette_euclidean)
print("Silhouette Coefficient (cosine): %0.3f" % km_silhouette_cosine)

print()

# Results-table row: method name followed by the six scores (the table keeps
# only the cosine silhouette).
table.append([
    'k-means',
    km_homogeneity,
    km_completeness,
    km_v_measure,
    km_adj_rand,
    km_adj_mi,
    km_silhouette_cosine])

Clustering with KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=4, n_init=20, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

Homogeneity: 0.351
Completeness: 0.456
V-measure: 0.397
Adjusted Rand-Index: 0.270
Adjusted Mututal Information: 0.350
Silhouette Coefficient (euclidean): 0.006
Silhouette Coefficient (cosine): 0.011



In [9]:
# Spherical K-Means clustering
# NOTE(review): no random_state is passed, so the clustering (and therefore
# every score below) can differ from run to run.
skm = SphericalKMeans(n_clusters=true_k, init='k-means++', n_init=20)

print("Clustering with %s" % skm)
skm.fit(X)
print()

# Compute every score exactly once and reuse it for both the printed report
# and the results table. Previously each metric was evaluated twice, which
# doubled the cost of the pairwise-distance-based silhouette computation.
skm_homogeneity = metrics.homogeneity_score(labels, skm.labels_)
skm_completeness = metrics.completeness_score(labels, skm.labels_)
skm_v_measure = metrics.v_measure_score(labels, skm.labels_)
skm_adj_rand = metrics.adjusted_rand_score(labels, skm.labels_)
skm_adj_mi = metrics.adjusted_mutual_info_score(labels, skm.labels_)
skm_silhouette_euclidean = metrics.silhouette_score(
    X, skm.labels_, metric='euclidean')
skm_silhouette_cosine = metrics.silhouette_score(
    X, skm.labels_, metric='cosine')

print("Homogeneity: %0.3f" % skm_homogeneity)
print("Completeness: %0.3f" % skm_completeness)
print("V-measure: %0.3f" % skm_v_measure)
print("Adjusted Rand-Index: %.3f" % skm_adj_rand)
print("Adjusted Mututal Information: %.3f" % skm_adj_mi)
print("Silhouette Coefficient (euclidean): %0.3f" % skm_silhouette_euclidean)
print("Silhouette Coefficient (cosine): %0.3f" % skm_silhouette_cosine)

print()

# Results-table row: method name followed by the six scores (the table keeps
# only the cosine silhouette).
table.append([
    'spherical k-means',
    skm_homogeneity,
    skm_completeness,
    skm_v_measure,
    skm_adj_rand,
    skm_adj_mi,
    skm_silhouette_cosine])



Clustering with SphericalKMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=4,
        n_init=20, n_jobs=1, normalize=True, random_state=None, tol=0.0001,
        verbose=0)

Homogeneity: 0.645
Completeness: 0.668
V-measure: 0.656
Adjusted Rand-Index: 0.661
Adjusted Mututal Information: 0.644
Silhouette Coefficient (euclidean): 0.006
Silhouette Coefficient (cosine): 0.012

