In [1]:
%matplotlib inline
import numpy as np
from matplotlib.pyplot import *
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
try:
    # scikit-learn >= 0.18; the only location since 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older scikit-learn releases that predate sklearn.model_selection.
    from sklearn.cross_validation import train_test_split

In [2]:
n_samples = None
max_features = 10000
n_components = 200 #32

corpus = fetch_20newsgroups(subset="all")
pipeline = make_pipeline(
    TfidfVectorizer(max_features=max_features, norm=None),
    Normalizer(),
    StandardScaler(copy=False, with_mean=False),
    Normalizer(),
    TruncatedSVD(n_components=n_components),
    Normalizer()
)

X_all = pipeline.fit_transform(corpus.data[:n_samples])
y_all = np.array(corpus.target[:n_samples])

X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=0.25, random_state=0)

In [3]:
from sklearn.metrics.cluster import v_measure_score, homogeneity_score, completeness_score
from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering, DBSCAN, KMeans, Birch, MeanShift

algos = [
    KMeans(n_init=10, n_clusters=20),
    MiniBatchKMeans(n_init=10, n_clusters=20),
    #Birch(n_clusters=20),
    #MeanShift()
    #AgglomerativeClustering(n_clusters=20),
    #DBSCAN()
]

while algos:
    algo = algos.pop(0)
    y_predict = algo.fit_predict(X_train)
    print("algorithm: %s" % algo.__class__.__name__)
    print("  v-measure:    %0.2f" % v_measure_score(y_train, y_predict))
    print("  homogeneity:  %0.2f" % homogeneity_score(y_train, y_predict))
    print("  completeness: %0.2f" % completeness_score(y_train, y_predict))
    print("")
    del algo

algorithm: KMeans
  v-measure:    0.52
  homogeneity:  0.50
  completeness: 0.54

algorithm: MiniBatchKMeans
  v-measure:    0.41
  homogeneity:  0.38
  completeness: 0.44



In [4]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
y_pred = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred, target_names=corpus.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.84      0.85      0.84       205
           comp.graphics       0.71      0.72      0.71       245
 comp.os.ms-windows.misc       0.74      0.76      0.75       250
comp.sys.ibm.pc.hardware       0.71      0.66      0.68       243
   comp.sys.mac.hardware       0.81      0.76      0.78       255
          comp.windows.x       0.85      0.84      0.84       240
            misc.forsale       0.83      0.88      0.85       249
               rec.autos       0.84      0.86      0.85       219
         rec.motorcycles       0.92      0.93      0.92       246
      rec.sport.baseball       0.92      0.94      0.93       227
        rec.sport.hockey       0.95      0.98      0.97       287
               sci.crypt       0.94      0.93      0.93       234
         sci.electronics       0.78      0.75      0.77       247
                 sci.med       0.90      0.89      0.89       250
         

In [5]:
from sklearn.linear_model import SGDClassifier 
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
y_pred = OneVsRestClassifier(SGDClassifier(loss="hinge", random_state=0)).fit(X_train, y_train).predict(X_test)
print(classification_report(y_test, y_pred, target_names=corpus.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.84      0.85      0.85       205
           comp.graphics       0.72      0.71      0.72       245
 comp.os.ms-windows.misc       0.62      0.85      0.72       250
comp.sys.ibm.pc.hardware       0.87      0.42      0.57       243
   comp.sys.mac.hardware       0.65      0.84      0.73       255
          comp.windows.x       0.85      0.80      0.83       240
            misc.forsale       0.86      0.87      0.86       249
               rec.autos       0.89      0.83      0.86       219
         rec.motorcycles       0.89      0.93      0.91       246
      rec.sport.baseball       0.91      0.96      0.94       227
        rec.sport.hockey       0.97      0.98      0.97       287
               sci.crypt       0.89      0.94      0.92       234
         sci.electronics       0.82      0.69      0.75       247
                 sci.med       0.93      0.89      0.91       250
         