In [11]:
%matplotlib inline
import numpy as np
from matplotlib.pyplot import *
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer, HashingVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.cross_validation import train_test_split

In [79]:
# Size of the TF-IDF vocabulary and number of SVD components kept.
max_features = 10000
n_components = 256 #32

# Fetch every post with headers, footers and quoted replies removed,
# then hold out 25% of the documents as a test set (seeded split).
corpus_all = fetch_20newsgroups(subset="all", remove=('headers', 'footers', 'quotes'))
corpus_train, corpus_test, y_train, y_test = train_test_split(
    corpus_all.data, corpus_all.target, test_size=0.25, random_state=0
)

# Text -> TF-IDF -> per-feature scaling (no centering) -> truncated SVD.
# Rows are re-normalized after each stage, so the final vectors are
# normalized as well.
pipeline = make_pipeline(
    TfidfVectorizer(max_features=max_features, norm=None),
    Normalizer(),
    StandardScaler(copy=False, with_mean=False),
    Normalizer(),
    # Seeded like the split above so the projection (and every score
    # computed from it) is the same on each run.
    TruncatedSVD(n_components=n_components, random_state=0),
    Normalizer()
)

# Fit on the training documents only; the test documents are merely projected.
X_train = pipeline.fit_transform(corpus_train)
X_test = pipeline.transform(corpus_test)

In [80]:
from sklearn.metrics.cluster import v_measure_score, homogeneity_score, completeness_score
from sklearn.cluster import MiniBatchKMeans, AgglomerativeClustering, DBSCAN, KMeans, Birch, MeanShift

# Clustering algorithms to compare, each asked for 20 clusters.
# random_state is fixed so the reported scores are reproducible.
algos = [
    KMeans(n_init=10, n_clusters=20, random_state=0),
    MiniBatchKMeans(n_init=10, n_clusters=20, random_state=0),
    #Birch(n_clusters=20),
    #MeanShift()
    #AgglomerativeClustering(n_clusters=20),
    #DBSCAN()
]

# Each estimator is popped off the list and deleted after scoring, so no
# reference to a fitted model is kept once its metrics have been printed.
while algos:
    algo = algos.pop(0)
    y_predict = algo.fit_predict(X_train)
    # Compare the cluster assignment with the true newsgroup labels.
    print("algorithm: %s" % algo.__class__.__name__)
    print("  v-measure:    %0.2f" % v_measure_score(y_train, y_predict))
    print("  homogeneity:  %0.2f" % homogeneity_score(y_train, y_predict))
    print("  completeness: %0.2f" % completeness_score(y_train, y_predict))
    print("")
    del algo

algorithm: KMeans
  v-measure:    0.47
  homogeneity:  0.46
  completeness: 0.49

algorithm: MiniBatchKMeans
  v-measure:    0.39
  homogeneity:  0.38
  completeness: 0.40



In [81]:
from sklearn.svm import LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Fit a one-vs-rest linear SVM on the training vectors and predict the
# held-out test set.
y_pred = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, y_train).predict(X_test)

# The dataset object is named `corpus_all`; the previous `corpus` was never
# defined and raised NameError on a fresh kernel.
print(classification_report(y_test, y_pred, target_names=corpus_all.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.57      0.53      0.55       205
           comp.graphics       0.64      0.60      0.62       245
 comp.os.ms-windows.misc       0.68      0.63      0.65       250
comp.sys.ibm.pc.hardware       0.65      0.64      0.65       243
   comp.sys.mac.hardware       0.74      0.70      0.72       255
          comp.windows.x       0.74      0.78      0.76       240
            misc.forsale       0.77      0.77      0.77       249
               rec.autos       0.46      0.75      0.57       219
         rec.motorcycles       0.78      0.74      0.76       246
      rec.sport.baseball       0.83      0.83      0.83       227
        rec.sport.hockey       0.89      0.87      0.88       287
               sci.crypt       0.78      0.78      0.78       234
         sci.electronics       0.64      0.61      0.63       247
                 sci.med       0.82      0.81      0.82       250
         

In [82]:
from sklearn.linear_model import SGDClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report

# Same evaluation as a linear SVM, but trained with SGD on the hinge loss.
y_pred = OneVsRestClassifier(SGDClassifier(loss="hinge", random_state=0)).fit(X_train, y_train).predict(X_test)

# The dataset object is named `corpus_all`; the previous `corpus` was never
# defined and raised NameError on a fresh kernel.
print(classification_report(y_test, y_pred, target_names=corpus_all.target_names))

                          precision    recall  f1-score   support

             alt.atheism       0.57      0.52      0.55       205
           comp.graphics       0.70      0.54      0.61       245
 comp.os.ms-windows.misc       0.61      0.67      0.63       250
comp.sys.ibm.pc.hardware       0.71      0.44      0.54       243
   comp.sys.mac.hardware       0.55      0.73      0.63       255
          comp.windows.x       0.71      0.77      0.74       240
            misc.forsale       0.83      0.73      0.78       249
               rec.autos       0.78      0.64      0.70       219
         rec.motorcycles       0.72      0.76      0.74       246
      rec.sport.baseball       0.84      0.84      0.84       227
        rec.sport.hockey       0.91      0.86      0.88       287
               sci.crypt       0.81      0.75      0.78       234
         sci.electronics       0.58      0.64      0.60       247
                 sci.med       0.88      0.78      0.83       250
         

In [115]:
def clean_text(text, max_len=300):
    """Collapse whitespace in ``text`` and optionally shorten it for display.

    Parameters
    ----------
    text : str
        Raw text; leading/trailing whitespace is stripped and every run of
        whitespace (including newlines) is replaced by a single space.
    max_len : int or None, default 300
        Maximum number of characters to keep. A falsy value (``None`` or
        ``0``) disables truncation.

    Returns
    -------
    str
        The cleaned text. When it had to be cut, the first ``max_len``
        characters followed by ``" [...]"``.
    """
    text = " ".join(text.split())
    # Only mark the text as truncated when characters were actually dropped:
    # a text of exactly max_len characters is returned unchanged.
    if max_len and len(text) > max_len:
        text = text[:max_len] + " [...]"
    return text

def find_similar(text, topn=10, max_len=300):
    """Print the training documents most similar to ``text``.

    The query is pushed through the fitted ``pipeline`` and scored against
    every row of ``X_train`` with a dot product. Because the pipeline ends
    with a ``Normalizer``, the score should correspond to cosine similarity.

    Parameters
    ----------
    text : str
        Query document.
    topn : int, default 10
        Number of best-scoring training documents to print.
    max_len : int or None, default 300
        Passed to ``clean_text`` to shorten the printed texts; ``None``
        prints them in full.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    print("SEARCH TEXT:")
    print(clean_text(text, max_len))
    print("")
    x_search = pipeline.transform([text])
    # One query row against all training rows -> 1-D vector of scores.
    scores = x_search.dot(X_train.T)[0]
    # Indices of the topn highest scores, best first.
    index = np.argsort(scores)[::-1][:topn]
    for rank, i in enumerate(index, start=1):
        # Category names live on `corpus_all`; the previous `corpus` was
        # never defined and raised NameError on a fresh kernel.
        category = corpus_all.target_names[y_train[i]]
        print("RESULT %d [score: %0.2f, category: %s]:" % (rank, scores[i], category))
        print(clean_text(corpus_train[i], max_len))
        print("\n")

In [121]:
# Show the 3 training documents closest to test document 300; max_len=None
# prints the query and the results without truncation.
find_similar(corpus_test[300], topn=3, max_len=None)

SEARCH TEXT:
On the other hand, Rush made an interesting point: The Democrats ran one of their best campaigns in years against a pathetic Republican and a paranoiac and still only pulled 43% of the vote, lost 10 seats in the House, and gained 0 seats in the Senate. 1994 might be pretty interesting. Clueless of the world, take heart! 57% of the electorate is willing to vote for "a pathetic Republican and a paranoiac"!!

RESULT 1 [score: 0.67, category: talk.politics.guns]:
The Brady Bill passed the House in 1992, but failed to reach a vote in the Senate. As such, it never reached Bush. (Sarah Brady's condemnation not-withstanding). It'll probably pass the House again, and will probably pass the Senate if they can get it to a vote. Whether of not they'll be busy with other things will be the question. I don't expect gung-ho opposition on the part of Senate Republicans, since they won't want to over-use their fillibuster trump card.


RESULT 2 [score: 0.63, category: talk.politics.misc]:
