In [1]:
import numpy as np

from gensim import matutils
from gensim.models.ldamodel import LdaModel
from sklearn import linear_model
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer


In [24]:


def print_features(clf, vocab, n=10):
    """ Print sorted list of non-zero features/weights. """
    coef = clf.coef_[0]
    print('positive features: %s' % (' '.join(['%s/%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[::-1][:n] if coef[j] > 0])))
    print('negative features: %s' % (' '.join(['%s/%.2f' % (vocab[j], coef[j]) for j in np.argsort(coef)[:n] if coef[j] < 0])))


def fit_classifier(X, y, C=0.1):
    """ Fit L1 Logistic Regression classifier. """
    # Smaller C means fewer features selected.
    clf = linear_model.LogisticRegression(penalty='l1', C=C)
    clf.fit(X, y)
    return clf


def fit_lda(X, vocab, num_topics=5, passes=20):
    """ Fit LDA from a scipy CSR matrix (X). """
    print('fitting lda...')
    return LdaModel(matutils.Sparse2Corpus(X), num_topics=num_topics,
                    passes=passes,
                    id2word=dict([(i, s) for i, s in enumerate(vocab)]))


In [25]:

# Load data.
rand = np.random.mtrand.RandomState(8675309)
cats = ['rec.sport.baseball', 'sci.crypt']
data = fetch_20newsgroups(subset='train',categories=cats,shuffle=True, random_state=rand)

vec = CountVectorizer(min_df=10, stop_words='english')
X = vec.fit_transform(data.data)
vocab = vec.get_feature_names()

# Fit classifier.
clf = fit_classifier(X, data.target)
print_features(clf, vocab)


positive features: clipper/1.50 code/1.24 key/1.04 encryption/0.95 government/0.37 chip/0.37 nsa/0.37 uk/0.36 org/0.23 cryptography/0.23
negative features: baseball/-1.32 game/-0.71 year/-0.61 team/-0.38 edu/-0.27 games/-0.27 players/-0.23 ball/-0.17 season/-0.14 phillies/-0.11


In [26]:

# Fit LDA.
lda = fit_lda(X, vocab)


fitting lda...


In [27]:
 print(lda.show_topics())

[(0, '0.020*"biochem" + 0.019*"cf" + 0.016*"generate" + 0.012*"17" + 0.007*"america" + 0.007*"authentication" + 0.007*"boggs" + 0.007*"d1" + 0.006*"expected" + 0.006*"contact"'), (1, '0.058*"contains" + 0.051*"digitized" + 0.036*"cf" + 0.026*"everybody" + 0.023*"gajarsky" + 0.020*"correct" + 0.017*"bound" + 0.016*"fenway" + 0.012*"announcement" + 0.012*"51"'), (2, '0.057*"correct" + 0.055*"contains" + 0.033*"cf" + 0.028*"er" + 0.022*"awful" + 0.021*"general" + 0.021*"famous" + 0.021*"162" + 0.020*"anderson" + 0.020*"asbestos"'), (3, '0.012*"dos" + 0.011*"disagree" + 0.010*"able" + 0.010*"features" + 0.010*"allegheny" + 0.010*"brothers" + 0.010*"electronically" + 0.010*"expansion" + 0.009*"entire" + 0.008*"c6"'), (4, '0.023*"fri" + 0.021*"d3" + 0.013*"gold" + 0.012*"close" + 0.009*"creation" + 0.008*"convert" + 0.008*"354" + 0.008*"att" + 0.008*"fresh" + 0.008*"funds"')]
