In [1]:
from lda2vec import preprocess, Corpus
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline

# seaborn is optional: importing it only restyles matplotlib figures.
# Catch ImportError specifically so that Ctrl-C or a genuine failure inside
# the import is not silently swallowed the way a bare `except:` would.
try:
    import seaborn
except ImportError:
    pass

# Run lda2vec before this:

# Topic description and distribution of lda2vec

In [48]:
# NOTE(review): absolute, machine-specific path -- move it to a configurable
# data directory before sharing this notebook.
topics_path = '/Users/vijethlomada/Documents/6007/vagrant/p2/topics.pyldavis.npz'

# Hand np.load the path itself so the archive is opened in binary mode, and
# use the context manager so the underlying file handle is closed once every
# array has been read into memory.
with np.load(topics_path) as npz:
    # `npz.files` lists the stored array names; indexing loads each array eagerly.
    dat = {k: npz[k] for k in npz.files}

# The vocabulary is indexed term by term further down; keep it as a plain list.
dat['vocab'] = dat['vocab'].tolist()

In [49]:
# List the highest-weighted vocabulary terms of every lda2vec topic and keep
# them in `topic_to_topwords` (topic index -> list of terms).
top_n = 10  # number of terms reported per topic
topic_to_topwords = {}
vocab = dat['vocab']  # hoisted: the lookup does not change inside the loop
for j, topic_to_word in enumerate(dat['topic_term_dists']):
    # argsort is ascending, so reverse it and keep the first top_n indices.
    top = np.argsort(topic_to_word)[::-1][:top_n]
    # Strip surrounding whitespace and cap each term at 35 characters.
    top_words = [vocab[i].strip()[:35] for i in top]
    msg = 'Topic %i %s' % (j, ' '.join(top_words))
    # Single-argument call form prints identically on Python 2 and Python 3.
    print(msg)
    topic_to_topwords[j] = top_words

Topic 0 out_of_vocabulary motif windows string imake gcc colormap code widget openwindows
Topic 1 out_of_vocabulary satellites ankara satellite hiv istanbul ve launch aerospace launches
Topic 2 out_of_vocabulary secure rsa nsa clipper morality atheists escrow denning cryptography
Topic 3 out_of_vocabulary atheists morality god atheism moral doctrines eternal morals islam
Topic 4 out_of_vocabulary dog he him she her i watching playing was
Topic 5 out_of_vocabulary atheists morality god atheism moral eternal religious doctrines religions
Topic 6 out_of_vocabulary agencies escrow secure escrowed rsa nsa clipper government encryption
Topic 7 ankara out_of_vocabulary istanbul armenian ottoman turkish turkey azerbaijan kurds villages
Topic 8 out_of_vocabulary windows motif string <SKIP> { os mac 3.1 beta
Topic 9 out_of_vocabulary quadra scsi cable simms drive <SKIP> accelerator mac backup
Topic 10 out_of_vocabulary rangers teams hockey game quebec coach keenan players hitter
Topic 11 out_of_

## Using topic distribution as features for classification

In [24]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; prefer its replacement and fall back only on very old installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
from sklearn.datasets import fetch_20newsgroups
from sklearn.preprocessing import scale
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

# Same fetch options as the corpus is loaded with elsewhere in this notebook.
remove = ('headers', 'footers', 'quotes')

# Features: the document/topic weights exported by lda2vec.
# NOTE(review): assumes the rows of `doc_topic_dists` are in the same order as
# the 20 Newsgroups training documents -- confirm against the lda2vec run.
X = dat["doc_topic_dists"]
y = fetch_20newsgroups(subset='train', remove=remove).target

# Fixed seed so the hold-out split (and the accuracy computed from it) can be
# reproduced on a fresh kernel.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)



In [25]:

# Random forest with 1000 trees on the topic-weight features. The seed is
# fixed because an unseeded forest gives a different accuracy on every run.
rf = RandomForestClassifier(n_estimators=1000, random_state=0)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# Last expression of the cell: the hold-out accuracy is shown as cell output.
accuracy_score(y_test, y_pred)

0.46836337928596677

In [26]:
# `sklearn.cross_validation` was removed in scikit-learn 0.20; use
# `model_selection` when available and fall back only on very old installs.
try:
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import cross_val_score

# 5-fold cross-validated accuracy of the forest on the lda2vec features.
scores = cross_val_score(rf, X, y, cv=5)

# Accuracy lda2vec:

In [28]:
# Mean over however many folds were run, instead of dividing by a hard-coded 5
# that has to be kept in sync with the `cv` argument by hand.
np.mean(scores)

0.46411938475298153

-------------

# Topic distribution and description for LDA

In [1]:
from gensim import corpora, models
from sklearn.datasets import fetch_20newsgroups
import numpy as np
import logging
import stop_words
from string import punctuation, digits
stop_w = stop_words.get_stop_words("english")
import logging

# Extend the packaged English stop list with corpus-specific extras.
# A new list is built (rather than extending in place) so the list returned by
# the stop_words package is left untouched. Order and duplicates are kept.
stop_w = stop_w + [
    # conversational filler and contractions
    "say", "we're", "said", "things", "becae", "jt", "it's", "one", "like",
    "people", "going", "know", "that's", "think", "see", "really", "get",
    "would", "i'm", "don't", "us", "actually", "may", "always", "found",
    "fact", "lost", "you've", "end",
    # more low-content words
    "sided", "something", "thing", "got", "also", "we've", "there's", "time",
    "well", "way", "want", "could", "first", "two", "new", "they're",
    "you're", "take", "back", "need", "many", "kind", "ever", "four", "five",
    "used", "maybe", "start",
    # pronouns, auxiliaries and single digits
    "you", "it", "will", "can",
    "0", "1", "2", "3", "4", "5", "6", "7", "8", "9",
    "just", "them", "now", "me",
    # long repeated-character tokens and stray single letters
    "MAXAXAXAXAXAXAXAXAXAXAXAXAXAXAX",
    "MG9VG9VG9VG9VG9VG9VG9VG9VG9VG9VG9VG9VG9VG9VG9V",
    "Q", "o", "R", "L", "said", "M",
]

## Data loading and preprocessing

In [2]:
# Configure the root logger with the default handler.
logging.basicConfig()

# Fetch the 20 Newsgroups training split, asking scikit-learn to remove
# headers, footers and quoted replies from every post.
remove = ('headers', 'footers', 'quotes')
newsgroups_train = fetch_20newsgroups(subset='train', remove=remove)
texts = newsgroups_train.data

def clean(line, stopwords=None):
    """Drop stop words from `line`, then strip punctuation and digits.

    Tokens are split on whitespace and compared case-insensitively against
    the stop list. The stop-list entries are lower-cased as well: the token
    side was already lower-cased, so upper-case entries such as "Q" could
    previously never match anything.

    Parameters
    ----------
    line : str
        Raw document text.
    stopwords : iterable of str, optional
        Words to remove. Defaults to the module-level `stop_w` list.

    Returns
    -------
    str
        The surviving tokens joined by single spaces, with every ASCII
        punctuation character and digit removed. Tokens made up only of such
        characters leave an empty slot, so consecutive spaces can occur.
    """
    if stopwords is None:
        stopwords = stop_w
    # Set membership is O(1) per token; scanning the list was O(len(list)).
    blocked = frozenset(word.lower() for word in stopwords)
    kept = ' '.join(w for w in line.split() if w.lower() not in blocked)
    unwanted = punctuation + digits
    return "".join(char for char in kept if char not in unwanted)
# Preprocess data
# NOTE(review): max_length is not referenced by any cell visible here.
max_length = 10000   # Limit of 10k words per document
# Convert to unicode (spaCy only works with unicode). The `unicode` builtin
# exists only on Python 2; on Python 3 `str` already is the unicode text type.
try:
    text_type = unicode
except NameError:
    text_type = str
documents = [text_type(clean(d)) for d in texts]

# Count vectorizer

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

# Size cap for the bag-of-words vocabulary.
no_features = 1000

# Raw term counts for LDA: terms present in more than 95% of the documents or
# in fewer than 2 documents are ignored, as are scikit-learn's built-in
# English stop words.
tf_vectorizer = CountVectorizer(
    max_df=0.95,
    min_df=2,
    max_features=no_features,
    stop_words='english',
)
tf = tf_vectorizer.fit_transform(documents)

# Column index -> term, used to label topics when they are printed.
tf_feature_names = tf_vectorizer.get_feature_names()

# LDA

In [45]:

from sklearn.decomposition import LatentDirichletAllocation

no_topics = 20

# Settings shared by both spellings of the topic-count argument below.
lda_params = dict(max_iter=5, learning_method='online',
                  learning_offset=50., random_state=0)

# The topic-count argument was renamed from `n_topics` to `n_components` in
# scikit-learn 0.19 and the old name was later removed. Try the current name
# first; an unknown keyword raises TypeError on releases that predate it.
try:
    lda = LatentDirichletAllocation(n_components=no_topics, **lda_params)
except TypeError:
    lda = LatentDirichletAllocation(n_topics=no_topics, **lda_params)

# fit() returns the estimator itself, so `lda` stays bound to the fitted model.
lda = lda.fit(tf)


def display_topics(model, feature_names, no_top_words):
    """Print the highest-weighted terms of every topic, one line per topic.

    Parameters
    ----------
    model : object
        Fitted topic model exposing `components_`, an iterable holding one
        array of term weights per topic.
    feature_names : sequence of str
        Term for each column index of `components_`.
    no_top_words : int
        Number of terms to list per topic.

    Returns
    -------
    None
        Each line has the form "Topic <idx>: <term> <term> ...".
    """
    for topic_idx, topic in enumerate(model.components_):
        # argsort is ascending; the reversed slice walks back from the largest
        # weight and stops after no_top_words entries.
        top_indices = topic.argsort()[:-no_top_words - 1:-1]
        top_terms = " ".join(feature_names[i] for i in top_indices)
        # One pre-formatted argument keeps the output identical on Python 2
        # and Python 3.
        print("Topic %d: %s" % (topic_idx, top_terms))

# Print the ten highest-weighted terms of each fitted scikit-learn LDA topic.
no_top_words = 10
display_topics(model=lda, feature_names=tf_feature_names, no_top_words=no_top_words)

Topic 0: government law gun public use states state right encryption rights
Topic 1: space data research program technology center nasa launch earth science
Topic 2: year team game season hockey win players league la vs
Topic 3: test st south posting miles fit search american west th
Topic 4: believe question point israel argument law different answer claim mean
Topic 5: problem sale offer problems condition keyboard driver best works work
Topic 6: information available list software package email university mail send computer
Topic 7: drive card disk scsi db mac hard video drives monitor
Topic 8: key chip bit window using keys use memory mhz chips
Topic 9: god jesus church christian religion christ world life bible gods
Topic 10: thanks email help post anybody looking advance hi bike send
Topic 11: car price buy ground cars work power pay cost wire
Topic 12: ago power game play water pts games hot baseball dave
Topic 13: maxaxaxaxaxaxaxaxaxaxaxaxaxaxax john dod according bob worked rd

## Using Gensim's implementation

In [6]:

# Whitespace-tokenise every cleaned document.
docs = [document.split() for document in documents]

# Map each distinct token to an integer id, then encode every document as a
# bag of words using those ids.
dictionary = corpora.Dictionary(docs)
corpus = [dictionary.doc2bow(tokens) for tokens in docs]

# 20-topic LDA trained with 20 passes over the corpus.
ldamodel = models.ldamodel.LdaModel(
    corpus,
    num_topics=20,
    id2word=dictionary,
    passes=20,
)

In [7]:
# Dense document-by-topic feature matrix for the gensim model. The column
# count is read from the model so it cannot drift from the number of topics
# the model was trained with.
X = np.zeros((len(documents), ldamodel.num_topics))
for i, d in enumerate(corpus):
    # Indexing the model with a bag of words yields (topic_id, weight) pairs.
    # Topics that are not returned for a document keep their initial 0.
    topic_vec = ldamodel[d]
    for j, weight in topic_vec:
        X[i, j] = weight


In [9]:
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed
# in 0.20; prefer `model_selection` and fall back only on very old installs.
try:
    from sklearn.model_selection import train_test_split
    from sklearn.model_selection import cross_val_score
except ImportError:
    from sklearn.cross_validation import train_test_split
    from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import scale
from sklearn.ensemble import RandomForestClassifier

y = fetch_20newsgroups(subset='train', remove=remove).target

# Fixed seed so the cross-validated accuracy is reproducible between runs.
rf = RandomForestClassifier(n_estimators=1000, random_state=0)

# 5-fold cross-validated accuracy on the gensim LDA topic features.
scores = cross_val_score(rf, X, y, cv=5)