In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from time import time

In [376]:
from collections import Counter

In [155]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [95]:
import string
import pandas as pd
import numpy as np

In [324]:
from save_load import save_obj,load_obj

In [275]:
n_samples = len(data_samples)  # 2000
n_features = 1000
n_components = 10
n_top_words = 20

## Load Data generated from `clean_corpus.ipynb`.

In [310]:

mod_doc = load_obj('mod_doc')

In [311]:
mod_doc[0]

'sure story nad biased  disagree statement u.s. media  ruin israels reputation rediculous u.s. media  pro israeli medium world live europe  realize incidence describe  letter occur u.s. medium try  ignore u.s. subsidize israels existance  europeans degree think  reason report clearly  atrocity  shame austria daily report  inhuman act commit israeli soldier blessing  receive government holocaust guilt  away look jews treat race  power unfortunate '

In [319]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10, 
                                strip_accents = 'unicode',
                                max_features=5000,
                                stop_words='english', lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b')

In [320]:
t0 = time()
# tf = tf_vectorizer.fit_transform(mod_doc)
tf = tf_vectorizer.fit_transform(mod_doc)
print("done in %0.3fs." % (time() - t0))
print()

done in 2.378s.



In [321]:
# https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [422]:
def rank_words(vectorizer,tf):
    word_counts = tf.sum(axis=1)
    word_counts = np.asarray(word_counts.ravel())[0].tolist()
    words = tf_vectorizer.get_feature_names()
    
    words_list = list(zip(words, word_counts))
    
    return sorted(words_list, key= lambda t : t[1], reverse=True)

In [405]:
def common_words(vectorizer):
    words_dict = tf_vectorizer.vocabulary_
    return sorted(words_dict.items(), key= lambda t : t[1], reverse=True)

In [406]:
common_words(tf_vectorizer)[:10]

[('zoom', 4999),
 ('zone', 4998),
 ('zip', 4997),
 ('zionist', 4996),
 ('zionism', 4995),
 ('zero', 4994),
 ('zealand', 4993),
 ('yzerman', 4992),
 ('yup', 4991),
 ('youth', 4990)]

In [398]:
top_ten = rank_words(tf_vectorizer, tf)[:10]

In [399]:
top_ten 

[('highly', 4102),
 ('abs', 3909),
 ('score', 3901),
 ('santa', 3650),
 ('table', 3551),
 ('interface', 3336),
 ('desirable', 2792),
 ('lisp', 2055),
 ('hotel', 2035),
 ('friendly', 1896)]

In [407]:
lda = LatentDirichletAllocation(n_components=20, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 10)

done in 27.367s.

Topics in LDA model:
Topic #0: file use program window windows run version image display color
Topic #1: people gun right state law israel government kill war country
Topic #2: armenian armenians turkish people turkey russian kill woman greek armenia
Topic #3: president congress vote fbi house report press myers white koresh
Topic #4: edu com information mail send list available file include address
Topic #5: space launch nasa satellite earth air orbit mission program project
Topic #6: key use government encryption chip public clipper security law privacy
Topic #7: car use power ground engine good wire speed light high
Topic #8: god jesus believe christian people bible religion church man life
Topic #9: game team play player win season hockey league period score
Topic #10: bike keyboard motorcycle ride rider left turn shift won rid
Topic #11: disease study medical health offer condition patient sale drug new
Topic #12: know think use like good point post question peop

In [408]:
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [455]:
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=50, 
                                   max_features=5000,
                                   stop_words='english', lowercase=True)

In [456]:
t0 = time()
# tf = tf_vectorizer.fit_transform(mod_doc)
tfidf = tfidf_vectorizer.fit_transform(mod_doc)
print("done in %0.3fs." % (time() - t0))
print()

done in 2.777s.



In [457]:
common_words(tfidf_vectorizer)[:10]

[('zoom', 4999),
 ('zone', 4998),
 ('zip', 4997),
 ('zionist', 4996),
 ('zionism', 4995),
 ('zero', 4994),
 ('zealand', 4993),
 ('yzerman', 4992),
 ('yup', 4991),
 ('youth', 4990)]

In [458]:
rank_words(tfidf_vectorizer, tfidf)[:10]

[('interface', 15.64036715885867),
 ('table', 14.360831025528324),
 ('bed', 14.208717306092124),
 ('highly', 14.047837349802867),
 ('desirable', 13.919189819005037),
 ('military', 13.866216206038965),
 ('answer', 13.80428634018271),
 ('relief', 13.7998117236344),
 ('abs', 13.688291320345177),
 ('micro', 13.033006118050238)]

In [459]:
lda_idf = LatentDirichletAllocation(n_components=20, max_iter=5,
                                learning_method='batch',
                                learning_offset=10.,
                                random_state=0)
t0 = time()
lda_idf.fit_transform(tfidf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
print_top_words(lda_idf, tfidf_feature_names, 10)

done in 18.582s.

Topics in LDA model:
Topic #0: 00 sale condition offer shipping sell new good battery manual
Topic #1: space nasa year launch president think orbit people use 1993
Topic #2: msg gordon pitt surrender banks geb intellect cadre shameful chastity
Topic #3: lc list mailing edu hello thank adams ii jack information
Topic #4: game team play hockey playoff season player nhl leafs year
Topic #5: jews people think jewish know israel islam koresh good right
Topic #6: tank think plant water guess gas trial brian game regards
Topic #7: drive good car year use like know think time new
Topic #8: jet auto film vs old wear spare like pain chemical
Topic #9: government key gun people right law use encryption state phone
Topic #10: god people jesus think believe know christian bible life thing
Topic #11: key chip fbi warrant batf test escrow clipper video card
Topic #12: armenian armenians turkish turkey armenia turks genocide massacre serdar soviet
Topic #13: joke period pp lewis goal

In [460]:
t0 = time()
tfdif_vis = pyLDAvis.sklearn.prepare(lda_idf, tfidf, tfidf_vectorizer,sort_topics=False)
print("done in %0.3fs." % (time() - t0))

done in 3.079s.


In [461]:
tfdif_vis

## For Single topic

In [226]:
ten_doc = np.array(list(map(spacy_tokenizer,data_samples[:10])))
ten_doc[0]

'sure story nad biased  disagree statement u.s. media  ruin israels reputation rediculous u.s. media  pro israeli medium world live europe  realize incidence describe  letter occur u.s. medium try  ignore u.s. subsidize israels existance  europeans degree think  reason report clearly  atrocity  shame austria daily report  inhuman act commit israeli soldier blessing  receive government holocaust guilt  away look jews treat race  power unfortunate '

In [244]:
ten_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, 
                                strip_accents = 'unicode',
                                max_features=3000,
                                stop_words='english', lowercase=True,
                                token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
ten_tf = ten_tf_vectorizer.fit_transform(ten_doc)

In [245]:
lda = LatentDirichletAllocation(n_components=10, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [246]:
lda.fit(ten_tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [247]:
ten_tf_feature_names = ten_tf_vectorizer.get_feature_names()
print_top_words(lda, ten_tf_feature_names, 5)

Topic #0: read accept maybe expect run
Topic #1: know table maybe like israeli
Topic #2: table use like treat product
Topic #3: end year price real product
Topic #4: expect biased change new treat
Topic #5: work like want year wonder
Topic #6: israel question like realize work
Topic #7: israeli government power jews look
Topic #8: let change right new certainly
Topic #9: real certainly like run right



In [200]:
print_top_words(lda, single_tf_feature_names, 5)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [238]:
data_samples[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [248]:
ten_topic = lda.transform(ten_tf)

In [250]:
for n in range(ten_topic.shape[0]):
    topic_most_pr = ten_topic[n].argmax()
    print("doc: {} topic: {}\n".format(n,topic_most_pr))

doc: 0 topic: 7

doc: 1 topic: 0

doc: 2 topic: 6

doc: 3 topic: 3

doc: 4 topic: 8

doc: 5 topic: 0

doc: 6 topic: 9

doc: 7 topic: 1

doc: 8 topic: 2

doc: 9 topic: 5



In [251]:
print_top_words(lda, ten_tf_feature_names, 5)

Topic #0: read accept maybe expect run
Topic #1: know table maybe like israeli
Topic #2: table use like treat product
Topic #3: end year price real product
Topic #4: expect biased change new treat
Topic #5: work like want year wonder
Topic #6: israel question like realize work
Topic #7: israeli government power jews look
Topic #8: let change right new certainly
Topic #9: real certainly like run right



In [252]:
ten_doc[2]

'realize principle strong  point like know ask question  sort arab country  want continue think tank charade  fixation israel stop  start ask  sort question arab country  realize  work arab country treatment jews  decade bad fixation israel  begin look like biased attack  group recognize stupid center  policy research fancy bigot  hat israel'

In [255]:
(lda.transform(ten_tf_vectorizer.transform([ten_doc[0]]))).argmax()

7