In [133]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
from time import time

In [155]:
import pyLDAvis
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()

In [95]:
import string
import pandas as pd
import numpy as np

In [55]:
from spacy.lang.en.stop_words import STOP_WORDS
import spacy
from spacy.lemmatizer import Lemmatizer
from spacy.lang.en import English
nlp=spacy.load('en_core_web_lg')


In [84]:
STOPS = set(string.punctuation) | STOP_WORDS

In [140]:
dataset = fetch_20newsgroups(shuffle=True, 
                             random_state=1, 
                             remove=('headers', 'footers', 'quotes'))

In [162]:
n_samples = 2000
n_features = 1000
n_components = 10
n_top_words = 20

In [163]:
data_samples = dataset.data[:n_samples]
type(data_samples)

list

In [164]:
len(data_samples)

11314

In [165]:
data_samples[0]

"Well i'm not sure about the story nad it did seem biased. What\nI disagree with is your statement that the U.S. Media is out to\nruin Israels reputation. That is rediculous. The U.S. media is\nthe most pro-israeli media in the world. Having lived in Europe\nI realize that incidences such as the one described in the\nletter have occured. The U.S. media as a whole seem to try to\nignore them. The U.S. is subsidizing Israels existance and the\nEuropeans are not (at least not to the same degree). So I think\nthat might be a reason they report more clearly on the\natrocities.\n\tWhat is a shame is that in Austria, daily reports of\nthe inhuman acts commited by Israeli soldiers and the blessing\nreceived from the Government makes some of the Holocaust guilt\ngo away. After all, look how the Jews are treating other races\nwhen they got power. It is unfortunate.\n"

In [166]:
#!python -m spacy download en_core_web_lg
# !pip install pyldavis

In [257]:
spacy.displacy.render(nlp(data_samples[3]), style='ent',jupyter=True)

In [120]:
for token in nlp(data_samples[0])[:10]:
    print(token,token.lemma_)

Well well
i -PRON-
'm be
not not
sure sure
about about
the the
story story
nad nad
it -PRON-


In [88]:
def spacy_tokenizer(corpus):
    doc = nlp(corpus)
    
    mod_doc = []
    for token in doc:
        if token.lemma_ != "-PRON-":
            word = token.lemma_.lower().strip()
            if word not in STOPS:
                mod_doc.append(word)
    
    return ' '.join(mod_doc)

In [124]:
np.array(list(map(spacy_tokenizer,data_samples[:3])))

array(['sure story nad biased  disagree statement u.s. media  ruin israels reputation rediculous u.s. media  pro israeli medium world live europe  realize incidence describe  letter occur u.s. medium try  ignore u.s. subsidize israels existance  europeans degree think  reason report clearly  atrocity  shame austria daily report  inhuman act commit israeli soldier blessing  receive government holocaust guilt  away look jews treat race  power unfortunate ',
       " yeah expect people read faq etc actually accept hard  atheism  need little leap faith jimmy  logic run  steam  jim  sorry pity jim  sorry feeling  denial faith need  oh pretend  end happily  maybe start new newsgroup  alt.atheist.hard bummin  bye bye big jim  forget flintstone 's chewables  :)  --  bake timmons iii",
       'realize principle strong  point like know ask question  sort arab country  want continue think tank charade  fixation israel stop  start ask  sort question arab country  realize  work arab country treatme

In [167]:
t0 = time()
mod_doc = np.array(list(map(spacy_tokenizer,data_samples)))
print("done in %0.3fs." % (time() - t0))

done in 657.700s.


In [169]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=10, 
                                strip_accents = 'unicode',
                                max_features=n_features,
                                stop_words='english', lowercase=True,
                                token_pattern=r'\b[a-zA-Z]{3,}\b')

In [170]:
t0 = time()
# tf = tf_vectorizer.fit_transform(mod_doc)
tf = tf_vectorizer.fit_transform(mod_doc)
print("done in %0.3fs." % (time() - t0))
print()

done in 5.136s.



In [151]:
# https://scikit-learn.org/stable/auto_examples/applications/plot_topics_extraction_with_nmf_lda.html#sphx-glr-auto-examples-applications-plot-topics-extraction-with-nmf-lda-py
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]])
        print(message)
    print()

In [173]:
lda = LatentDirichletAllocation(n_components=20, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 20)

done in 21.120s.

Topics in LDA model:
Topic #0: use year program science high work test project provide result study research large cost report design time include level technology
Topic #1: think people like know way come talk president world good want look thing let yes time tell day kill country
Topic #2: god believe people jesus know think christian life religion bible good true thing question church evidence exist claim way argument
Topic #3: armenian turkish armenians russian turkey government greek history turks armenia million genocide military states arm army united letter world war
Topic #4: space nasa launch satellite gov mission orbit earth radio shuttle moon station lunar flight surface ray sun rocket source datum
Topic #5: file use program window image version available ftp windows display run entry server set application include code output edu user
Topic #6: book record king period shot van buf circuit average count difference save san flame los penalty goal performanc

In [157]:
lda = LatentDirichletAllocation(n_components=10, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, 10)

done in 3.593s.

Topics in LDA model:
Topic #0: game play team win player year good --- period season
Topic #1: maria excellent house poor condition btw soon tape field chance
Topic #2: people think know like use good law thing government right
Topic #3: key space use year probe mission chip launch earth moon
Topic #4: car good new buy brake bike year sell engine oil
Topic #5: health hiv point disease food aids number study care research
Topic #6: drive use disk card problem work hard run know controller
Topic #7: edu file mail com send program version include ftp available
Topic #8: god jesus bible church christian faith christ sin believe christians
Topic #9: people know come time like think leave woman kill happen



In [174]:
pyLDAvis.sklearn.prepare(lda, tf, tf_vectorizer)

## For Single topic

In [226]:
ten_doc = np.array(list(map(spacy_tokenizer,data_samples[:10])))
ten_doc[0]

'sure story nad biased  disagree statement u.s. media  ruin israels reputation rediculous u.s. media  pro israeli medium world live europe  realize incidence describe  letter occur u.s. medium try  ignore u.s. subsidize israels existance  europeans degree think  reason report clearly  atrocity  shame austria daily report  inhuman act commit israeli soldier blessing  receive government holocaust guilt  away look jews treat race  power unfortunate '

In [244]:
ten_tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, 
                                strip_accents = 'unicode',
                                max_features=3000,
                                stop_words='english', lowercase=True,
                                token_pattern=r'[a-zA-Z\-][a-zA-Z\-]{2,}')
ten_tf = ten_tf_vectorizer.fit_transform(ten_doc)

In [245]:
lda = LatentDirichletAllocation(n_components=10, max_iter=5,
                                learning_method='online',
                                learning_offset=50.,
                                random_state=0)

In [246]:
lda.fit(ten_tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [247]:
ten_tf_feature_names = ten_tf_vectorizer.get_feature_names()
print_top_words(lda, ten_tf_feature_names, 5)

Topic #0: read accept maybe expect run
Topic #1: know table maybe like israeli
Topic #2: table use like treat product
Topic #3: end year price real product
Topic #4: expect biased change new treat
Topic #5: work like want year wonder
Topic #6: israel question like realize work
Topic #7: israeli government power jews look
Topic #8: let change right new certainly
Topic #9: real certainly like run right



In [200]:
print_top_words(lda, single_tf_feature_names, 5)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=50.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=0, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [238]:
data_samples[1]

"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [248]:
ten_topic = lda.transform(ten_tf)

In [249]:
for i in ten_topic:
    print(i)

[0.00833476 0.00833483 0.00833493 0.00833485 0.00833492 0.00833488
 0.00833529 0.92498604 0.00833483 0.00833468]
[0.90998121 0.0100022  0.01000185 0.01000229 0.01000222 0.01000204
 0.0100018  0.01000194 0.01000201 0.01000245]
[0.00555638 0.00555665 0.00555649 0.00555633 0.00555639 0.00555674
 0.94999168 0.00555658 0.00555638 0.00555638]
[0.00833479 0.0083348  0.00833512 0.92498526 0.00833494 0.0083353
 0.00833493 0.00833489 0.00833506 0.00833492]
[0.01111314 0.01111318 0.01111301 0.01111304 0.01111307 0.01111327
 0.01111266 0.01111296 0.89998207 0.0111136 ]
[0.77494915 0.02500432 0.0250046  0.02500511 0.02500474 0.02500743
 0.02500616 0.02500592 0.02500554 0.02500701]
[0.01250271 0.01250318 0.01250278 0.01250224 0.01250207 0.01250241
 0.01250276 0.01250219 0.01250349 0.88747615]
[0.00500092 0.95499185 0.00500103 0.00500082 0.00500076 0.00500086
 0.00500103 0.00500086 0.0050009  0.00500096]
[0.0100011  0.01000219 0.90998546 0.01000179 0.01000133 0.01000202
 0.01000185 0.01000132 0.01000

In [250]:
for n in range(ten_topic.shape[0]):
    topic_most_pr = ten_topic[n].argmax()
    print("doc: {} topic: {}\n".format(n,topic_most_pr))

doc: 0 topic: 7

doc: 1 topic: 0

doc: 2 topic: 6

doc: 3 topic: 3

doc: 4 topic: 8

doc: 5 topic: 0

doc: 6 topic: 9

doc: 7 topic: 1

doc: 8 topic: 2

doc: 9 topic: 5



In [251]:
print_top_words(lda, ten_tf_feature_names, 5)

Topic #0: read accept maybe expect run
Topic #1: know table maybe like israeli
Topic #2: table use like treat product
Topic #3: end year price real product
Topic #4: expect biased change new treat
Topic #5: work like want year wonder
Topic #6: israel question like realize work
Topic #7: israeli government power jews look
Topic #8: let change right new certainly
Topic #9: real certainly like run right



In [252]:
ten_doc[2]

'realize principle strong  point like know ask question  sort arab country  want continue think tank charade  fixation israel stop  start ask  sort question arab country  realize  work arab country treatment jews  decade bad fixation israel  begin look like biased attack  group recognize stupid center  policy research fancy bigot  hat israel'

In [255]:
(lda.transform(ten_tf_vectorizer.transform([ten_doc[0]]))).argmax()

7