In [8]:
import pandas as pd
import re
import numpy as np
from sklearn.datasets import fetch_20newsgroups
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from gensim import corpora, models
from gensim.models.ldamulticore import LdaMulticore
from gensim.models.coherencemodel import CoherenceModel
import pyLDAvis

In [9]:
newsgroups_train = fetch_20newsgroups(subset='train')

df = pd.DataFrame({'post': newsgroups_train['data'], 'target': newsgroups_train['target']})
df['target_names'] = df['target'].apply(lambda t: newsgroups_train['target_names'][t])
df.head()

Unnamed: 0,post,target,target_names
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space


In [10]:
def remove_urls(text):
    " removes urls"
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'', text)
    
def remove_html(text):
    " removes html tags"
    html_pattern = re.compile('')
    return html_pattern.sub(r'', text)

def remove_emails(text):
    email_pattern = re.compile('\S*@\S*\s?')
    return email_pattern.sub(r'', text)

def remove_new_line(text):
    return re.sub('\s+', ' ', text)

def remove_non_alpha(text):
    return re.sub("[^A-Za-z]+", ' ', str(text))

def preprocess_text(text):
    t = remove_urls(text)
    t = remove_html(t)
    t = remove_emails(t)
    t = remove_new_line(t)
    t = remove_non_alpha(t)
    return t

def lemmatize_words(text, lemmatizer):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

def remove_stopwords(text, stopwords):
    return " ".join([word for word in str(text).split() if word not in stopwords])


df['post_preprocessed'] = df['post'].apply(preprocess_text).str.lower()

print('lemming...')
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()
df['post_final'] = df['post_preprocessed'].apply(lambda post: lemmatize_words(post, lemmatizer))

print('remove stopwors...')

nltk.download('stopwords')
swords = set(stopwords.words('english'))

df['post_final'] = df['post_preprocessed'].apply(lambda post: remove_stopwords(post, swords))
df.head()

  email_pattern = re.compile('\S*@\S*\s?')
  return re.sub('\s+', ' ', text)


lemming...


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\dvija\AppData\Roaming\nltk_data...


remove stopwors...


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dvija\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,post,target,target_names,post_preprocessed,post_final
0,From: lerxst@wam.umd.edu (where's my thing)\nS...,7,rec.autos,from where s my thing subject what car is this...,thing subject car nntp posting host rac wam um...
1,From: guykuo@carson.u.washington.edu (Guy Kuo)...,4,comp.sys.mac.hardware,from guy kuo subject si clock poll final call ...,guy kuo subject si clock poll final call summa...
2,From: twillis@ec.ecn.purdue.edu (Thomas E Will...,4,comp.sys.mac.hardware,from thomas e willis subject pb questions orga...,thomas e willis subject pb questions organizat...
3,From: jgreen@amber (Joe Green)\nSubject: Re: W...,1,comp.graphics,from joe green subject re weitek p organizatio...,joe green subject weitek p organization harris...
4,From: jcm@head-cfa.harvard.edu (Jonathan McDow...,14,sci.space,from jonathan mcdowell subject re shuttle laun...,jonathan mcdowell subject shuttle launch quest...


In [11]:
posts = [x.split(' ') for x in df['post_final']]
id2word = corpora.Dictionary(posts)
corpus_tf = [id2word.doc2bow(text) for text in posts]
print(corpus_tf[0])

[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 5), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 1), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1)]


In [12]:
tfidf = models.TfidfModel(corpus_tf)
corpus_tfidf = tfidf[corpus_tf]
print(corpus_tfidf[0])

[(0, 0.11500201251132267), (1, 0.0971944436933538), (2, 0.10252640906710879), (3, 0.22953112112723342), (4, 0.11226181333665931), (5, 0.17231798689958852), (6, 0.07530879775427897), (7, 0.4309981461531588), (8, 0.08878613252802381), (9, 0.04578595800456591), (10, 0.07091621090714591), (11, 0.1222797634399791), (12, 0.1452632338168105), (13, 0.05244000065881918), (14, 0.09893773143340609), (15, 0.04078737614532067), (16, 0.11757726430276784), (17, 0.17438178713514216), (18, 0.10156507958998755), (19, 0.209512396606257), (20, 0.09696608998990563), (21, 0.024523540571880217), (22, 0.12966401666072794), (23, 0.08180537846078112), (24, 0.03563726564722526), (25, 0.11021948430467536), (26, 0.2495498456482098), (27, 9.460358508840665e-05), (28, 0.10745629673837069), (29, 0.07537813971938169), (30, 0.06671598768726121), (31, 0.06202615026926886), (32, 0.13627966869357622), (33, 0.1045507410155615), (34, 0.07651086978617286), (35, 0.17039187900871366), (36, 0.02490798437947303), (37, 0.00116419

In [13]:
model = LdaMulticore(corpus=corpus_tf,id2word = id2word, num_topics = 20,
                     alpha=.1, eta=0.1, random_state = 0)

coherence = CoherenceModel(model = model, texts = posts, dictionary = id2word, coherence = 'u_mass')

print(coherence.get_coherence())
print(model.show_topics())

-1.280233355152551
[(5, '0.015*"x" + 0.007*"lines" + 0.006*"organization" + 0.006*"one" + 0.006*"subject" + 0.005*"b" + 0.005*"would" + 0.004*"writes" + 0.004*"c" + 0.004*"article"'), (11, '0.007*"organization" + 0.006*"lines" + 0.006*"subject" + 0.006*"would" + 0.006*"article" + 0.005*"writes" + 0.004*"like" + 0.004*"one" + 0.004*"w" + 0.003*"x"'), (17, '0.006*"lines" + 0.006*"subject" + 0.005*"would" + 0.005*"organization" + 0.004*"x" + 0.004*"one" + 0.003*"god" + 0.003*"people" + 0.003*"article" + 0.003*"know"'), (12, '0.029*"ax" + 0.010*"q" + 0.009*"r" + 0.009*"g" + 0.008*"f" + 0.007*"p" + 0.007*"u" + 0.006*"v" + 0.006*"b" + 0.005*"w"'), (19, '0.017*"w" + 0.014*"x" + 0.007*"subject" + 0.005*"organization" + 0.005*"lines" + 0.004*"one" + 0.004*"writes" + 0.004*"people" + 0.004*"article" + 0.003*"u"'), (6, '0.008*"lines" + 0.008*"subject" + 0.007*"organization" + 0.006*"would" + 0.005*"one" + 0.005*"article" + 0.004*"writes" + 0.004*"get" + 0.004*"like" + 0.004*"university"'), (4, '0

In [24]:
import gensim
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

lda_display = gensimvis.prepare(model, corpus_tf, id2word, sort_topics = False)
pyLDAvis.display(lda_display)

  default_term_info = default_term_info.sort_values(


In [26]:
data_dict = {'dominant_topic':[], 'perc_contribution':[], 'topic_keywords':[]}

for i, row in enumerate(model[corpus_tf]):
    #print(i)
    row = sorted(row, key=lambda x: x[1], reverse=True)
    #print(row)
    for j, (topic_num, prop_topic) in enumerate(row):
        wp = model.show_topic(topic_num)
        topic_keywords = ", ".join([word for word, prop in wp])
        data_dict['dominant_topic'].append(int(topic_num))
        data_dict['perc_contribution'].append(round(prop_topic, 3))
        data_dict['topic_keywords'].append(topic_keywords)
        #print(topic_keywords)
        break

df_topics = pd.DataFrame(data_dict)

contents = pd.Series(posts)

df_topics.head()

Unnamed: 0,dominant_topic,perc_contribution,topic_keywords
0,9,0.856,"subject, lines, r, organization, article, e, o..."
1,11,0.935,"organization, lines, subject, would, article, ..."
2,14,0.74,"w, c, subject, lines, organization, x, article..."
3,18,0.742,"x, article, subject, organization, lines, c, w..."
4,13,0.55,"lines, subject, organization, writes, one, uni..."
