In [1]:
# Import libraries
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import re
import nltk
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.decomposition import NMF

In [2]:
# Load the 20newsgroup dataset:
df = sklearn.datasets.fetch_20newsgroups(subset='all', return_X_y=True)

In [3]:
# Put the dataset into a dataframe
df_news = pd.DataFrame(df).T

# Add column name
df_news.columns = ['text', 'target']

In [4]:
df_news

Unnamed: 0,text,target
0,From: Mamatha Devineni Ratnam <mr47+@andrew.cm...,10
1,From: mblawson@midway.ecn.uoknor.edu (Matthew ...,3
2,From: hilmi-er@dsv.su.se (Hilmi Eren)\nSubject...,17
3,From: guyd@austin.ibm.com (Guy Dawson)\nSubjec...,3
4,From: Alexander Samuel McDiarmid <am2o+@andrew...,4
...,...,...
18841,From: jim.zisfein@factory.com (Jim Zisfein) \n...,13
18842,From: rdell@cbnewsf.cb.att.com (richard.b.dell...,12
18843,From: westes@netcom.com (Will Estes)\nSubject:...,3
18844,From: steve@hcrlgw (Steven Collins)\nSubject: ...,1


In [5]:
df_news['text'][0]

"From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>\nSubject: Pens fans reactions\nOrganization: Post Office, Carnegie Mellon, Pittsburgh, PA\nLines: 12\nNNTP-Posting-Host: po4.andrew.cmu.edu\n\n\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

In [6]:
# Utility function to clean text:

def text_cleaner(text):
# Newsgroup posters have fancy ASCII signatures (ahh, the late 80s/early 90s).
# Obliterate all of the repetitive punctuation that typifies them.
    text = re.sub(r'[-+=~_/\\|\^%:\.]{2,}',' ',text)
# Remove From lines, and the word "Subject: " when it is at the beginning of a line
    text = re.sub(r'^From.*','',text)
    text = re.sub(r'Subject:\s*',' ',text)
# Remove "\n" from text:
    text = re.sub(r'\\n',' ',text)
# Remove "words" that are nothing but spaces and digits, possibly with embedded parenthesis and dashes 
    text = re.sub(r'\s*[\(\)0-9]+\s*', '', text,flags=re.M)
# (This was a poorly-written rule that ended up eating strings of digits that provided useful information/context)

    text = " ".join(text.split())
    return text

In [7]:
df_news['cleaned_text'] = df_news['text'].apply(lambda x: text_cleaner(x))

In [8]:
df_news['text'][0]

"From: Mamatha Devineni Ratnam <mr47+@andrew.cmu.edu>\nSubject: Pens fans reactions\nOrganization: Post Office, Carnegie Mellon, Pittsburgh, PA\nLines: 12\nNNTP-Posting-Host: po4.andrew.cmu.edu\n\n\n\nI am sure some bashers of Pens fans are pretty confused about the lack\nof any kind of posts about the recent Pens massacre of the Devils. Actually,\nI am  bit puzzled too and a bit relieved. However, I am going to put an end\nto non-PIttsburghers' relief with a bit of praise for the Pens. Man, they\nare killing those Devils worse than I thought. Jagr just showed you why\nhe is much better than his regular season stats. He is also a lot\nfo fun to watch in the playoffs. Bowman should let JAgr have a lot of\nfun in the next couple of games since the Pens are going to beat the pulp out of Jersey anyway. I was very disappointed not to see the Islanders lose the final\nregular season game.          PENS RULE!!!\n\n"

In [9]:
# Creating the tf-idf matrix.
vectorizer = TfidfVectorizer(stop_words='english')
df_tfidf=vectorizer.fit_transform(df_news['cleaned_text'])

# Getting the word list.
terms = vectorizer.get_feature_names()

In [10]:
# Number of topics.
ntopics=20

# Linking words to topics
def word_topic(tfidf,solution, wordlist):
    
    # Loading scores for each word on each topic/component.
    words_by_topic=tfidf.T * solution

    # Linking the loadings to the words in an easy-to-read way.
    components=pd.DataFrame(words_by_topic,index=wordlist)
    
    return components

# Extracts the top N words and their loadings for each topic.
def top_words(components, n_top_words):
    n_topics = range(components.shape[1])
    index= np.repeat(n_topics, n_top_words, axis=0)
    topwords=pd.Series(index=index)
    for column in range(components.shape[1]):
        # Sort the column so that highest loadings are at the top.
        sortedwords=components.iloc[:,column].sort_values(ascending=False)
        # Choose the N highest loadings.
        chosen=sortedwords[:n_top_words]
        # Combine loading and index into a string.
        chosenlist=chosen.index +" "+round(chosen,2).map(str) 
        topwords.loc[column]=[x for x in chosenlist]
    return(topwords)

# Number of words to look at for each topic.
n_top_words = 10

In [11]:
# LSA

from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

svd= TruncatedSVD(ntopics)
lsa = make_pipeline(svd, Normalizer(copy=False))
df_lsa = lsa.fit_transform(df_tfidf)

components_lsa = word_topic(df_tfidf, df_lsa, terms)

topwords=pd.DataFrame()
topwords['LSA']=top_words(components_lsa, n_top_words)                

In [12]:
# LDA
from sklearn.decomposition import LatentDirichletAllocation as LDA

lda = LDA(n_components=ntopics, 
          doc_topic_prior=None, # Prior = 1/n_documents
          topic_word_prior=1/ntopics,
          learning_decay=0.7, # Convergence rate.
          learning_offset=10.0, # Causes earlier iterations to have less influence on the learning
          max_iter=10, # when to stop even if the model is not converging (to prevent running forever)
          evaluate_every=-1, # Do not evaluate perplexity, as it slows training time.
          mean_change_tol=0.001, # Stop updating the document topic distribution in the E-step when mean change is < tol
          max_doc_update_iter=100, # When to stop updating the document topic distribution in the E-step even if tol is not reached
          n_jobs=-1, # Use all available CPUs to speed up processing time.
          verbose=0, # amount of output to give while iterating
          random_state=0
         )

df_lda = lda.fit_transform(df_tfidf) 

components_lda = word_topic(df_tfidf, df_lda, terms)

topwords['LDA']=top_words(components_lda, n_top_words)

In [13]:
# NNMF

from sklearn.decomposition import NMF

nmf = NMF(alpha=0.0, 
          init='nndsvdar', # how starting value are calculated
          l1_ratio=0.0, # Sets whether regularization is L2 (0), L1 (1), or a combination (values between 0 and 1)
          max_iter=200, # when to sto p even if the model is not converging (to prevent running forever)
          n_components=ntopics, 
          random_state=0, 
          solver='cd', # Use Coordinate Descent to solve
          tol=0.0001, # model will stop if tfidf-WH <= tol
          verbose=0 # amount of output to give while iterating
         )
df_nmf = nmf.fit_transform(df_tfidf) 

components_nmf = word_topic(df_tfidf, df_nmf, terms)

topwords['NNMF']=top_words(components_nmf, n_top_words)

In [14]:
for topic in range(ntopics):
    print('Topic {}:'.format(topic))
    print(topwords.loc[topic])

Topic 0:
                   LSA                LDA          NNMF
0           edu 253.12           edu 8.23      car 3.71
0         lines 175.56           com 5.97     like 1.98
0  organization 174.23         lines 5.69     just 1.96
0           com 163.86  organization 5.68      edu 1.94
0       article 158.51       article 5.41      don 1.91
0    university 155.96    university 4.81     bike 1.78
0           don 143.14       posting 4.51      com 1.72
0        posting 141.3           don 4.49     good 1.69
0          just 138.47          host 4.35  article 1.68
0          like 137.68          know 4.22     know 1.56
Topic 1:
                LSA                LDA             NNMF
1         god 77.71           edu 6.13         god 12.6
1      people 40.68         lines 4.33       jesus 5.02
1       jesus 35.34  organization 4.18       bible 3.24
1   christian 25.28           com 3.78      people 3.03
1       bible 24.03    university 3.54     believe 2.75
1      believe 22.9       post