# Newsgroups Topic Modeling

The data set contains about 11,000 newsgroup articles categorized into 20 topics. We'll use a bag-of-words model and the Latent Dirichlet Allocation (LDA) algorithm (from the Python scikit-learn library) to predict the topic of a given article. LDA is an unsupervised machine learning method.

## Library Imports

In [16]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 100)

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set_style('darkgrid')

In [17]:
import spacy, re

In [18]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split

In [19]:
# display tools
import pyLDAvis
import pyLDAvis.sklearn

In [20]:
# seed for the random number generator;
# passed as random_state to train_test_split and LatentDirichletAllocation
SEED = 4321

### Basic Information

In [21]:
from sklearn.datasets import fetch_20newsgroups
# load the training subset of the 20 Newsgroups data set;
# the article texts are read from .data and the category names from .target_names below
newsgroups_train = fetch_20newsgroups(subset='train')


In [22]:
# names of the newsgroup categories
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

The dataset is a list of strings where each string is the text of a newsgroup article. 
Some topic categories are likely to have many words in common, e.g. talk.politics.mideast and talk.politics.misc. Such articles would be difficult to differentiate using the bag-of-words method.

In [23]:
# number of articles in the training subset
len(newsgroups_train.data)

11314

## Text Cleaning

In [24]:
# load the SpaCy English language model;
# clean_text() calls it to tokenize a document and reads each token's POS tag and lemma
nlp = spacy.load('en_core_web_sm')

In [25]:
# custom stop-word list; clean_text() drops every token whose lower-cased lemma is in it.
# Besides common English function words it holds newsgroup header terms
# ('organization', 'nntp', 'posting', 'host', 'subject', ...) and contraction fragments
# written with both straight and curly apostrophes.
# NOTE: 'therefore' is listed twice; this is harmless for membership tests.
my_stopwords = ['get', 'say', 'think', 'go', 'come', 'take', 'well', 'let', 'try', 'first', 'last',
                'organization', 'nntp', 'posting', 'host', 'keyword', 'http', 'fwd', 'cc', 'subject', 'not',
                'hereupon', 'something', 'mr', 'due', 'latter', 'though', 'their', '’re', 'specially', 'especially', 'probably', 
                'particularly', 'eventually',
                "'re", 'together', 'really', 'otherwise', 'then', 'n’t', 'mine', 'afterwards', "'ve", 'what', 'whole', 
                'enough', 'until', 'done', 'these', 'along', 'they', 'around', 'to', 'while', 'anyway', 'onto', 
                'almost', 'will', 'into', 'you', 'who', 'it', 'more', 'much', 'give', 'everything', 
                'same', 'hence', 'therefore', 'us', 'seem', 'am', 'the', 'yours', 'about', 'how', 'latterly', 'towards', 
                'others', 'or', 'part', 'other', "'m", 'few', 'beside', 'everyone', 'but', 'she', '‘re', 'myself', 
                'many', 'wherein', 'whenever', 'here', 'been', 'them', 'yourselves', 'itself', 'beyond', 'are', 
                'everywhere', 'in', 'may', '’ll', 'however', 'that', '’ve', 'should', 'can', 'elsewhere', 'via', '‘m', 
                'hereby', '‘ve', 'seeming', 'is', 'mostly', 'whence', 'only', 'least', 'although', 'whereas',
                'indeed', 'when', 'n‘t', 'did', 'even', 'those', 'of', 'have', 'always', 'both', 'among', 
                'had', "'d", 'being', 'we', 'your', 'whose', 'see', 'amongst', 'were', 'unless', 'also', 'anyhow', 
                'please', 'for', 'he', 'from', 'me', 'has', 'himself', 'per', 'yet', 'whither', 'seems', 'moreover', 
                're', 'anyone', 'ca', 'themselves', 'becoming', 'still', 'serious', 'thus', 'would', 'by', '’d', 'his',
                'therefore', 'rather', 'upon', 'somehow', 'thereupon', 'become', 'as', 'somewhere', '‘ll', 'whoever', 
                "'ll", 'five', 'there', 'any', 'toward', 'often', 'whereupon', 'within', '‘s', 'its', 'her',
                'a', '’m', 'else', 'several', 'our', 'thru', 'one', 'all', 'wherever', 'every', 'further',
                'regarding', 'whether', 'which', 'off', 'just', "'s", 'each', 'i', 'must', '’s', 'hereafter', 'be', 
                'another', 'very', 'doing', 'thereafter', 'yourself', 'herself', 'either', 'whereafter', 'some', 'was', 'do', 
                'using', 'now', 'ours', 'herein', 'nowhere', 'formerly', 'anywhere', 'does', 'someone', 
                'became', 'ever', 'various', 'sometimes', 'at', 'thence', 'nevertheless', 
                'meanwhile', 'my', 'with', '‘d', 'could', 'than', 'therein', 'namely', 'so', 'where', 
                'seemed', 'whatever', 'becomes', 'such', 'him', 'besides', 'if', 'throughout', 'whereby', 
                'during', 'already', 'again', 'thereby', 'why', 'most' , 'ourselves', 'on', 'quite', 'this', 'too', 
                'once', 'perhaps', 'through', 'noone', 'sometime', 'might', 'beforehand', 'nobody', 'and', 
                'across', 'anything', 'whom', 'hers', 'an', "n't"]

In [26]:
# tokenize, remove uninformative words, punctuation, and symbols, and lemmatize a string
# return a space-delimited string of clean tokens

# digits which are part of a proper noun are retained.
# e.g. "G8", "run4cure"

def clean_text(string, nlp_model=None, stopwords=None):
    """Tokenize, filter and lemmatize a text; return the clean tokens as one string.

    A token is kept only if its text is longer than one character and its
    part-of-speech tag is NOUN, PROPN, VERB, ADJ, ADV or PART. The token's
    lemma is lower-cased, dropped if it is a stop word, stripped of every
    character other than ASCII letters and digits, and finally dropped if
    nothing is left or if it consists of digits only.

    Parameters
    ----------
    string : str
        Raw document text.
    nlp_model : callable, optional
        Callable mapping a string to an iterable of tokens that expose
        ``text``, ``pos_`` and ``lemma_``. Defaults to the module-level ``nlp``.
    stopwords : iterable of str, optional
        Lower-cased lemmas to discard. Defaults to the module-level
        ``my_stopwords``.

    Returns
    -------
    str
        The clean tokens joined by single spaces ("" if none survive).
    """
    if nlp_model is None:
        nlp_model = nlp

    # a set gives constant-time membership tests inside the token loop
    stopword_set = frozenset(my_stopwords if stopwords is None else stopwords)

    # POS tags worth keeping; everything else (numbers/numerals, punctuation,
    # symbols, pronouns, auxiliary verbs, determiners, ...) is discarded
    kept_pos = {'NOUN', 'PROPN', 'VERB', 'ADJ', 'ADV', 'PART'}

    clean_tokens = []
    for token in nlp_model(string):
        if len(token.text) <= 1 or token.pos_ not in kept_pos:
            continue

        # convert token's lemma to lowercase
        lower_lemma = token.lemma_.lower()

        # remove stopwords
        if lower_lemma in stopword_set:
            continue

        # strip non-ascii characters, remaining punctuation and symbols from the lemma
        clean_token = re.sub(r"[^a-zA-Z0-9]", "", lower_lemma)

        # a lemma made only of stripped characters is now empty: skip it so the
        # output never contains blank tokens; also skip purely numeric tokens
        if clean_token and not clean_token.isnumeric():
            clean_tokens.append(clean_token)

    return " ".join(clean_tokens)

In [27]:
# remove documents containing uuencoded text (a binary-to-text encoding scheme) which produce junk tokens


# keep only the documents in which the substring "bmp" does not occur
corpus = [article for article in newsgroups_train.data if "bmp" not in article]

In [28]:
# number of documents left after dropping those that contain "bmp"
len(corpus)

11269

In [29]:
# strip e-mail addresses (anything shaped like name@domain) from every document;
# corpus remains a list of documents, each document being a string
_email_pattern = re.compile(r"([\w.-]+)@([\w.-]+)")
corpus = [_email_pattern.sub('', doc) for doc in corpus]

In [30]:
# run clean_text() over every document of the corpus;
# the result is a list of strings, each holding the document's clean tokens separated by spaces
clean_corpus = list(map(clean_text, corpus))

## Prepare Data

In [33]:
# hold out 25% of the cleaned documents as a test set; SEED fixes the shuffle
X_train, X_test = train_test_split(clean_corpus, test_size=0.25, random_state=SEED)

In [34]:
# number of training documents
print(len(X_train))

8451


In [35]:
# number of test documents
print(len(X_test))

2818


## Feature Extraction

In [36]:
# generate the bag-of-words model for the training data:
# each document is transformed into a vector of term counts
# (how many times each vocabulary term occurs in the document)
#   max_df=0.95     -> ignore terms that occur in more than 95% of the documents
#   min_df=2        -> ignore terms that occur in fewer than 2 documents
#   lowercase=False -> clean_text() has already lower-cased every token

vectorizer = CountVectorizer(max_df=0.95, min_df=2, lowercase=False)
X_train_vectorized = vectorizer.fit_transform(X_train)

In [37]:
# (number of training documents, vocabulary size)
X_train_vectorized.shape

(8451, 30604)

In [38]:
# vectorize the test documents with the vocabulary fitted on the training documents
X_test_vectorized = vectorizer.transform(X_test)

In [39]:
# (number of test documents, vocabulary size)
X_test_vectorized.shape

(2818, 30604)

## Model Selection

In [40]:
# LDA topic model with 10 topics, trained with the online learning method
# NOTE(review): the data set has 20 newsgroup categories but only 10 topics are requested here — confirm this is intended
lda = LatentDirichletAllocation(n_components=10, learning_decay=0.7, learning_method='online', random_state=SEED)

In [41]:
# fit the topic model on the training document-term matrix
lda.fit(X_train_vectorized)

LatentDirichletAllocation(learning_method='online', random_state=4321)

In [42]:
# approximate log-likelihood of the training documents under the fitted model
log_likelihood_train = lda.score(X_train_vectorized)
print(log_likelihood_train.round(3))

-8656405.948


In [43]:
# approximate log-likelihood of the held-out test documents under the fitted model
# NOTE(review): the score does not appear to be normalised by corpus size, so values for
# sets of different size are not directly comparable; lda.perplexity() may be a better comparison — confirm
log_likelihood_test = lda.score(X_test_vectorized)
print(log_likelihood_test.round(3))

-2727827.91


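The higher the log-likelihood, the better the model fits the data. Note that the training and test values above are computed on sets of different sizes, so they should not be compared with each other directly.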

In [44]:
def print_top_words(model, feature_names, n_top_words):
    """Print, for every topic, its highest-weighted terms on one line.

    Parameters
    ----------
    model : object with a ``components_`` attribute
        Each row of ``components_`` holds one topic's term weights.
    feature_names : sequence of str
        Term for each column index of ``components_``.
    n_top_words : int
        How many terms to list per topic, in descending order of weight.

    Prints one "Topic #<k>: term term ..." line per topic, followed by a blank line.
    """
    for topic_no, weights in enumerate(model.components_):
        # ascending argsort, reversed, then truncated: indices of the largest weights first
        top_indices = weights.argsort()[::-1][:n_top_words]
        top_terms = (feature_names[idx] for idx in top_indices)
        print(f"Topic #{topic_no}: " + " ".join(top_terms))
    print()


In [45]:
# get each topic's keywords

# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when it exists and fall back to the old method on older releases
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print_top_words(lda, feature_names, 20)

Topic #0: people write gun law make right article use believe mean know thing question point state way own good fact reason
Topic #1: ground wire msg homosexual gay man helmet article clayton outlet wiring neutral de sexual cramer research duo california write male
Topic #2: god jesus christian bible church christ christians write sin know faith man life word love believe day die hell book
Topic #3: game team year player play win big good write hockey season article league university nhl division goal score oilers time
Topic #4: file use window program windows version information dos image server run available mail internet email include write university user thank
Topic #5: people israel government turkish president armenian war armenians jews know israeli kill clinton right world state write year attack country
Topic #6: car article write bike distribution drive new university good look buy ride road sale usa engine price oil mile driver
Topic #7: write time make know use article wor

In [46]:
# Create Document-Topic matrix for the test set:
# one row per test document, one column per topic
doc_topic_matrix = lda.transform(X_test_vectorized)
doc_topic_matrix.shape

(2818, 10)

In [47]:
# column labels: "Topic0", "Topic1", ... one per topic of the model
topicnames = [f"Topic{topic_no}" for topic_no in range(lda.n_components)]

# row labels: "Doc0", "Doc1", ... one per test document
docnames = [f"Doc{doc_no}" for doc_no in range(len(X_test))]

In [48]:
# create the document-topic dataframe (weights rounded to 2 decimals for display)
df_document_topic = pd.DataFrame(np.round(doc_topic_matrix, 2), columns=topicnames, index=docnames)

# get dominant topic (with highest weight) for each document;
# take the argmax of the unrounded weights: after rounding, two topics can show the
# same value and argmax would then simply pick the one with the lower column index
dominant_topic = np.argmax(doc_topic_matrix, axis=1)
df_document_topic['dominant_topic'] = dominant_topic

In [49]:
# styling
def color_green(val):
    """Return the CSS colour rule for one cell: green when val > 0.1, black otherwise."""
    colour = 'black'
    if val > .1:
        colour = 'green'
    return f'color: {colour}'

def make_bold(val):
    """Return the CSS font-weight rule for one cell: 700 when val > 0.1, 400 otherwise."""
    weight = 400
    if val > .1:
        weight = 700
    return f'font-weight: {weight}'

# apply style to the topic-weight columns only: dominant_topic holds topic ids,
# not weights, so the 0.1 threshold must not be applied to it
# NOTE: pandas 2.1 renamed Styler.applymap to Styler.map; applymap is kept here
# so the cell also runs on older pandas releases
df_document_topics = (
    df_document_topic.head(10)
    .style
    .applymap(color_green, subset=topicnames)
    .applymap(make_bold, subset=topicnames)
)
df_document_topics

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,Topic7,Topic8,Topic9,dominant_topic
Doc0,0.27,0.0,0.48,0.0,0.01,0.03,0.0,0.13,0.03,0.04,2
Doc1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.55,0.38,0.06,7
Doc2,0.03,0.04,0.0,0.03,0.01,0.06,0.43,0.34,0.0,0.05,6
Doc3,0.21,0.0,0.0,0.77,0.0,0.0,0.0,0.0,0.0,0.0,3
Doc4,0.31,0.0,0.05,0.02,0.0,0.06,0.0,0.55,0.0,0.01,7
Doc5,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.17,0.76,0.0,8
Doc6,0.01,0.01,0.01,0.28,0.01,0.01,0.28,0.39,0.01,0.01,7
Doc7,0.24,0.0,0.09,0.26,0.0,0.0,0.18,0.2,0.0,0.0,3
Doc8,0.34,0.0,0.1,0.02,0.02,0.0,0.05,0.4,0.01,0.06,7
Doc9,0.64,0.0,0.07,0.06,0.0,0.05,0.0,0.16,0.0,0.0,0


In [50]:
# review topic distribution: number of test documents per dominant topic,
# ordered from the most to the least frequent topic

df_topic_distribution = (
    df_document_topic['dominant_topic']
    .value_counts()
    .rename_axis('topic_id')
    .reset_index(name='num_documents')
)
df_topic_distribution

Unnamed: 0,topic_id,num_documents
0,7,820
1,8,604
2,0,325
3,4,283
4,3,262
5,6,193
6,5,133
7,9,107
8,2,75
9,1,16


In [51]:
# render the interactive pyLDAvis view of the fitted model inside the notebook;
# it is prepared from the model, the test document-term matrix and the vectorizer, with mds='tsne'
# NOTE(review): recent pyLDAvis releases appear to have renamed pyLDAvis.sklearn to pyLDAvis.lda_model — confirm against the installed version
pyLDAvis.enable_notebook()
panel = pyLDAvis.sklearn.prepare(lda, X_test_vectorized, vectorizer, mds='tsne')
panel