# Exercise 1 - Topic Modeling

In this notebook, we will apply our understanding of topic modeling techniques like LDA and NMF

__Fill in the sections marked with `<YOUR CODE HERE>`__

## Import Libraries

In [1]:
import nltk
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import gensim

import re
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import NMF

In [2]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\xeroj\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\xeroj\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\xeroj\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
pd.set_option('display.max_colwidth', -1)

  pd.set_option('display.max_colwidth', -1)


## Get Dataset

For this assignment, we will use the __20 Newsgroup__ dataset. This dataset contains ~11k news articles spread across 20 news categories. The ``sklearn`` library provides an easy to use interface to get this dataset

In [5]:
newsgroups_train = fetch_20newsgroups(subset='train')

In [6]:
# view the news categories
newsgroups_train.target_names

['alt.atheism',
 'comp.graphics',
 'comp.os.ms-windows.misc',
 'comp.sys.ibm.pc.hardware',
 'comp.sys.mac.hardware',
 'comp.windows.x',
 'misc.forsale',
 'rec.autos',
 'rec.motorcycles',
 'rec.sport.baseball',
 'rec.sport.hockey',
 'sci.crypt',
 'sci.electronics',
 'sci.med',
 'sci.space',
 'soc.religion.christian',
 'talk.politics.guns',
 'talk.politics.mideast',
 'talk.politics.misc',
 'talk.religion.misc']

## Pre-process Text

## Question 1: Complete Regex to remove emails (1 point)

In [7]:
# Convert to list
data = newsgroups_train.data

# Remove Emails
data = [re.sub('\S*@\S*\s?', '', sent) for sent in data]

# Remove extra spaces \ new lines
data = [re.sub('\s+', ' ', sent) for sent in data]

# Remove distracting single quotes
data = [re.sub("\'", "", sent) for sent in data]

print(data[:1])

['From: (wheres my thing) Subject: WHAT car is this!? Nntp-Posting-Host: rac3.wam.umd.edu Organization: University of Maryland, College Park Lines: 15 I was wondering if anyone out there could enlighten me on this car I saw the other day. It was a 2-door sports car, looked to be from the late 60s/ early 70s. It was called a Bricklin. The doors were really small. In addition, the front bumper was separate from the rest of the body. This is all I know. If anyone can tellme a model name, engine specs, years of production, where this car is made, history, or whatever info you have on this funky looking car, please e-mail. Thanks, - IL ---- brought to you by your neighborhood Lerxst ---- ']


In [8]:
stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

## Question 2: Complete the `normalize_corpus` function (2 points)

__Note:__ Remove tokens with length 2 or more (as compared to 1 or more in Tutorial 1)

__Hint:__ The `normalize_corpus()` function in Tutorial 1 will come in handy here

In [9]:
def normalize_corpus(news_articles):
    norm_articles = []
    for article in tqdm(news_articles):
        article = article.lower()
        article_tokens = [token.strip() for token in wtk.tokenize(article)]
        article_tokens = [wnl.lemmatize(token) for token in article_tokens if not token.isnumeric()]
        article_tokens = [token for token in article_tokens if len(token) > 1]
        article_tokens = [token for token in article_tokens if token not in stop_words]
        article_tokens = list(filter(None, article_tokens))
        if article_tokens:
            norm_articles.append(article_tokens)
    return norm_articles

In [10]:
%%time

norm_data = normalize_corpus(data)
print(len(norm_data))

100%|██████████| 11314/11314 [00:32<00:00, 343.72it/s]

11314
Wall time: 32.9 s





# Topic Modeling with LDA

## Feature Engineering: Bi-Grams

## Question 3: Fill up the necessary code snippets to create a Bi-gram Bag of Words Model (1 point)

#### Build the bi-gram phrase model

__Note:__ Use `min_count` and `threshold` parameters similar to the tutorial 

In [11]:
bigram = gensim.models.Phrases(norm_data, 
                               min_count=20, 
                               threshold=20, 
                               delimiter=b'_')
bigram_model = gensim.models.phrases.Phraser(bigram)

print(bigram_model[norm_data[0]][:50])

['wheres', 'thing', 'subject', 'car', 'nntp_posting', 'host', 'rac3', 'wam', 'umd_edu', 'organization_university', 'maryland_college', 'park', 'line', 'wa_wondering', 'anyone', 'could', 'enlighten', 'car', 'saw', 'day', 'wa', 'door', 'sport', 'car', 'looked', 'late', '60', 'early', '70', 'wa', 'called', 'bricklin', 'door', 'really', 'small', 'addition', 'front', 'bumper', 'wa', 'separate', 'rest', 'body', 'know', 'anyone', 'tellme', 'model', 'name', 'engine', 'spec', 'year']


In [12]:
norm_corpus_bigrams = [bigram_model[doc] for doc in norm_data]

#### Generate the dictionary

In [13]:
# Create a dictionary representation of the documents.
dictionary = gensim.corpora.Dictionary(norm_corpus_bigrams)
print('Sample word to number mappings:', list(dictionary.items())[:15])
print('Total Vocabulary Size:', len(dictionary))

Sample word to number mappings: [(0, '60'), (1, '70'), (2, 'addition'), (3, 'anyone'), (4, 'body'), (5, 'bricklin'), (6, 'brought'), (7, 'bumper'), (8, 'called'), (9, 'car'), (10, 'could'), (11, 'day'), (12, 'door'), (13, 'early'), (14, 'engine')]
Total Vocabulary Size: 94305


#### Remove unnecessary terms

__Note:__ Use `no_below` and `no_above` parameters similar to the tutorial 

In [14]:
# Filter out words that occur less than 20 documents, 
# or more than 60% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.6)
print('Total Vocabulary Size:', len(dictionary))

Total Vocabulary Size: 7989


#### Create the Bag of Words model

In [15]:
# Transforming corpus into bag of words vectors
bow_corpus = [dictionary.doc2bow(text) for text in norm_corpus_bigrams]

In [16]:
# view sample transformation
print(bow_corpus[1][:50])

[(10, 2), (31, 1), (42, 1), (50, 1), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 2), (59, 1), (60, 1), (61, 5), (62, 1), (63, 1), (64, 1), (65, 1), (66, 2), (67, 1), (68, 2), (69, 1), (70, 1), (71, 1), (72, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 1), (78, 1), (79, 1), (80, 1), (81, 1), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 3), (88, 1), (89, 1), (90, 1), (91, 1), (92, 1), (93, 1), (94, 4), (95, 1), (96, 1)]


## Topic Modeling using LDA

### LDA using ``MALLET``
The MALLET framework is a Java-based package for statistical natural language processing, document classification, clustering, topic modeling, information extraction, and other machine learning applications to text. MALLET stands for __MA__chine __L__earning for __L__anguag __E__ __T__oolkit. It was developed by Andrew McCallum along with several people at the University of Massachusetts Amherst. The MALLET topic modeling toolkit contains efficient, sampling-based implementations of Latent Dirichlet Allocation, Pachinko Allocation, and Hierarchical LDA. To use MALLET’s capabilities, we need to download the framework.

In [33]:
!wget http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip

--2021-02-06 00:35:47--  http://mallet.cs.umass.edu/dist/mallet-2.0.8.zip
Resolving mallet.cs.umass.edu (mallet.cs.umass.edu)... 128.119.246.70
Connecting to mallet.cs.umass.edu (mallet.cs.umass.edu)|128.119.246.70|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16184794 (15M) [application/zip]
Saving to: 'mallet-2.0.8.zip.4'

     0K .......... .......... .......... .......... ..........  0%  554K 28s
    50K .......... .......... .......... .......... ..........  0% 1.15M 21s
   100K .......... .......... .......... .......... ..........  0% 3.96M 15s
   150K .......... .......... .......... .......... ..........  1% 4.89M 12s
   200K .......... .......... .......... .......... ..........  1% 1.48M 12s
   250K .......... .......... .......... .......... ..........  1% 4.69M 10s
   300K .......... .......... .......... .......... ..........  2% 1.76M 10s
   350K .......... .......... .......... .......... ..........  2% 2.52M 9s
   400K .......... .......... .

  6800K .......... .......... .......... .......... .......... 43% 11.5M 1s
  6850K .......... .......... .......... .......... .......... 43% 11.6M 1s
  6900K .......... .......... .......... .......... .......... 43% 10.4M 1s
  6950K .......... .......... .......... .......... .......... 44% 7.78M 1s
  7000K .......... .......... .......... .......... .......... 44% 13.9M 1s
  7050K .......... .......... .......... .......... .......... 44% 15.3M 1s
  7100K .......... .......... .......... .......... .......... 45% 10.4M 1s
  7150K .......... .......... .......... .......... .......... 45% 26.6M 1s
  7200K .......... .......... .......... .......... .......... 45% 12.2M 1s
  7250K .......... .......... .......... .......... .......... 46% 39.6M 1s
  7300K .......... .......... .......... .......... .......... 46% 26.8M 1s
  7350K .......... .......... .......... .......... .......... 46%  112M 1s
  7400K .......... .......... .......... .......... .......... 47% 30.9M 1s
  7450K ....

In [25]:
with ZipFile('mallet-2.0.8.zip', 'r') as zipObj:
      zipObj.extractall('mallet')

## Question 4: Build an LDA topic model with MALLET (1 point)

__Hint:__ Refer to the tutorial and use a similar configuration for the model settings (hyperparameters). __Also set the total topics to be 20__

In [17]:
%%time
TOTAL_TOPICS = 20
lda_model = gensim.models.LdaModel(corpus=bow_corpus, 
                                   id2word=dictionary, 
                                   chunksize=1740, 
                                   alpha='auto', 
                                   eta='auto', 
                                   random_state=42,
                                   iterations=500, 
                                   num_topics=TOTAL_TOPICS, 
                                   passes=20, 
                                   eval_every=None)

Wall time: 4min 23s


In [26]:
%%time

MALLET_PATH = 'mallet-2.0.8/bin/mallet'
lda_mallet = gensim.models.wrappers.LdaMallet(mallet_path=MALLET_PATH, 
                                              corpus=bow_corpus, 
                                              num_topics=TOTAL_TOPICS, 
                                              id2word=dictionary,
                                              iterations=500, 
                                              workers=4)

CalledProcessError: Command 'mallet-2.0.8/bin/mallet import-file --preserve-case --keep-sequence --remove-stopwords --token-regex "\S+" --input C:\Users\xeroj\AppData\Local\Temp\c9517b_corpus.txt --output C:\Users\xeroj\AppData\Local\Temp\c9517b_corpus.mallet' returned non-zero exit status 1.

__The model may take some time to run depending on your system config__

## Question 5: View Topics (1 point)

__Hint:__ The _View Topics_ section in Tutorial 1 might be useful here

In [None]:
topics = [[(term, round(wt, 3)) 
               for term, wt in lda_mallet.show_topic(n, topn=20)] 
                   for n in range(0, lda_mallet.num_topics)]
topics_df = pd.DataFrame([', '.join([term for term, wt in topic])  
                              for topic in topics],
                         columns = ['Terms per Topic'],
                         index=['Topic'+str(t) for t in range(1, lda_mallet.num_topics+1)]
                         )

topics_df

## Question 6: Evaluate Model Performance (1 point)

__Note:__ print the Cv and UMass coherence scores

In [None]:
cv_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus, 
                                                      texts=norm_corpus_bigrams,
                                                      dictionary=dictionary, 
                                                      coherence='c_v')

avg_coherence_cv = cv_coherence_model_lda.get_coherence()

In [None]:
umass_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus, 
                                                         texts=norm_corpus_bigrams,
                                                         dictionary=dictionary, 
                                                         coherence='u_mass')

avg_coherence_umass = umass_coherence_model_lda.get_coherence()

In [None]:
print('Avg. Coherence Score (Cv):', avg_coherence_cv)
print('Avg. Coherence Score (UMass):', avg_coherence_umass)

## Inference on documents

Here we will try to take some documents and predict \ infer their topics using our trained LDA model. Do note you can use any new documents also in this scenario but you would need to transform them into relevant bag of words vectors before predictions

#### Create a sample dataset of 3 documents

In [None]:
sample_docs = [' '.join(doc) for doc in norm_data[5:8]]
sample_docs

#### Check their class labels

Since this is actually a labeled dataset we can see the actual class \ category labels of these news posts

In [None]:
print(np.array(newsgroups_train.target_names)[newsgroups_train.target[5:8]])

## Question 7: Pre-process documents (1 point)

__Note:__ You can refer to Tutorial 1 or even refer to the steps above (before building them model)

1. Tokenize the sample documents to get list of words per document (string splitting is useful here)

2. Get bigram phrases for each tokenized document using `bigram_model`

3. Use the `dictionary` built previously in the above section to get the BOW vectors using `gensim`

In [None]:
# 1. Tokenize documents
tokenized_norm_docs = [<YOUR CODE HERE> for doc in sample_docs]

# 2. Bi-gram phrases for tokenized documents
bigram_data = [<YOUR CODE HERE> for doc in tokenized_norm_docs]

# 3. BOW vectors for each document
bow_vectorized_features = [<YOUR CODE HERE> for text in bigram_data]

## Question 8: Inference with trained topic model (1 point)

__Note:__ Use the trained `lda_mallet` model from above to predict and get the top (most dominant) topic per document. Remember to refer to the __Interpret Results__ section in Tutorial 1 if needed.

In [None]:
predicted_topics = <YOUR CODE HERE>
top_topics = <YOUR CODE HERE>
                     
final_topics = [(topic+1, weight) for topic, weight in top_topics]

In [None]:
print(final_topics)

In [None]:
[topics_df.loc['Topic'+str(topic_id)]['Terms per Topic'] 
    for topic_id, weight in final_topics]

# Topic Modeling using NMF

## Get list of documents

In [None]:
norm_docs = [' '.join(tokenized_doc) for tokenized_doc in norm_data]

## Question 9: Generate Bag of Words features (1 point)

__Note:__

1. Use `CountVectorizer` 
2. Set `min_df` as 20 and `max_df` as 0.6
3. Use both 1 and 2-grams

In [None]:
cv = <YOUR CODE HERE>
cv_features = <YOUR CODE HERE>

cv_features.shape

In [None]:
vocabulary = np.array(cv.get_feature_names())
print('Total Vocabulary Size:', len(vocabulary))

## Question 10: Train NMF Topic Model (1 point)

__Note:__ You can use a similar config as Tutorial 2

In [None]:
%%time 

nmf_model = <YOUR CODE HERE>
document_topics = <YOUR CODE HERE>

## Question 11: Display Topics and their Terms (2 points)

__Note:__ We have done a similar exercise in Tutorial 2

In [None]:
<YOUR CODE HERE>