# 6.4 LDA

In [1]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import gensim
import gensim.corpora as corpora

### Load Data

In [2]:
# Load dataset from CSV file (the data should be in the same folder as your notebook)
data = pd.read_csv("news_articles.csv")

In [3]:
data.head()

Unnamed: 0,id,title,content
0,25626,"One Weight-Loss Approach Fits All? No, Not Eve...","Dr. Frank Sacks, a professor of nutrition at H..."
1,19551,South Carolina Stuns Baylor to Reach the Round...,South Carolina’s win over Duke was not only ...
2,25221,"U.S. Presidential Race, Apple, Gene Wilder: Yo...",(Want to get this briefing by email? Here’s th...
3,18026,"His Predecessor Gone, Gambia’s New President F...","BANJUL, Gambia — A week after he was inaugu..."
4,21063,‘Harry Potter and the Cursed Child’ Goes From ...,The biggest book of the summer isn’t a blockbu...


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       100 non-null    int64 
 1   title    100 non-null    object
 2   content  100 non-null    object
dtypes: int64(1), object(2)
memory usage: 2.5+ KB


### Clean Data

In [None]:
# Take just the content of the article, lowercase and remove punctuation
articles = data['content'].str.lower().apply(lambda x: re.sub(r"([^\w\s])", "", x))

# Stop word removal
en_stopwords = stopwords.words('english')
articles = articles.apply(lambda x: ' '.join([word for word in x.split() if word not in (en_stopwords)]))

# Tokenize
articles = articles.apply(lambda x: word_tokenize(x))

# Stemming (done for speed as we have a lot of text)
# Lemmatization would be the better choice if we have more time
ps = PorterStemmer()
articles = articles.apply(lambda tokens: [ps.stem(token) for token in tokens])

In [6]:
articles

0     [dr, frank, sack, professor, nutrit, harvard, ...
1     [south, carolina, win, duke, surpris, fan, pos...
2     [want, get, brief, email, here, good, even, he...
3     [banjul, gambia, week, inaugur, anoth, countri...
4     [biggest, book, summer, isnt, blockbust, thril...
                            ...                        
95    [want, get, brief, email, here, good, even, he...
96    [tallinn, estonia, guard, brought, ahm, abdul,...
97    [gov, scott, walker, wisconsin, activ, wiscons...
98    [social, media, shook, emot, headlin, shout, n...
99    [moment, joanna, acevedo, first, set, foot, bo...
Name: content, Length: 100, dtype: object

### Vectorization

In [7]:
# Create dictionary of all words
dictionary = corpora.Dictionary(articles)
print(dictionary)

Dictionary<8693 unique tokens: ['10', '100', '108', '15', '155']...>


In [8]:
# Vectorize using bag of words into a document term matrix
doc_term = [dictionary.doc2bow(text) for text in articles]
print(doc_term)

[[(0, 1), (1, 1), (2, 1), (3, 2), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 2), (15, 1), (16, 1), (17, 2), (18, 1), (19, 1), (20, 3), (21, 3), (22, 1), (23, 3), (24, 2), (25, 4), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 2), (38, 1), (39, 1), (40, 1), (41, 1), (42, 1), (43, 1), (44, 1), (45, 1), (46, 1), (47, 2), (48, 2), (49, 1), (50, 2), (51, 2), (52, 1), (53, 1), (54, 2), (55, 1), (56, 2), (57, 6), (58, 1), (59, 1), (60, 1), (61, 4), (62, 2), (63, 1), (64, 1), (65, 2), (66, 1), (67, 1), (68, 1), (69, 1), (70, 5), (71, 4), (72, 1), (73, 1), (74, 1), (75, 2), (76, 2), (77, 1), (78, 2), (79, 2), (80, 1), (81, 1), (82, 1), (83, 4), (84, 2), (85, 1), (86, 1), (87, 3), (88, 1), (89, 3), (90, 1), (91, 2), (92, 3), (93, 6), (94, 1), (95, 1), (96, 1), (97, 1), (98, 1), (99, 1), (100, 1), (101, 1), (102, 1), (103, 2), (104, 5), (105, 1), (106, 2), (107, 1), (108, 1), (109, 3), (110, 4)

### LDA

In [9]:
# Train LDA model with 2 topics
num_topics = 2

In [None]:
# Create LDA model
lda_model = gensim.models.LdaModel(corpus=doc_term,
                                   id2word=dictionary,
                                   num_topics=num_topics)

In [11]:
# Print the top 5 words for each topic
lda_model.print_topics(num_topics=num_topics, num_words=5)

[(0,
  '0.018*"said" + 0.016*"mr" + 0.005*"would" + 0.004*"like" + 0.004*"year"'),
 (1,
  '0.018*"mr" + 0.011*"said" + 0.008*"trump" + 0.005*"state" + 0.004*"one"')]

## Another Example

In [12]:
sample = ["The cat sat on the mat.", "Dogs are playing in the garden."]

# Preprocessing
sample = [re.sub(r"([^\w\s])", "", doc.lower()) for doc in sample]
sample = [' '.join([w for w in doc.split() if w not in en_stopwords]) for doc in sample]
sample = [word_tokenize(doc) for doc in sample]
sample = [[ps.stem(word) for word in doc] for doc in sample]

# Vectorize
sample_dict = corpora.Dictionary(sample)
sample_bow = [sample_dict.doc2bow(text) for text in sample]

# LDA
sample_lda = gensim.models.LdaModel(sample_bow, id2word=sample_dict, num_topics=2)
print(sample_lda.print_topics(num_words=3))

[(0, '0.214*"dog" + 0.211*"play" + 0.203*"garden"'), (1, '0.218*"sat" + 0.218*"mat" + 0.212*"cat"')]


## What I Learned

- corpora.Dictionary builds a mapping of words → ids needed for BoW

- doc2bow turns tokenized text into a vector of word frequencies.

- **LDA (Latent Dirichlet Allocation)** is an unsupervised learning method to extract hidden (latent) topics from a corpus.

- Each topic is represented as a list of weighted words, and each document is a mixture of topics.