# Unsupervised Topic Modeling with NMF

Goal: To discover 20 hidden topics from the "20 Newsgroups" dataset using TF-IDF and NMF.

In [1]:
# Step 1: Import Libraries & Load Data
# This cell only performs the imports; the dataset itself is fetched in the next cell.

# NOTE(review): `pd` and `shuffle` are not referenced in any of the cells visible
# in this notebook — confirm they are still needed before keeping or removing them.
import pandas as pd
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.utils import shuffle

# Simple confirmation that every import above resolved.
print("Libraries imported successfully.")

Libraries imported successfully.


In [2]:
# Fetch the 'train' split only; it is the corpus used for the unsupervised fit.
# remove=('headers', 'footers', 'quotes') strips everything except the post body.
data = fetch_20newsgroups(
    subset='train',
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=42,
)

# data.data holds the raw text of each post.
# data.target holds the "true" newsgroup labels, which are deliberately ignored for training.
documents = data.data

# Sanity check: report the corpus size, then show the first document.
print(f"Loaded {len(documents)} documents.")
print("\n--- Example Document ---", documents[0], sep="\n")

Loaded 11314 documents.

--- Example Document ---
I was wondering if anyone out there could enlighten me on this car I saw
the other day. It was a 2-door sports car, looked to be from the late 60s/
early 70s. It was called a Bricklin. The doors were really small. In addition,
the front bumper was separate from the rest of the body. This is 
all I know. If anyone can tellme a model name, engine specs, years
of production, where this car is made, history, or whatever info you
have on this funky looking car, please e-mail.


In [3]:
# Step 2: Vectorization (TF-IDF)

# Vocabulary filters applied while building the TF-IDF features:
#   stop_words='english' -> remove common English stop words
#   min_df=2             -> ignore words that appear in < 2 documents (too rare / typos)
#   max_df=0.95          -> ignore words that appear in > 95% of documents (too common)
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    min_df=2,
    max_df=0.95,
)

# Learn the vocabulary from the training documents and build the TF-IDF matrix in one call.
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# The words that survived the filters above (the vocabulary).
feature_names = tfidf_vectorizer.get_feature_names_out()

# Report the matrix dimensions and the vocabulary size.
print(
    f"TF-IDF matrix shape: {tfidf_matrix.shape}",
    f"Vocabulary size: {len(feature_names)}",
    sep="\n",
)

TF-IDF matrix shape: (11314, 39115)
Vocabulary size: 39115


In [4]:
# Step 3: Train the NMF Model
# ---
# We will ask NMF to find 20 topics (n_components=20).

# Define the number of topics
num_topics = 20

# Initialize NMF
# NOTE(review): an earlier comment here said "n_init='auto' helps find a stable
# solution", but no n_init argument is passed to NMF below, so nothing of the kind
# is in effect. Initialization is left at the estimator's default; if a specific
# strategy is wanted, it presumably has to be set through NMF's `init` argument —
# confirm against the installed scikit-learn version.
# random_state=42 ensures our results are reproducible
# max_iter=500 is the iteration limit handed to the solver
nmf_model = NMF(n_components=num_topics, random_state=42, max_iter=500)

# Fit the NMF model to our TF-IDF matrix
# This step can take a minute or two
print("Fitting NMF model... (This may take a moment)")
nmf_model.fit(tfidf_matrix)

print("NMF model fitted successfully.")

Fitting NMF model... (This may take a moment)
NMF model fitted successfully.


In [5]:
# Step 4: Interpret the Topics

def display_topics(model, feature_names, num_top_words):
    """
    Print the highest-weighted words of every topic in a fitted model.

    Args:
        model: Object exposing ``components_``; it is iterated row by row,
            one row of per-word weights for each topic.
        feature_names: Indexable collection mapping a column index to its word.
        num_top_words: How many of the heaviest words to print per topic.

    Returns:
        None. The topic header, the " | "-joined words and an 80-character
        dashed divider are written to standard output for each topic.
    """
    divider = "-" * 80
    for topic_number, weights in enumerate(model.components_, start=1):
        # argsort ranks ascending, so reverse it to put the heaviest words
        # first, then keep only the leading num_top_words positions.
        strongest = weights.argsort()[::-1][:num_top_words]

        print(f"Topic #{topic_number}:")
        print(" | ".join(feature_names[idx] for idx in strongest))
        print(divider)

# Show the 10 heaviest words for each of the fitted topics.
num_top_words = 10
print(f"Displaying top {num_top_words} words for each of the {num_topics} topics:\n")
display_topics(
    model=nmf_model,
    feature_names=feature_names,
    num_top_words=num_top_words,
)

Displaying top 10 words for each of the 20 topics:

Topic #1:
people | government | gun | right | law | guns | state | rights | think | make
--------------------------------------------------------------------------------
Topic #2:
file | files | program | ftp | directory | image | format | zip | gif | bmp
--------------------------------------------------------------------------------
Topic #3:
god | jesus | bible | believe | faith | christ | christian | christians | church | life
--------------------------------------------------------------------------------
Topic #4:
geb | dsl | n3jxp | chastity | cadre | pitt | shameful | intellect | skepticism | surrender
--------------------------------------------------------------------------------
Topic #5:
key | chip | encryption | clipper | keys | escrow | algorithm | government | use | secure
--------------------------------------------------------------------------------
Topic #6:
drive | disk | hard | drives | floppy | boot | cd | ide | 

In [6]:
# Step 5: Test the Model on New Documents

# Fetch the 'test' split of the same dataset, stripped the same way as the training data.
test_data = fetch_20newsgroups(subset='test', remove=('headers', 'footers', 'quotes'), shuffle=True, random_state=42)
test_documents = test_data.data

# Pick one fixed document (index 10 of the seeded shuffle) so the example is reproducible.
test_doc_text = test_documents[10]
print(f"--- Test Document --- \n{test_doc_text}\n" + "-"*80)

# 1. Transform the document using the *same* TF-IDF vectorizer
test_doc_tfidf = tfidf_vectorizer.transform([test_doc_text])

# 2. Transform the TF-IDF vector using the *fitted* NMF model
# The result has one row per input document; each row holds one weight per topic.
topic_weights = nmf_model.transform(test_doc_tfidf)

# 3. Get the most likely topic
# Select the single document's row explicitly: a bare argmax() on the 2-D result
# flattens it, which is only correct while exactly one document is transformed.
doc_topic_weights = topic_weights[0]

if doc_topic_weights.any():
    # argmax gives the index of the highest weight; it is printed 1-based to
    # match the "Topic #N" numbering used when the topics were displayed.
    dominant_topic_idx = doc_topic_weights.argmax()
    print(f"Predicted dominant topic index: {dominant_topic_idx + 1}")

    # 4. Display the top words for that topic to confirm
    print("\n--- Top Words for Predicted Topic ---")
    topic_words = [feature_names[i] for i in nmf_model.components_[dominant_topic_idx].argsort()[::-1][:10]]
    print(" | ".join(topic_words))
else:
    # Every weight is zero (e.g. the document is empty or shares no words with
    # the vocabulary). argmax would still return 0 and wrongly report Topic 1,
    # so report the absence of a dominant topic instead.
    dominant_topic_idx = None
    topic_words = []
    print("No dominant topic: the document has zero weight on every topic.")

--- Test Document --- 
I have uploaded the Windows On-Line Review shareware edition to
ftp.cica.indiana.edu as /pub/pc/win3/uploads/wolrs7.zip.

It is an on-line magazine which contains reviews of some shareware
products...I grabbed it from the Windows On-Line BBS.

--
--------------------------------------------------------------------------------
Predicted dominant topic index: 12

--- Top Words for Predicted Topic ---
windows | dos | ms | version | running | os | microsoft | nt | drivers | driver
