In [10]:
import os

import nltk
import numpy as np
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from py_stm.stm import StmModel
from sklearn.datasets import fetch_20newsgroups
from sklearn.model_selection import train_test_split

# Load the full 20 Newsgroups dataset (subset='all') with headers, footers
# and quotes removed from every post.
newsgroups = fetch_20newsgroups(
    subset='all',
    remove=('headers', 'footers', 'quotes'),
)

In [11]:
# Report the corpus size, then the number of documents under each integer label.
print(f"There are {len(newsgroups.data)} many documents in the newsgroups dataset")

# Plain string literal: nothing is interpolated here, so no f-prefix is needed.
print("Category breakdown (category : num_documents in category):")
# np.bincount yields counts indexed by label value, so enumerate pairs
# each integer category with its document count.
for category, count in enumerate(np.bincount(newsgroups.target)):
    print(f"{category} : {count}")

There are 18846 many documents in the newsgroups dataset
Category breakdown (category : num_documents in category):
0 : 799
1 : 973
2 : 985
3 : 982
4 : 963
5 : 988
6 : 975
7 : 990
8 : 996
9 : 994
10 : 999
11 : 991
12 : 984
13 : 990
14 : 987
15 : 997
16 : 910
17 : 940
18 : 775
19 : 628


In [12]:
# Fetch the NLTK stop-word corpus needed by the preprocessing step.
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tylerholston/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [19]:
# Stop-word sets keyed by language. Filled on first use so the NLTK word list
# is read once rather than once per document.
_STOP_WORDS_CACHE = {}


def _get_stop_words(language='english'):
    """Return the NLTK stop-word list for ``language`` as a cached frozenset."""
    if language not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORDS_CACHE[language]


def preprocess_text(text):
    """Tokenise one raw document and drop English stop words.

    Args:
        text: The raw document string.

    Returns:
        A list of tokens produced by ``simple_preprocess`` that are
        alphanumeric and are not in the NLTK English stop-word list.
    """
    # deacc=True strips accent marks from tokens (it is not what removes
    # punctuation); min_len=3 discards tokens shorter than three characters.
    tokens = simple_preprocess(text, deacc=True, min_len=3)
    stop_words = _get_stop_words()
    return [token for token in tokens if token.isalnum() and token not in stop_words]

In [20]:
# Split the raw documents now so that the held-out part can be used to
# demonstrate model save/load functionality later.
train_text, test_text = train_test_split(
    newsgroups.data,
    test_size=0.2,
    random_state=42,
)

# Run the preprocessing function over both splits.
processed_train_text = list(map(preprocess_text, train_text))
processed_test_text = list(map(preprocess_text, test_text))

# Build the dictionary from the training split, then filter extremes:
# drop tokens that appear in fewer than 10 documents or in more than 50%
# of the documents.
dictionary = Dictionary(processed_train_text)
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Training corpus in bag-of-words form.
corpus_train = list(map(dictionary.doc2bow, processed_train_text))

In [21]:
# Fit the STM on the training corpus.
num_topics = 5  # number of topics to extract
stm = StmModel(
    corpus_train,
    num_topics=num_topics,
    id2word=dictionary,
    passes=2,
    random_state=42,
)

In [22]:
# Show the fitted topics, one per line. Each entry prints as a
# (topic_id, 'weight*"word" + ...') tuple.
topics = stm.print_topics()
for topic in topics:
    print(topic)

(0, '0.008*"would" + 0.007*"space" + 0.006*"one" + 0.004*"also" + 0.004*"time" + 0.004*"like" + 0.004*"could" + 0.004*"may" + 0.004*"use" + 0.003*"much"')
(1, '0.012*"edu" + 0.009*"dos" + 0.009*"windows" + 0.007*"use" + 0.007*"com" + 0.007*"file" + 0.006*"software" + 0.006*"image" + 0.005*"jpeg" + 0.005*"program"')
(2, '0.009*"one" + 0.009*"would" + 0.008*"people" + 0.006*"think" + 0.006*"god" + 0.005*"like" + 0.005*"know" + 0.004*"time" + 0.004*"even" + 0.004*"well"')
(3, '0.016*"max" + 0.008*"drive" + 0.007*"one" + 0.006*"would" + 0.006*"get" + 0.006*"like" + 0.006*"new" + 0.005*"use" + 0.005*"card" + 0.005*"car"')
(4, '0.007*"government" + 0.005*"new" + 0.005*"armenian" + 0.005*"israel" + 0.004*"state" + 0.004*"people" + 0.004*"encryption" + 0.004*"key" + 0.004*"states" + 0.004*"university"')


In [23]:
# Save the STM model
# Create the target directory first so a fresh checkout without test_data/
# does not fail here (guard in case save() does not create missing parent
# directories -- TODO confirm against py_stm).
os.makedirs("test_data", exist_ok=True)
stm.save("test_data/newsgroup_stm")

In [24]:
# Load the now saved STM
stm = StmModel.load("test_data/newsgroup_stm")

# Get the topic distribution of every test document using the loaded STM model
corpus_test = [dictionary.doc2bow(text) for text in processed_test_text]
doc_topics_test = [stm.get_document_topics(doc) for doc in corpus_test]

# Display topics for the first five test documents (as an example)
print("Topics for the first five test documents:")
# Bound the loop by the data so a test split with fewer than five documents
# does not raise IndexError.
for i in range(min(5, len(doc_topics_test))):
    print(f"Document {i}:")

    # Print both the original text (first 100 characters) and the topic distribution
    print(f"{test_text[i].strip()[:100]}...\n{doc_topics_test[i]}\n")

Topics for the first five test documents:
Document 0:
The runner can leave his base at any time.  If the ball is caught,
he's got to tag up.  If it isn't ...
[(2, 0.9815148)]

Document 1:
Well, it's not an FTP site, but I got an 800 number for Signetics BBS.

The Signetics BBS contain so...
[(1, 0.95122993), (3, 0.036857087)]

Document 2:
Hi,
    I was reading through "The Spaceflight Handbook" and somewhere in
there the author discusses...
[(0, 0.97550154), (4, 0.010504319)]

Document 3:
I was a graduate student in the early 1980s, and we had a conference on 
Reaganomics where Jerry Jor...
[(2, 0.49791273), (4, 0.49355567)]

Document 4:
FREE-ENERGY TECHNOLOGY
                       by Robert E. McElwaine, Physicist
          
         ...
[(0, 0.70560914), (1, 0.012896208), (2, 0.080270626), (3, 0.07121902), (4, 0.13000502)]

