## Example of STM in use
This is different than poliblogs.ipynb because we are demonstrating use of metadata

In [1]:
import numpy as np
import pandas as pd
from patsy import dmatrix
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from py_stm.stm import StmModel
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models

In [2]:
poliblogs = pd.read_csv("test_data/poliblogs2008.csv", )
poliblogs = poliblogs.loc[:, ~poliblogs.columns.str.contains('^Unnamed')]

poliblogs.head()

Unnamed: 0,documents,docname,rating,day,blog
0,"After a week of false statements, lies, and di...",at0800300_1.text,Conservative,3,at
1,I honestly don't know how either party's caucu...,at0800300_2.text,Conservative,3,at
2,While we stand in awe of the willingness of ou...,at0800300_3.text,Conservative,3,at
3,These pages recently said goodbye to global wa...,at0800300_4.text,Conservative,3,at
4,A US report shows how the enemy controlled the...,at0800300_5.text,Conservative,3,at


In [3]:
print(f"There are {len(poliblogs)} many documents in the poliblogs dataset")

There are 13246 many documents in the poliblogs dataset


In [4]:
nltk.download('stopwords')  # Download the stopwords

[nltk_data] Downloading package stopwords to C:\Users\Tyler
[nltk_data]     Holston\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def preprocess_text(text, min_len):
    tokens = simple_preprocess(text, deacc=True, min_len=min_len)  # deacc=True removes punctuation
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    return filtered_tokens


In [6]:
# Split the text and metadata for training
train_text, test_text, train_metadata, test_metadata = train_test_split(
    poliblogs.documents, poliblogs[['rating', 'day', 'blog']], test_size=0.8, random_state=42
)

# Apply the preprocessing function to the text data
processed_train_text = [preprocess_text(text, min_len=3) for text in train_text]
processed_test_text = [preprocess_text(text, min_len=3) for text in test_text]

# Create the training dictionary
dictionary = Dictionary(processed_train_text)

# Filter extremes (remove tokens that appear in less than 10 documents, or more than 50% of the documents)
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Create the training corpus (bag of words representation)
corpus_train = [dictionary.doc2bow(text) for text in processed_train_text]

In [7]:
from scipy.sparse import csr_matrix
corpus_train[190]
doc_idx, word_idx, count = [], [], []

for i, doc in enumerate(corpus_train):
	for word, freq in doc:
		doc_idx.append(i)
		word_idx.append(word)
		count.append(freq)

a = csr_matrix((count, (doc_idx, word_idx)))

wprob = np.sum(a, axis=0)
wprob = wprob / np.sum(wprob)    
wprob = np.array(wprob)

wprob.flatten()

array([3.46802029e-04, 2.93869088e-04, 7.97644667e-04, ...,
       2.00780122e-05, 2.19032860e-05, 2.55538337e-05])

In [13]:
# A user could precompute the prevalence themselves like so
from patsy import dmatrix

# prevalence = dmatrix("~rating+cr(day, df=3)", data=train_metadata, return_type='dataframe')
prevalence = dmatrix("~rating+cr(day, df=3)", data=train_metadata, return_type='dataframe')
a = prevalence.astype("category")
prevalence

Unnamed: 0,Intercept,rating[T.Liberal],"cr(day, df=3)[0]","cr(day, df=3)[1]","cr(day, df=3)[2]"
11708,1.0,1.0,0.609936,0.461057,-0.070993
7650,1.0,0.0,-0.096484,0.804462,0.292022
8599,1.0,0.0,-0.027219,0.163578,0.863640
7788,1.0,0.0,-0.096609,0.739139,0.357470
3456,1.0,1.0,0.558841,0.519251,-0.078092
...,...,...,...,...,...
11964,1.0,1.0,0.175061,0.907666,-0.082727
5191,1.0,0.0,0.863043,0.163875,-0.026919
5390,1.0,0.0,0.609936,0.461057,-0.070993
860,1.0,0.0,0.310744,0.784581,-0.095325


In [14]:
# Train the STM model
num_topics = 5  # Define the number of topics you want to extract

# PREVALENCE MODEL
#stm = StmModel(corpus_train, num_topics=num_topics, id2word=dictionary, prevalence=prevalence, passes=2, random_state=420) # intended use 1. prevalence matrix precomputed
#stm = StmModel(corpus_train, num_topics=num_topics, id2word=dictionary, metadata=train_metadata, prevalence="~rating+cr(day, df=3)", passes=2, random_state=420) # intended use 2. metadata dataframe with prevalence formula

# CONTENT MODEL
#stm = StmModel(corpus_train, num_topics=num_topics, id2word=dictionary, metadata=train_metadata, content=train_metadata.loc[:, "rating"], passes=2, random_state=420) # intended use 3. metadata dataframe and content formula

# BOTH MODEL
stm = StmModel(corpus_train, num_topics=num_topics, id2word=dictionary, prevalence=prevalence, content=train_metadata.loc[:, "rating"], passes=2, random_state=420)

  self._set_arrayXarray(i, j, x)


In [11]:
stm.label_topics(range(0,5), n=15, print_labels=True)

Topic Words:
Topic 0: mccain, john, said, palin, think, today, new, sen, even, campaign, going, people, like, war, time
Topic 1: bush, iraq, administration, house, war, government, mccain, said, security, new, people, president, like, white, time
Topic 2: president, said, people, like, american, new, time, think, government, years, right, know, war, also, even
Topic 3: campaign, said, new, people, like, barack, time, get, also, know, last, american, palin, even, political
Topic 4: hillary, clinton, even, could, like, new, state, people, two, democrats, time, think, get, may, vote

Covariate Words:
Group Liberal: mccain, like, new, people, even, said, time, two, campaign, barack, get, may, last, government, could
Group Conservative: said, mccain, new, people, president, bush, campaign, think, like, john, iraq, know, right, time, today

Topic-Covariate Interactions:
Topic 0, Group Liberal: mccain, john, even, said, palin, new, two, update, like, people, may, time, also, barack, last 
Top

(array([['mccain', 'john', 'said', 'palin', 'think', 'today', 'new',
         'sen', 'even', 'campaign', 'going', 'people', 'like', 'war',
         'time'],
        ['bush', 'iraq', 'administration', 'house', 'war', 'government',
         'mccain', 'said', 'security', 'new', 'people', 'president',
         'like', 'white', 'time'],
        ['president', 'said', 'people', 'like', 'american', 'new', 'time',
         'think', 'government', 'years', 'right', 'know', 'war', 'also',
         'even'],
        ['campaign', 'said', 'new', 'people', 'like', 'barack', 'time',
         'get', 'also', 'know', 'last', 'american', 'palin', 'even',
         'political'],
        ['hillary', 'clinton', 'even', 'could', 'like', 'new', 'state',
         'people', 'two', 'democrats', 'time', 'think', 'get', 'may',
         'vote']], dtype='<U14'),
 array([['mccain', 'like', 'new', 'people', 'even', 'said', 'time', 'two',
         'campaign', 'barack', 'get', 'may', 'last', 'government',
         'could'],

In [12]:
stm.N = stm.lencorpus
topics = [2, 3, 4] # or range(2, 5)
for topic, docs in zip(topics, stm.find_thoughts(topics, n=6)):
    print(f"Topic: {topic}")
    print(poliblogs.iloc[docs]['documents'])
    print("\n")

Topic: 2
2310    At NationalPost.com, journalist David Frum has...
1811    On July 12th of this year, just two and a half...
682     Thanks to the release of Barack Obama's income...
1375    AT contributors have some further thoughts on ...
2465    What is it with the legacy media? Don't they k...
Name: documents, dtype: object


Topic: 3
2579    Last Sunday, when Joe Biden predicted that a P...
2410    ACORN, responsible for so much vote fraud in W...
219     Defying the expectations of the pundits, John ...
1487    You have to wonder what will be going through ...
1247    Making it clear that she will not allow any co...
Name: documents, dtype: object


Topic: 4
1230    Note to Iran apologists: Even the United Natio...
1959    Last week, Ukrainian President Viktor Yushchen...
115     Continuing his visit to the Middle East, Presi...
861     With distorted glasses on his eyes and hatred ...
1623    How do journalism schools manage to keep train...
Name: documents, dtype: object


