## Example of STM in use
This is different than poliblogs.ipynb because we are demonstrating use of metadata

In [1]:
import numpy as np
import pandas as pd
from patsy import dmatrix
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from py_stm.stm import StmModel
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models

In [2]:
poliblogs = pd.read_csv("test_data/poliblogs2008.csv", )
poliblogs = poliblogs.loc[:, ~poliblogs.columns.str.contains('^Unnamed')]

poliblogs.head()

Unnamed: 0,documents,docname,rating,day,blog
0,"After a week of false statements, lies, and di...",at0800300_1.text,Conservative,3,at
1,I honestly don't know how either party's caucu...,at0800300_2.text,Conservative,3,at
2,While we stand in awe of the willingness of ou...,at0800300_3.text,Conservative,3,at
3,These pages recently said goodbye to global wa...,at0800300_4.text,Conservative,3,at
4,A US report shows how the enemy controlled the...,at0800300_5.text,Conservative,3,at


In [3]:
print(f"There are {len(poliblogs)} many documents in the poliblogs dataset")

There are 13246 many documents in the poliblogs dataset


In [4]:
nltk.download('stopwords')  # Download the stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/tylerholston/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
def preprocess_text(text, min_len):
    tokens = simple_preprocess(text, deacc=True, min_len=min_len)  # deacc=True removes punctuation
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token.isalnum() and token not in stop_words]
    return filtered_tokens


In [6]:
# Split the text and metadata for training
train_text, test_text, train_metadata, test_metadata = train_test_split(
    poliblogs.documents, poliblogs[['rating', 'day', 'blog']], test_size=0.8, random_state=42
)

# Apply the preprocessing function to the text data
processed_train_text = [preprocess_text(text, min_len=3) for text in train_text]
processed_test_text = [preprocess_text(text, min_len=3) for text in test_text]

# Create the training dictionary
dictionary = Dictionary(processed_train_text)

# Filter extremes (remove tokens that appear in less than 10 documents, or more than 50% of the documents)
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Create the training corpus (bag of words representation)
corpus_train = [dictionary.doc2bow(text) for text in processed_train_text]

In [7]:
from scipy.sparse import csr_matrix
corpus_train[190]
doc_idx, word_idx, count = [], [], []

for i, doc in enumerate(corpus_train):
	for word, freq in doc:
		doc_idx.append(i)
		word_idx.append(word)
		count.append(freq)

a = csr_matrix((count, (doc_idx, word_idx)))

wprob = np.sum(a, axis=0)
wprob = wprob / np.sum(wprob)    
wprob = np.array(wprob)

wprob.flatten()

array([3.46802029e-04, 2.93869088e-04, 7.97644667e-04, ...,
       2.00780122e-05, 2.19032860e-05, 2.55538337e-05])

In [8]:
# A user could precompute the prevalence themselves like so
from patsy import dmatrix

# prevalence = dmatrix("~rating+cr(day, df=3)", data=train_metadata, return_type='dataframe')
prevalence = dmatrix("~rating", data=train_metadata, return_type='dataframe')
a = prevalence.astype("category")
prevalence

Unnamed: 0,Intercept,rating[T.Liberal]
11708,1.0,1.0
7650,1.0,0.0
8599,1.0,0.0
7788,1.0,0.0
3456,1.0,1.0
...,...,...
11964,1.0,1.0
5191,1.0,0.0
5390,1.0,0.0
860,1.0,0.0


In [9]:
# Train the STM model
num_topics = 5  # Define the number of topics you want to extract

# PREVALENCE MODEL
#stm = StmModel(corpus_train, num_topics=num_topics, id2word=dictionary, prevalence=prevalence, passes=10, random_state=420, chunksize=len(corpus_train)) # intended use 1. prevalence matrix precomputed

# CONTENT MODEL
stm = StmModel(corpus_train, num_topics=num_topics, id2word=dictionary, metadata=train_metadata, content=train_metadata.loc[:, "rating"], passes=10, random_state=420, chunksize=len(corpus_train)) # intended use 3. metadata dataframe and content formula

# BOTH MODEL
#stm = StmModel(corpus_train, num_topics=num_topics, id2word=dictionary, prevalence=prevalence, content=train_metadata.loc[:, "rating"], passes=10, random_state=420, chunksize=len(corpus_train))

  self._set_arrayXarray(i, j, x)


In [13]:
stm.label_topics(range(0,5), n=15, print_labels=True)

Topic Words:
Topic 0: mccain, said, new, people, campaign, john, like, president, bush, time, even, think, get, iraq, also
Topic 1: mccain, bush, said, people, new, iraq, campaign, like, president, time, even, john, get, think, also
Topic 2: mccain, said, president, people, new, like, campaign, time, bush, even, think, american, get, also, john
Topic 3: mccain, campaign, said, new, people, like, time, president, bush, even, get, also, john, think, barack
Topic 4: mccain, said, new, people, like, campaign, time, president, even, bush, think, get, john, also, could

Covariate Words:
Group Liberal: mccain, like, people, new, even, said, campaign, time, two, barack, get, may, last, government, could
Group Conservative: mccain, said, bush, president, campaign, people, new, john, iraq, think, like, know, right, time, today

Topic-Covariate Interactions:
Topic 0, Group Liberal: mccain, like, even, new, people, said, campaign, two, time, barack, get, may, last, government, could 
Topic 0, Grou

(array([['mccain', 'said', 'new', 'people', 'campaign', 'john', 'like',
         'president', 'bush', 'time', 'even', 'think', 'get', 'iraq',
         'also'],
        ['mccain', 'bush', 'said', 'people', 'new', 'iraq', 'campaign',
         'like', 'president', 'time', 'even', 'john', 'get', 'think',
         'also'],
        ['mccain', 'said', 'president', 'people', 'new', 'like',
         'campaign', 'time', 'bush', 'even', 'think', 'american', 'get',
         'also', 'john'],
        ['mccain', 'campaign', 'said', 'new', 'people', 'like', 'time',
         'president', 'bush', 'even', 'get', 'also', 'john', 'think',
         'barack'],
        ['mccain', 'said', 'new', 'people', 'like', 'campaign', 'time',
         'president', 'even', 'bush', 'think', 'get', 'john', 'also',
         'could']], dtype='<U10'),
 array([['mccain', 'like', 'people', 'new', 'even', 'said', 'campaign',
         'time', 'two', 'barack', 'get', 'may', 'last', 'government',
         'could'],
        ['mccain

In [15]:
topics = [1, 2, 3] # or range(2, 5)
for topic, docs in zip(topics, stm.find_thoughts(topics, n=6)):
    print(f"Topic: {topic}")
    print(poliblogs.iloc[docs]['documents'])
    print("\n")

Topic: 1
1925    Stephen Hayes of the Weekly Standard has the s...
1600    The Democratic Party has announced that Barack...
706     As a followup to Andrew Walden's excellent pie...
757     No, it's not a misprint. Officials with the ZA...
1680    This article  (in the New York Times, no less)...
2413    My jaw hit the floor when I heard Biden say th...
Name: documents, dtype: object


Topic: 2
1811    On July 12th of this year, just two and a half...
1970    Little movement has been made by Russia to tak...
2193    Barack Obama apparently wishes his two autobio...
333     The ultraliberal CBC reports a truth that is m...
1711    I just learned of the following letter from Do...
176     Bill and Hillary Clinton are aggressive politi...
Name: documents, dtype: object


Topic: 3
1580    You stay classy, Bill Jeff:Bill Clinton is spe...
1487    You have to wonder what will be going through ...
2410    ACORN, responsible for so much vote fraud in W...
1737    I was interested to see that 

In [None]:
stm.printTopTopics("topicCorr.png", topics=np.array([0, 1, 2, 3, 4]), bbox=(0,0,500,500))

In [None]:
import pyLDAvis
pyLDAvis.enable_notebook()
vis_accommodations = pyLDAvis.gensim_models.prepare(stm, corpus_train, stm.id2word, mds="mmds")
# vis_accommodations.save('visualizations/poliblogs.html') Saving doesn't work right now
vis_accommodations