In [1]:
import numpy as np
import pandas as pd
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from py_stm.stm import StmModel
from nltk.corpus import stopwords
import nltk
from sklearn.model_selection import train_test_split

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim_models


### Poliblogs
A collection of blog posts about American politics written in 2008, from the CMU 2008 Political Blog Corpus (Eisenstein and Xing 2010).

Collected from 6 different blogs:
* American Thinker
* Digby
* Hot Air
* Michelle Malkin
* Think Progress
* Talking Points Memo

Each blog post comes with two pieces of metadata: the day it was written and the political ideology of the blog it was written for.

In [2]:
# Read the corpus from disk and, in the same chain, keep only the columns
# whose name does not match '^Unnamed'.
poliblogs = (
    pd.read_csv("test_data/poliblogs2008.csv")
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)

poliblogs.head()

Unnamed: 0,documents,docname,rating,day,blog
0,"After a week of false statements, lies, and di...",at0800300_1.text,Conservative,3,at
1,I honestly don't know how either party's caucu...,at0800300_2.text,Conservative,3,at
2,While we stand in awe of the willingness of ou...,at0800300_3.text,Conservative,3,at
3,These pages recently said goodbye to global wa...,at0800300_4.text,Conservative,3,at
4,A US report shows how the enemy controlled the...,at0800300_5.text,Conservative,3,at


In [3]:
# Report how many rows (one per blog post) the loaded frame contains.
print(f"There are {len(poliblogs)} many documents in the poliblogs dataset")

There are 13246 many documents in the poliblogs dataset


In [4]:
# preprocess_text reads NLTK's English stop-word list, so make sure the
# 'stopwords' package is available locally before it is first called.
nltk.download('stopwords')  # Download the stopwords

[nltk_data] Downloading package stopwords to C:\Users\Tyler
[nltk_data]     Holston\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [5]:
# Lazily-filled cache of NLTK's English stop words, so the corpus file is read
# and turned into a set once instead of once per document.
_ENGLISH_STOP_WORDS = None


def preprocess_text(text, stop_words=None):
    """Tokenize a single document and remove stop words.

    Parameters
    ----------
    text : str
        Raw document text.
    stop_words : collection of str, optional
        Tokens to discard. When omitted, NLTK's English stop-word list is
        used; it is loaded on the first call and reused afterwards.

    Returns
    -------
    list of str
        The tokens produced by ``simple_preprocess`` that are purely
        alphanumeric and not in ``stop_words``, in document order.
    """
    global _ENGLISH_STOP_WORDS
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS

    # simple_preprocess lowercases and tokenizes the text (tokenization is
    # what discards punctuation); deacc=True additionally strips accent marks
    # and min_len=3 drops tokens shorter than three characters.
    tokens = simple_preprocess(text, deacc=True, min_len=3)

    # isalnum() rejects any token that still contains a non-alphanumeric
    # character (e.g. an underscore).
    return [token for token in tokens if token.isalnum() and token not in stop_words]

In [6]:
# Hold out 20% of the documents so that model save/load can be demonstrated later
train_text, test_text = train_test_split(
    poliblogs.documents,
    test_size=0.2,
    random_state=42,
)

# Tokenize and clean every document in both splits
processed_train_text = list(map(preprocess_text, train_text))
processed_test_text = list(map(preprocess_text, test_text))

# Build the vocabulary from the training split only, then prune it in place:
# drop tokens found in fewer than 10 documents or in more than 50% of them
dictionary = Dictionary(processed_train_text)
dictionary.filter_extremes(no_below=10, no_above=0.5)

# Encode each training document as a bag of words over that vocabulary
corpus_train = list(map(dictionary.doc2bow, processed_train_text))

In [7]:
# Fit the STM on the bag-of-words training corpus
num_topics = 5  # how many topics the model should extract
stm = StmModel(
    corpus_train,
    num_topics=num_topics,
    id2word=dictionary,
    passes=2,
    random_state=42,
)

In [8]:
# Ask the fitted model for its topics and print them one per line. Each entry
# is a (topic id, description) pair, where the description string lists the
# topic's top terms together with their weights.
topics = stm.print_topics()
for topic in topics:
    print(topic)

(0, '0.011*"iraq" + 0.008*"war" + 0.007*"bush" + 0.006*"said" + 0.005*"military" + 0.005*"president" + 0.004*"iran" + 0.004*"american" + 0.004*"government" + 0.004*"israel"')
(1, '0.004*"government" + 0.004*"house" + 0.004*"said" + 0.004*"new" + 0.004*"congress" + 0.004*"health" + 0.004*"bill" + 0.004*"federal" + 0.004*"bush" + 0.003*"president"')
(2, '0.034*"mccain" + 0.013*"campaign" + 0.009*"said" + 0.009*"palin" + 0.008*"john" + 0.006*"barack" + 0.006*"president" + 0.006*"hillary" + 0.006*"clinton" + 0.005*"bush"')
(3, '0.010*"mccain" + 0.006*"new" + 0.006*"democrats" + 0.004*"campaign" + 0.004*"democratic" + 0.004*"hillary" + 0.004*"vote" + 0.004*"economy" + 0.004*"election" + 0.004*"get"')
(4, '0.005*"like" + 0.005*"people" + 0.004*"political" + 0.004*"even" + 0.003*"right" + 0.003*"time" + 0.003*"new" + 0.003*"know" + 0.003*"think" + 0.003*"media"')


In [9]:
# Switch pyLDAvis to inline notebook rendering, then prepare the interactive
# topic view from the fitted model, the training corpus and its vocabulary.
pyLDAvis.enable_notebook()
vis_poliblogs = pyLDAvis.gensim_models.prepare(
    stm,
    corpus_train,
    stm.id2word,
    mds='mmds',
)
vis_poliblogs

