In [3]:
import pandas as pd
import re
from collections import Counter
import little_mallet_wrapper
import glob
from pathlib import Path

In [4]:
# Importing WSJ data
filePath = "./scraped-wsj-data/WSJ_2020-11-1_2020-11-30.csv"
data = pd.read_csv(filePath)
data.head(3)

Unnamed: 0,Date,Headline,Type,URL,Text
0,2020/11/01,How Congress Can Get Kids Back to Class,Commentary,https://www.wsj.com/articles/how-congress-can-...,"The U.K., France and other European countries ..."
1,2020/11/01,Voters and the Other Supreme Courts,Commentary,https://www.wsj.com/articles/voters-and-the-ot...,"The Supreme Court is on Tuesday’s ballot, lite..."
2,2020/11/01,Trump Has Torn the Mask Off the Liberal Media,Commentary,https://www.wsj.com/articles/trump-has-torn-th...,Why does the media hate Donald Trump so much? ...


In [5]:
# Select the first paragraph of text
full_text = data["Text"].values.tolist()
first_paragraph = [t.split('\n')[0] for t in data["Text"]]
print(first_paragraph[:5])

['The U.K., France and other European countries are closing businesses in another wave of lock downs, as Covid-19 cases and hospitalizations rise. But unlike in the spring, schools are open. Infections are also surging across the U.S., where in many places in-person classrooms are closed and distance learning has become the norm. But there’s growing evidence schools can open safely, with reasonable precautions and, perhaps soon, regular testing. ', 'The Supreme Court is on Tuesday’s ballot, literally in some states. Most state high-court judges don’t have lifetime tenure and have to face voters periodically for reconfirmation or re-election. On this week’s ballot are 66 high-court seats in 31 states, including three in North Carolina and two each in Georgia, Michigan and Ohio. Democrats hope to unseat conservatives and gain control of Ohio’s high court for the first time since 1993 and Michigan’s since 2010. ', 'Why does the media hate Donald Trump so much? There are all kinds of theor

In [6]:
training_data = [little_mallet_wrapper.process_string(text) for text in first_paragraph]

In [9]:
# splitting text into all lowercase
def split_into_words(any_chunk_of_text):
    lowercase_text = [s.lower() for s in any_chunk_of_text]
    lowercase_text = ''.join(lowercase_text)
    split_words = re.split("\W+", lowercase_text)
    return split_words

In [10]:
number_of_desired_words = 10
# Manipulate and Analyze File
stopwords = ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours',
'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', 'her', 'hers',
 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves',
 'what', 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does',
 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until',
 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into',
 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down',
 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here',
 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more',
 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so',
 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 've', 'll', 'amp']


all_the_words = split_into_words(first_paragraph)
meaningful_words = [word for word in all_the_words if word not in stopwords]
meaningful_words_tally = Counter(meaningful_words)
most_frequent_meaningful_words = meaningful_words_tally.most_common(number_of_desired_words)

In [19]:
path_to_mallet = 'C:/mallet-2.0.8/bin/mallet'
num_topics = 5

#Change to your desired output directory
output_directory_path = './topic-model-output'

#No need to change anything below here
Path(f"{output_directory_path}").mkdir(parents=True, exist_ok=True)

path_to_training_data           = f"{output_directory_path}/training.txt"
path_to_formatted_training_data = f"{output_directory_path}/mallet.training"
path_to_model                   = f"{output_directory_path}/mallet.model.{str(num_topics)}"
path_to_topic_keys              = f"{output_directory_path}/mallet.topic_keys.{str(num_topics)}"
path_to_topic_distributions     = f"{output_directory_path}/{str(num_topics)}"


In [20]:
#Importing Data
little_mallet_wrapper.import_data(path_to_mallet,
                path_to_training_data,
                path_to_formatted_training_data,
                training_data)

Importing data...
Complete


In [21]:
#Training Data
little_mallet_wrapper.train_topic_model(path_to_mallet,
                      path_to_formatted_training_data,
                      path_to_model,
                      path_to_topic_keys,
                      path_to_topic_distributions,
                      num_topics)

Training topic model...
Complete


In [22]:
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)

for topic_number, topic in enumerate(topics):
    print(f"✨Topic {topic_number}✨\n\n{topic}\n")

✨Topic 0✨

['biden', 'president', 'trump', 'election', 'joe', 'NUM', 'states', 'party', 'senate', 'court', 'nov', 'democrats', 'administration', 'donald', 'obama', 'democratic', 'presidential', 'tuesday', 'vote', 'won']

✨Topic 1✨

['new', 'nov', 'state', 'federal', 'government', 'could', 'york', 'regarding', 'gov', 'right', 'two', 'school', 'legal', 'politics', 'support', 'law', 'city', 'long', 'schools', 'home']

✨Topic 2✨

['NUM', 'covid', 'many', 'pandemic', 'people', 'media', 'oct', 'nov', 'health', 'also', 'world', 'news', 'country', 'vaccine', 'case', 'care', 'day', 'point', 'make', 'business']

✨Topic 3✨

['NUM', 'year', 'time', 'voters', 'first', 'political', 'may', 'like', 'even', 'since', 'another', 'well', 'tax', 'million', 'years', 'still', 'monday', 'way', 'past', 'california']

✨Topic 4✨

['one', 'would', 'american', 'america', 'years', 'policy', 'china', 'democracy', 'much', 'made', 'world', 'americans', 'back', 'even', 'end', 'trade', 'national', 'want', 'book', 'secur