In [1]:
import pandas as pd
import re
from collections import Counter
import little_mallet_wrapper
from pathlib import Path
import datetime

In [140]:
# Number of topics for the model. This is the single knob to change: the
# output suffix below is derived from it, so the output directory and the
# results file name can never disagree with the trained model.
num_topics = 30
# 'all' is the label prefix used for this run's outputs (presumably the combined corpus — confirm).
output_suffix = f'all-{num_topics}'

In [15]:
# Load the two scraped corpora from their CSV exports.

# Wall Street Journal articles.
filePath = "./WSJournal/scraped-wsj-data/WSJ_2020-1-1_2020-12-31.csv"
originalWSJ = pd.read_csv(filePath)

# New York Times articles. The rename gives the NYT text column the
# capitalised name "Text"; the "Headline" entry maps the name onto itself.
fileLocation = "./NYTimes/articles.csv"
originalNYT = pd.read_csv(fileLocation).rename(
    columns={"Headline": "Headline", "text": "Text"}
)

In [16]:
# Preview the first five WSJ rows to sanity-check the load.
originalWSJ.head(n=5)

Unnamed: 0,Date,Headline,Type,URL,Text
0,2020/01/01,Why Would Elizabeth Warren Want More Banks?,Commentary,https://www.wsj.com/articles/why-would-elizabe...,"America is losing too many banks, according to..."
1,2020/01/01,‘Hate Crime’ Is Only a Step Away From Thoughtc...,Commentary,https://www.wsj.com/articles/hate-crime-is-onl...,Does it make sense that a person can burn an A...
2,2020/01/01,Latin America’s ‘Oasis’ Descends Into Chaos,Commentary,https://www.wsj.com/articles/latin-americas-oa...,"Chile—Latin America’s freest, most stable and ..."
3,2020/01/01,Gertrude Himmelfarb,Review & Outlook,https://www.wsj.com/articles/gertrude-himmelfa...,She was an accomplished historian known for ri...
4,2020/01/02,Warren Zevon’s Wisdom for the 2020s,Declarations,https://www.wsj.com/articles/warren-zevons-wis...,I bumped into a great artist on the morning of...


In [17]:
# Preview the first five NYT rows to sanity-check the load and the column rename.
originalNYT.head(n=5)

Unnamed: 0,Headline,Text
0,A very American story about capitalism consumi...,Why is the United States running out of face m...
1,We don’t need any more novels or TV shows abou...,It’s happening again. It took a fraudulent 911...
2,Mitch McConnell has a tricky needle to thread.txt,"Mitch McConnell, the Senate majority leader, d..."
3,Residents are nervous and exhausted It’s not t...,Gov. Andrew Cuomo and Mayor Bill de Blasio hav...
4,Here are seven reasons your “coronavirus party...,"As the coronavirus continues to spread, epidem..."


In [18]:
# Drop the WSJ metadata columns (Date, Type, URL), then stack the two corpora
# with the NYT rows first and a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement and produces the same frame.
subsetWSJ = originalWSJ.drop(columns=["Date", "Type", "URL"])
combination = pd.concat([originalNYT, subsetWSJ], ignore_index=True)


In [25]:
# Text column of the combined (NYT + WSJ) frame.
sentences = combination["Text"]

# Text column of the NYT frame on its own.
sentencesNYT = originalNYT["Text"]

# WSJ texts as a plain Python list; for each one keep only the part that
# precedes the first newline (the whole string when there is no newline).
textWSJ = originalWSJ["Text"].values.tolist()
sentencesWSJ = [article.partition('\n')[0] for article in textWSJ]

In [26]:
# Preview the first five rows of the combined frame.
combination.head(n=5)

Unnamed: 0,Headline,Text
0,A very American story about capitalism consumi...,Why is the United States running out of face m...
1,We don’t need any more novels or TV shows abou...,It’s happening again. It took a fraudulent 911...
2,Mitch McConnell has a tricky needle to thread.txt,"Mitch McConnell, the Senate majority leader, d..."
3,Residents are nervous and exhausted It’s not t...,Gov. Andrew Cuomo and Mayor Bill de Blasio hav...
4,Here are seven reasons your “coronavirus party...,"As the coronavirus continues to spread, epidem..."


In [27]:
# Run every document through little_mallet_wrapper.process_string (with its
# default options) to build the training lists: one per source, plus one for
# the combined corpus.
trainWSJ = list(map(little_mallet_wrapper.process_string, sentencesWSJ))
trainNYT = list(map(little_mallet_wrapper.process_string, sentencesNYT))
train = list(map(little_mallet_wrapper.process_string, sentences))


In [141]:
# Location of the MALLET launcher on this machine.
path_to_mallet = 'C:/mallet-2.0.8/bin/mallet'
# Documents handed to MALLET: the processed combined corpus.
training_data = train

# Change to your desired output directory
output_directory_path = f'./topic-model-outputs/{output_suffix}'

# No need to change anything below here
# Create the output directory (and any missing parents); a no-op if it exists.
Path(output_directory_path).mkdir(parents=True, exist_ok=True)

# Files MALLET reads and writes, all inside the output directory.
path_to_training_data = output_directory_path + "/training.txt"
path_to_formatted_training_data = output_directory_path + "/mallet.training"
path_to_model = output_directory_path + "/mallet.model." + str(num_topics)
path_to_topic_keys = output_directory_path + "/mallet.topic_keys." + str(num_topics)
# NOTE(review): unlike the other outputs, this file is named with just the
# topic count (no "mallet." prefix) — confirm that is intentional before renaming.
path_to_topic_distributions = output_directory_path + "/" + str(num_topics)

In [142]:
# Hand the training documents to MALLET: the call receives the plain-text
# training path, the MALLET-formatted training path, and the documents.
little_mallet_wrapper.import_data(
    path_to_mallet,
    path_to_training_data,
    path_to_formatted_training_data,
    training_data,
)

Importing data...
Complete


In [143]:
# Train the topic model on the formatted training data with `num_topics`
# topics; the model, topic-key and topic-distribution paths are passed in
# for MALLET's outputs.
little_mallet_wrapper.train_topic_model(
    path_to_mallet,
    path_to_formatted_training_data,
    path_to_model,
    path_to_topic_keys,
    path_to_topic_distributions,
    num_topics,
)

Training topic model...
Complete


In [144]:
# Read the topic keys written during training and tabulate them; the preview
# below shows the first five rows of the resulting frame.
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)
topicsDF = pd.DataFrame(data=topics)
topicsDF.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,world,pandemic,crisis,economic,global,coronavirus,countries,america,government,economy,response,president,mexico,international,national,country,like,power,could,health
1,NUM,fed,debt,financial,banks,treasury,loans,market,economy,money,bank,interest,federal,markets,rates,reserve,credit,central,crisis,policy
2,companies,business,company,new,administration,rules,one,financial,services,government,rule,private,data,investors,corporate,market,must,make,social,copyright
3,black,white,american,racial,NUM,america,race,americans,racism,history,people,diversity,women,blacks,identity,country,culture,racist,justice,men
4,NUM,fbi,trump,general,department,investigation,former,justice,attorney,intelligence,barr,evidence,case,campaign,flynn,report,russian,officials,president,russia


In [145]:
# Save the topic-key table as CSV (without the index column).
# to_csv does not create missing directories, so make sure ./results exists
# first; this mirrors how the model output directory is created and lets the
# notebook run on a fresh checkout.
Path('./results').mkdir(parents=True, exist_ok=True)
topicsDF.to_csv('./results/' + output_suffix + '-topics.csv', index=False)
