# Analyzing prominent topics in NYT and WSJ opinion articles

## Part 1: Training topic models
This notebook uses data that has already been scraped and placed in the appropriate folders, `./NYTimes` and `./WSJournal`.

Topics from this part are saved to `results`, and the topic model outputs are saved to `topic-model-outputs`.

In [1]:
import pandas as pd
import re
from collections import Counter
import little_mallet_wrapper
from pathlib import Path
import datetime
import glob
import os

In [41]:
# Change the number of topics here. The suffix names every output of this run
# (model folder, topics CSV, word clouds); it is derived from num_topics so the
# two cannot drift apart. Replace 'all' if you train on a single corpus.
num_topics = 30
output_suffix = f'all-{num_topics}'

In [4]:
# WSJ articles: one CSV whose file name spans 2020-1-1 to 2020-12-31
filePath = "./WSJournal/scraped-wsj-data/WSJ_2020-1-1_2020-12-31.csv"
originalWSJ = pd.read_csv(filePath)

# NYT articles: normalise the text column name so both frames expose "Text"
fileLocation = "./NYTimes/articles.csv"
originalNYT = pd.read_csv(fileLocation).rename(columns={"text": "Text"})

In [5]:
# Preview the first five imported WSJ rows
originalWSJ.head(n=5)

Unnamed: 0,Date,Headline,Type,URL,Text
0,2020/01/01,Why Would Elizabeth Warren Want More Banks?,Commentary,https://www.wsj.com/articles/why-would-elizabe...,"America is losing too many banks, according to..."
1,2020/01/01,‘Hate Crime’ Is Only a Step Away From Thoughtc...,Commentary,https://www.wsj.com/articles/hate-crime-is-onl...,Does it make sense that a person can burn an A...
2,2020/01/01,Latin America’s ‘Oasis’ Descends Into Chaos,Commentary,https://www.wsj.com/articles/latin-americas-oa...,"Chile—Latin America’s freest, most stable and ..."
3,2020/01/01,Gertrude Himmelfarb,Review & Outlook,https://www.wsj.com/articles/gertrude-himmelfa...,She was an accomplished historian known for ri...
4,2020/01/02,Warren Zevon’s Wisdom for the 2020s,Declarations,https://www.wsj.com/articles/warren-zevons-wis...,I bumped into a great artist on the morning of...


In [6]:
# Preview the first five imported NYT rows
originalNYT.head(n=5)

Unnamed: 0,Headline,Text
0,A very American story about capitalism consumi...,Why is the United States running out of face m...
1,We don’t need any more novels or TV shows abou...,It’s happening again. It took a fraudulent 911...
2,Mitch McConnell has a tricky needle to thread.txt,"Mitch McConnell, the Senate majority leader, d..."
3,Residents are nervous and exhausted It’s not t...,Gov. Andrew Cuomo and Mayor Bill de Blasio hav...
4,Here are seven reasons your “coronavirus party...,"As the coronavirus continues to spread, epidem..."


In [7]:
# Training text for each outlet.
# WSJ: keep only what precedes the first newline of every article.
# NYT: the "Text" column is used as-is.
# NOTE(review): presumably the NYT file already holds first paragraphs only — confirm.
sentencesNYT = originalNYT["Text"]
textWSJ = originalWSJ["Text"].values.tolist()
sentencesWSJ = [article.partition('\n')[0] for article in textWSJ]


In [8]:
# Merge the NYT and WSJ corpora into one frame, tagging every row with its source.
# Row order is NYT first, then WSJ; `sentences` follows that same order.
subsetWSJ = pd.DataFrame(list(zip(originalWSJ["Headline"], sentencesWSJ)), columns=['Headline', 'Text'])
subsetNYT = originalNYT.copy(deep=True)
subsetNYT["Source"] = 'NYT'
subsetWSJ["Source"] = 'WSJ'
# DataFrame.append was deprecated and then removed in pandas 2.0; pd.concat
# produces the same stacked frame with a fresh 0..n-1 index.
combination = pd.concat([subsetNYT, subsetWSJ], ignore_index=True)
sentences = combination["Text"]
combination.head()

Unnamed: 0,Headline,Text,Source
0,A very American story about capitalism consumi...,Why is the United States running out of face m...,NYT
1,We don’t need any more novels or TV shows abou...,It’s happening again. It took a fraudulent 911...,NYT
2,Mitch McConnell has a tricky needle to thread.txt,"Mitch McConnell, the Senate majority leader, d...",NYT
3,Residents are nervous and exhausted It’s not t...,Gov. Andrew Cuomo and Mayor Bill de Blasio hav...,NYT
4,Here are seven reasons your “coronavirus party...,"As the coronavirus continues to spread, epidem...",NYT


In [9]:
# Run every document through little_mallet_wrapper's string preprocessing:
# once per outlet and once for the combined corpus.
trainWSJ = list(map(little_mallet_wrapper.process_string, sentencesWSJ))
trainNYT = list(map(little_mallet_wrapper.process_string, sentencesNYT))
train = list(map(little_mallet_wrapper.process_string, sentences))


In [32]:
# Location of the MALLET launcher. Set the MALLET_PATH environment variable to
# point at your own install; the previous hard-coded path is kept as the fallback.
path_to_mallet = os.environ.get('MALLET_PATH', 'C:/mallet-2.0.8/bin/mallet')
training_data = train

#Change to your desired output directory
output_directory_path = './topic-model-outputs/' + output_suffix

#No need to change anything below here
Path(output_directory_path).mkdir(parents=True, exist_ok=True)

# Every artefact of this run lives inside the output directory.
path_to_training_data           = f"{output_directory_path}/training.txt"
path_to_formatted_training_data = f"{output_directory_path}/mallet.training"
path_to_model                   = f"{output_directory_path}/mallet.model.{num_topics}"
path_to_topic_keys              = f"{output_directory_path}/mallet.topic_keys.{num_topics}"
# The topic-distribution file is named after the topic count alone (e.g. ".../30").
path_to_topic_distributions     = f"{output_directory_path}/{num_topics}"

In [42]:
# Hand the processed documents to little_mallet_wrapper together with the two
# training-data paths defined above (plain-text file and MALLET-formatted file).
# NOTE(review): presumably the wrapper writes both files by invoking the MALLET
# binary at path_to_mallet — confirm against the wrapper's documentation.
little_mallet_wrapper.import_data(path_to_mallet,
                path_to_training_data,
                path_to_formatted_training_data,
                training_data)

Importing data...
Complete


In [43]:
# Train a topic model with num_topics topics on the formatted training data.
# The model, topic keys and per-document topic distributions go to the paths
# defined above.
# NOTE(review): no random seed is passed here, so topics may differ between
# runs — confirm whether the wrapper exposes a way to fix one.
little_mallet_wrapper.train_topic_model(path_to_mallet,
                      path_to_formatted_training_data,
                      path_to_model,
                      path_to_topic_keys,
                      path_to_topic_distributions,
                      num_topics)

Training topic model...
Complete


In [44]:
# Load the topic keys from disk; one row per topic, one column per key term
topics = little_mallet_wrapper.load_topic_keys(path_to_topic_keys)
topicsDF = pd.DataFrame(data=topics)
topicsDF.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,NUM,students,next,university,future,week,best,ask,college,night,words,view,note,published,fewer,responses,education,opinions,editor,submit
1,people,like,even,political,way,want,time,make,know,get,think,might,need,seem,media,often,american,right,good,doesn
2,country,oil,climate,energy,california,change,policies,gas,move,bret,gail,including,new,natural,industry,power,plan,recent,good,put
3,coronavirus,world,china,pandemic,crisis,global,chinese,hong,kong,beijing,america,economic,covid,virus,democracy,health,leadership,americans,may,war
4,democratic,party,sanders,presidential,bernie,political,tuesday,campaign,primary,nomination,democrats,bloomberg,candidate,debate,win,left,clinton,would,voters,iowa


In [45]:
# Save the generated topics; Part 2 reloads every CSV found in ./results.
# Create the folder first so a fresh checkout does not fail here (the model
# output directory is created the same way earlier in the notebook).
Path('./results').mkdir(parents=True, exist_ok=True)
topicsDF.to_csv('./results/' + output_suffix + '-topics.csv', index=False)

## Part 2: Visualization of data for analysis
### Word clouds
We'll make one word cloud per topic, sizing each topic word according to its frequency in the corpus.

In [15]:
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.dates as dates
%matplotlib inline

In [160]:
# Lowercase a collection of texts and split it into word tokens
def split_into_words(any_chunk_of_text):
    '''Return every lowercase word found in an iterable of texts.

    any_chunk_of_text: iterable of str. The texts are lowercased and joined
    with single spaces before tokenising.

    Returns a flat list of tokens, each a maximal run of word characters
    (regex ``\\w+``). Punctuation and whitespace only separate tokens, so
    the result never contains empty strings; empty input yields ``[]``.
    '''
    lowercase_text = ' '.join(s.lower() for s in any_chunk_of_text)
    # A raw string avoids the invalid "\W" escape warning. findall returns the
    # word runs directly, so no empty tokens appear at the edges of the text
    # (which re.split on the separators would produce).
    return re.findall(r"\w+", lowercase_text)


In [161]:
# Retrieve frequency of words in topics
def retrieve_frequency_dict(topic_terms, all_the_words):
    '''Count how often each topic term occurs in a list of corpus words.

    topic_terms: iterable of hashable terms (the key words of one topic).
    all_the_words: iterable of corpus tokens.

    Returns a collections.Counter mapping each topic term that occurs in
    ``all_the_words`` to its number of occurrences. Terms that never occur
    are absent from the result.
    '''
    # Build the lookup once: set membership is O(1), whereas testing against
    # the original list re-scans every topic term for every corpus word.
    wanted = set(topic_terms)
    return Counter(word for word in all_the_words if word in wanted)

In [184]:
# Function to make wordcloud, from https://github.com/khuyentran1401/Data-science/blob/master/nlp/linkedin_analysis/message_analysis.ipynb
def make_wordcloud(new_text, fig_name):
    '''Render a word cloud from word frequencies and save it as a PNG.

    new_text: mapping of word -> frequency, passed to
        WordCloud.generate_from_frequencies.
    fig_name: output name relative to ./wordclouds, without the extension.
        It may contain sub-folders (e.g. 'all-30/0'); missing folders are
        created.

    Returns None. Nothing is drawn or written when ``new_text`` is empty.
    '''
    if not new_text:
        # WordCloud refuses to lay out zero words (it raises ValueError), so a
        # topic with no matching corpus words is skipped rather than aborting
        # the whole batch.
        return

    wordcloud = WordCloud(width = 800, height = 800,
                min_font_size = 10,
                background_color='black',
                colormap='Set2',
                collocations=False).generate_from_frequencies(new_text)

    out_path = './wordclouds/' + fig_name + '.png'
    # savefig does not create directories; make sure the target folder exists.
    Path(out_path).parent.mkdir(parents=True, exist_ok=True)

    fig, ax = plt.subplots(figsize = (8, 8), facecolor = None)
    try:
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis("off")
        fig.tight_layout(pad = 0)
        fig.savefig(out_path, dpi=300, bbox_inches='tight')
    finally:
        # Always release the figure, even if saving fails, so figures do not
        # accumulate when this is called in a loop.
        plt.close(fig)

In [169]:
# Collect the saved topic files and tokenise each corpus once, up front
topicFiles = glob.glob("./results/*.csv")
wordsNYT, wordsWSJ, words = (
    split_into_words(corpus_text)
    for corpus_text in (sentencesNYT, sentencesWSJ, sentences)
)

In [187]:
# Iterate through the saved topic files and generate one word cloud per topic.
# Topic files are named '<corpus>-<num_topics>-topics.csv'. The corpus prefix
# selects which token list supplies the word frequencies.
# Only the combined corpus is rendered at the moment; to enable the per-outlet
# clouds, add 'NYT': wordsNYT and 'WSJ': wordsWSJ to this mapping.
corpus_words = {'all': words}
topics_suffix = '-topics'

for filename in topicFiles:
    # Derive the model name from the file name itself instead of fixed
    # character offsets, so any suffix length or results path works.
    model_name = Path(filename).stem
    if model_name.endswith(topics_suffix):
        model_name = model_name[:-len(topics_suffix)]
    corpus = model_name.split('-')[0]

    # Skip disabled or unrecognised corpora explicitly rather than falling
    # through with a stale (or undefined) word list.
    if corpus not in corpus_words:
        continue
    dataCloud = corpus_words[corpus]

    currTopics = pd.read_csv(filename)
    for topicIdx in range(len(currTopics)):
        currTopic = currTopics.iloc[topicIdx].values.tolist()
        currDict = retrieve_frequency_dict(currTopic, dataCloud)
        # One sub-folder per model, one image per topic index
        fig_name = model_name + '/' + str(topicIdx)
        make_wordcloud(currDict, fig_name)

## Topic distribution
This section investigates, for each topic in the model trained on the combined corpus, how the documents dominated by that topic (topic probability of at least 0.5) are split between NYT and WSJ.

In [46]:
# Per-document topic probabilities, read from the path used during training
topic_distributions = little_mallet_wrapper.load_topic_distributions(path_to_topic_distributions)

# Lookup from processed document text to the outlet it came from.
# NOTE(review): documents with identical processed text share a single key, so
# only the last source seen is kept for them.
train_with_source = {
    document: source
    for document, source in zip(training_data, combination["Source"].values.tolist())
}

In [47]:
def source_distribution_per_topic(topic_number=0, number_of_documents=5):
    '''Count the sources of the documents most strongly tied to one topic.

    topic_number: index of the topic within each row of ``topic_distributions``.
    number_of_documents: how many of the highest-probability documents to
        consider.

    Returns a collections.Counter mapping source label ('NYT' / 'WSJ') to the
    number of considered documents whose probability for the topic is at
    least 0.5.

    Documents are matched to their source by position: row i of
    ``topic_distributions`` corresponds to row i of ``combination``. Looking
    the source up by document text instead would mislabel documents whose
    processed text is identical across outlets, because a text-keyed dict
    keeps only one source per distinct text.
    '''
    doc_sources = combination["Source"].tolist()
    # Pair each document's probability for this topic with its source, then
    # rank from most to least probable.
    ranked = sorted(
        zip((distribution[topic_number] for distribution in topic_distributions), doc_sources),
        key=lambda pair: pair[0],
        reverse=True,
    )
    return Counter(
        source
        for probability, source in ranked[:number_of_documents]
        if probability >= 0.5
    )

In [48]:
# For every topic, compute the WSJ / NYT share among its dominant documents.
# A topic with no dominant document gets a share of 0 for both outlets.
topics_dict = {}
for number in range(num_topics):
    source_count = source_distribution_per_topic(number, len(combination))
    wsj_count, nyt_count = source_count["WSJ"], source_count["NYT"]
    sum_count = wsj_count + nyt_count
    if sum_count != 0:
        shares = {"WSJ": wsj_count / sum_count, "NYT": nyt_count / sum_count}
    else:
        shares = {"WSJ": 0, "NYT": 0}
    topics_dict[number] = shares


In [49]:
# One row per topic with its WSJ / NYT shares
topics_dict_df = pd.DataFrame.from_dict(topics_dict, orient='index')
# Create the output folder first so a fresh checkout does not fail on to_csv.
Path('./source_distribution').mkdir(parents=True, exist_ok=True)
# index=False drops the topic number from the file, so a topic is identified
# only by its row position in the CSV.
topics_dict_df.to_csv('./source_distribution/' + output_suffix + '-source-dist.csv', index=False)
topics_dict_df.head()




Unnamed: 0,WSJ,NYT
0,0.980392,0.019608
1,0.5,0.5
2,0.5,0.5
3,0.583333,0.416667
4,0.727273,0.272727
