In [1]:
from collections import Counter
from os import listdir
from os.path import isfile, join

import nltk.data
import pandas as pd
from nltk.tokenize import TreebankWordTokenizer
from nltk.util import bigrams
from nltk.stem import WordNetLemmatizer

# Module-level NLP helpers: Punkt sentence tokenizer, Treebank word tokenizer
# and WordNet lemmatizer.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
treebank_tokenizer = TreebankWordTokenizer()
wordnet_lemmatizer = WordNetLemmatizer()

# NLTK's English stop-word list (extended with scraping boilerplate later on).
stop_words = nltk.corpus.stopwords.words('english')

In [2]:
# Read the scraped articles: one CSV file per candidate.
bernie, biden, trump = (
    pd.read_csv(csv_path)
    for csv_path in ("Bernie_Sanders.csv", "Joe_Biden.csv", "Donald_Trump.csv")
)

In [3]:
# Preview the first rows of the Sanders frame to check the loaded columns.
bernie.head()

Unnamed: 0,title,text,media,word_count,candidate_name
0,A Sign of the Times? The Democratic Primary Ha...,Hide highlightingFull TextTranslateUndo Transl...,New York Times,1122.0,Bernie Sanders
1,"Tops in Iowa, Under Attack At Every Turn: [Nat...",Hide highlightingFull TextTranslateUndo Transl...,New York Times,1995.0,Bernie Sanders
2,Gender and War Dominate Debate by 6 Democrats:...,Hide highlightingFull TextTranslateUndo Transl...,New York Times,2154.0,Bernie Sanders
3,"Once Hawking Big-Ticket Ideas, Democrats Refoc...",Hide highlightingFull TextTranslateUndo Transl...,New York Times,1892.0,Bernie Sanders
4,Sanders Gets Endorsement Of Young Climate Grou...,Hide highlightingFull TextTranslateUndo Transl...,New York Times,1005.0,Bernie Sanders


In [4]:
# Split each candidate's articles by news outlet.
# Sanders: one frame per value of the 'media' column.
bernie_NYT = bernie[bernie['media'].eq('New York Times')]
bernie_TWP = bernie[bernie['media'].eq('The Washington Post')]
bernie_WSJ = bernie[bernie['media'].eq('Wall Street Journal')]

In [5]:
# Biden: one frame per value of the 'media' column.
biden_NYT = biden[biden['media'].eq('New York Times')]
biden_TWP = biden[biden['media'].eq('The Washington Post')]
biden_WSJ = biden[biden['media'].eq('Wall Street Journal')]

In [6]:
# Trump: one frame per value of the 'media' column.
trump_NYT = trump[trump['media'].eq('New York Times')]
trump_TWP = trump[trump['media'].eq('The Washington Post')]
trump_WSJ = trump[trump['media'].eq('Wall Street Journal')]

In [54]:
# Stack the three candidates' rows per outlet (row-wise concat, the default
# axis); the original row labels are kept as-is.
NYT = pd.concat([bernie_NYT, biden_NYT, trump_NYT])
TWP = pd.concat([bernie_TWP, biden_TWP, trump_TWP])
WSJ = pd.concat([bernie_WSJ, biden_WSJ, trump_WSJ])

In [7]:
# Extract the article bodies per candidate.
# The column is selected by name instead of by position (previously
# .iloc[:,1]), so the cell keeps working if the CSV column order changes.
bernie_text = bernie['text']

biden_text = biden['text']

trump_text = trump['text']

In [8]:
# Extract the article bodies for each candidate / news-outlet pair.
# Selected by column name rather than by position (previously .iloc[:,1]) so
# a change in column order cannot silently pick a different column.
bernie_NYT_text = bernie_NYT['text']
biden_NYT_text = biden_NYT['text']
trump_NYT_text = trump_NYT['text']

bernie_TWP_text = bernie_TWP['text']
biden_TWP_text = biden_TWP['text']
trump_TWP_text = trump_TWP['text']

bernie_WSJ_text = bernie_WSJ['text']
biden_WSJ_text = biden_WSJ['text']
trump_WSJ_text = trump_WSJ['text']

In [55]:
# Extract the article bodies for each news outlet.
# Selected by column name rather than by position (previously .iloc[:,1]).
NYT_text = NYT['text']
TWP_text = TWP['text']
WSJ_text = WSJ['text']

In [9]:
# Take a look at what is actually in the text: the first few raw article
# bodies. The output shows they start with scraped page boilerplate.
bernie_text.head()

0    Hide highlightingFull TextTranslateUndo Transl...
1    Hide highlightingFull TextTranslateUndo Transl...
2    Hide highlightingFull TextTranslateUndo Transl...
3    Hide highlightingFull TextTranslateUndo Transl...
4    Hide highlightingFull TextTranslateUndo Transl...
Name: text, dtype: object

In [10]:
# Same check for the Sanders articles from the New York Times only.
bernie_NYT_text.head()

0    Hide highlightingFull TextTranslateUndo Transl...
1    Hide highlightingFull TextTranslateUndo Transl...
2    Hide highlightingFull TextTranslateUndo Transl...
3    Hide highlightingFull TextTranslateUndo Transl...
4    Hide highlightingFull TextTranslateUndo Transl...
Name: text, dtype: object

In [11]:
# Words found at the beginning and end of each text due to scraping (page
# chrome, ProQuest disclaimer, byline/copyright residue), followed by a few
# high-frequency lemmas and the candidates' own names. Entries are lower-case
# because they are compared with lower-cased, lemmatized tokens. The list is
# kept verbatim, including its repeated entries.
remove = """
    hide highlightingfull texttranslateundo translation fromtotranslatetranslation progress missing key loadinganimation
    full text may take second translate larger document may take longer cancel
    overlayendturn search term navigationturn navigation jump first hit article
    write julie bykowicz credit julie bykowicz word count
    lessyou requested machine selected content database functionality provided solely convenience way intended replace human show
    disclaimerneither proquest licensors make representation warranty respect automatically generated available retained system
    proquest licensors specifically disclaim express implied warranty including without limitation warranty availability accuracy timeliness completeness merchantability fitness particular purpose
    use subject use restriction contained electronic product license agreement using functionality agree forgo claim
    proquest licensors use functionality output derived disclaimer
    rather keep waiting translated paragraph click button want rest
    allcopyright dow jones company right reserved
    said ha wa biden trump sander would new two one
""".split()

In [12]:
# Full exclusion list: NLTK's English stop words followed by the
# scraping-boilerplate words collected in `remove`.
stopwords = [*stop_words, *remove]

In [13]:
# function to pre-process data

def pre_process_text(text):
    """Clean the tokens of one article and return the remaining lemmas.

    Steps, applied per token: keep purely alphabetic tokens, lower-case,
    drop stop words, lemmatize with ``wordnet_lemmatizer``, drop stop words
    again.

    Stop words are filtered both before and after lemmatization. Filtering
    only afterwards lets a stop word through whenever its lemma is not itself
    in ``stopwords``; the 'ha' and 'wa' entries in ``remove`` look like
    patches for exactly that. The second pass is still needed for entries
    that are listed in lemma form (e.g. 'sander').

    Parameters
    ----------
    text : iterable of str
        Tokens of a single article. The callers in this notebook pass the
        result of ``nltk.word_tokenize``; this function does not tokenize.

    Returns
    -------
    list of str
        Lower-cased lemmas in their original order, duplicates kept.
    """
    # `stopwords` is a list; build a set once per call so each membership
    # test is constant-time instead of a scan over the whole list.
    blocked = set(stopwords)

    output = []
    for token in text:
        # take out punctuation and numbers
        if not token.isalpha():
            continue

        # convert to lower case
        word = token.lower()

        # stop words in surface form (e.g. 'was') are dropped before the
        # lemmatizer can turn them into something the list does not contain
        if word in blocked:
            continue

        # get lemma, then drop stop words listed in lemma form
        lemma = wordnet_lemmatizer.lemmatize(word)
        if lemma not in blocked:
            output.append(lemma)

    # the later cells assume a list of terms is returned
    return output

In [14]:
# One cleaned token list per article, for each candidate.
bernie_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in bernie_text]

biden_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in biden_text]

trump_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in trump_text]

In [15]:
# One cleaned token list per article, for each candidate: New York Times.
bernie_NYT_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in bernie_NYT_text]

biden_NYT_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in biden_NYT_text]

trump_NYT_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in trump_NYT_text]

In [16]:
# One cleaned token list per article, for each candidate: Washington Post.
bernie_TWP_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in bernie_TWP_text]

biden_TWP_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in biden_TWP_text]

trump_TWP_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in trump_TWP_text]

In [17]:
# One cleaned token list per article, for each candidate: Wall Street Journal.
bernie_WSJ_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in bernie_WSJ_text]

biden_WSJ_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in biden_WSJ_text]

trump_WSJ_tokens = [pre_process_text(nltk.word_tokenize(news)) for news in trump_WSJ_text]

In [56]:
# create list of tokenized articles for each news outlet
#
# NYT / TWP / WSJ are the row-wise concatenation (Sanders, Biden, Trump) of
# the per-candidate outlet frames, so their token lists are exactly the
# per-candidate-per-outlet token lists joined in that order. Reusing them
# avoids tokenizing and lemmatizing every article a second time.
# Note: the inner per-article lists are shared with the per-candidate lists,
# not copied.

NYT_tokens = bernie_NYT_tokens + biden_NYT_tokens + trump_NYT_tokens

TWP_tokens = bernie_TWP_tokens + biden_TWP_tokens + trump_TWP_tokens

WSJ_tokens = bernie_WSJ_tokens + biden_WSJ_tokens + trump_WSJ_tokens

In [18]:
# Check out the token lists of the first five Sanders articles
# (one inner list per article).
bernie_tokens[0:5]

[['hitas',
  'primary',
  'race',
  'becomes',
  'battle',
  'field',
  'suddenly',
  'seems',
  'constricting',
  'divisiveness',
  'negative',
  'attack',
  'tagged',
  'rival',
  'nickname',
  'like',
  'sleepy',
  'liddle',
  'push',
  'come',
  'shove',
  'democratic',
  'primary',
  'candidate',
  'surrogate',
  'descended',
  'level',
  'personal',
  'animus',
  'rare',
  'era',
  'president',
  'monday',
  'joseph',
  'launched',
  'digital',
  'ad',
  'south',
  'carolina',
  'saying',
  'bernie',
  'ca',
  'trusted',
  'weighing',
  'primary',
  'president',
  'barack',
  'pete',
  'buttigieg',
  'attacked',
  'victory',
  'nevada',
  'mayor',
  'bill',
  'de',
  'blasio',
  'york',
  'lectured',
  'buttigieg',
  'twitter',
  'saying',
  'smug',
  'got',
  'kicked',
  'lest',
  'last',
  'week',
  'democratic',
  'debate',
  'recede',
  'history',
  'five',
  'day',
  'ago',
  'memorable',
  'searing',
  'moment',
  'highly',
  'personal',
  'exchange',
  'buttigieg',
  'amy'

In [19]:
# Token lists of the first five Sanders articles from the Wall Street Journal.
bernie_WSJ_tokens[0:5]

[['increasingly',
  'strict',
  'measure',
  'designed',
  'slow',
  'spread',
  'coronavirus',
  'raising',
  'concern',
  'feasibility',
  'holding',
  'remaining',
  'democratic',
  'presidential',
  'primary',
  'contest',
  'well',
  'logistics',
  'holding',
  'november',
  'general',
  'election',
  'pandemic',
  'still',
  'nation',
  'election',
  'complicated',
  'thicket',
  'state',
  'federal',
  'law',
  'process',
  'selecting',
  'presidential',
  'nominee',
  'complicated',
  'fact',
  'political',
  'party',
  'private',
  'organization',
  'set',
  'internal',
  'rule',
  'depending',
  'jurisdiction',
  'state',
  'official',
  'party',
  'leader',
  'flexibility',
  'altering',
  'date',
  'possibly',
  'format',
  'remaining',
  'presidential',
  'primary',
  'contest',
  'response',
  'growing',
  'pandemic',
  'expert',
  'say',
  'five',
  'state',
  'already',
  'done',
  'ohio',
  'brief',
  'struggle',
  'governor',
  'state',
  'supreme',
  'court',
  'voti

In [57]:
# Token lists of the first five Wall Street Journal articles (all candidates).
WSJ_tokens[0:5]

[['increasingly',
  'strict',
  'measure',
  'designed',
  'slow',
  'spread',
  'coronavirus',
  'raising',
  'concern',
  'feasibility',
  'holding',
  'remaining',
  'democratic',
  'presidential',
  'primary',
  'contest',
  'well',
  'logistics',
  'holding',
  'november',
  'general',
  'election',
  'pandemic',
  'still',
  'nation',
  'election',
  'complicated',
  'thicket',
  'state',
  'federal',
  'law',
  'process',
  'selecting',
  'presidential',
  'nominee',
  'complicated',
  'fact',
  'political',
  'party',
  'private',
  'organization',
  'set',
  'internal',
  'rule',
  'depending',
  'jurisdiction',
  'state',
  'official',
  'party',
  'leader',
  'flexibility',
  'altering',
  'date',
  'possibly',
  'format',
  'remaining',
  'presidential',
  'primary',
  'contest',
  'response',
  'growing',
  'pandemic',
  'expert',
  'say',
  'five',
  'state',
  'already',
  'done',
  'ohio',
  'brief',
  'struggle',
  'governor',
  'state',
  'supreme',
  'court',
  'voti

In [20]:
# Empty word -> frequency dictionaries, one independent dict per candidate.
bernie_tokens_dict, biden_tokens_dict, trump_tokens_dict = ({} for _ in range(3))

In [21]:
# Empty word -> frequency dictionaries per candidate: New York Times.
bernie_NYT_tokens_dict, biden_NYT_tokens_dict, trump_NYT_tokens_dict = ({} for _ in range(3))

In [22]:
# Empty word -> frequency dictionaries per candidate: Washington Post.
bernie_TWP_tokens_dict, biden_TWP_tokens_dict, trump_TWP_tokens_dict = ({} for _ in range(3))

In [23]:
# Empty word -> frequency dictionaries per candidate: Wall Street Journal.
bernie_WSJ_tokens_dict, biden_WSJ_tokens_dict, trump_WSJ_tokens_dict = ({} for _ in range(3))

In [58]:
# Empty word -> frequency dictionaries, one independent dict per news outlet.
NYT_tokens_dict, TWP_tokens_dict, WSJ_tokens_dict = ({} for _ in range(3))

In [24]:
# Create dictionaries with word frequency as value
#
# Each dictionary is rebuilt from its token lists instead of being
# incremented in place, so re-running this cell yields the same counts
# rather than doubling them. Counter keeps keys in first-occurrence order,
# and dict(...) keeps the plain-dict type the later cells display.

bernie_tokens_dict = dict(Counter(word for article in bernie_tokens for word in article))

biden_tokens_dict = dict(Counter(word for article in biden_tokens for word in article))

trump_tokens_dict = dict(Counter(word for article in trump_tokens for word in article))

In [25]:
# Create dictionaries with word frequency as value: New York Times
#
# Rebuilt from scratch on every run (no in-place +=), so re-running this
# cell does not double the counts. Key order is first occurrence.

bernie_NYT_tokens_dict = dict(Counter(word for article in bernie_NYT_tokens for word in article))

biden_NYT_tokens_dict = dict(Counter(word for article in biden_NYT_tokens for word in article))

trump_NYT_tokens_dict = dict(Counter(word for article in trump_NYT_tokens for word in article))

In [26]:
# Create dictionaries with word frequency as value: Washington Post
#
# Rebuilt from scratch on every run (no in-place +=), so re-running this
# cell does not double the counts. Key order is first occurrence.

bernie_TWP_tokens_dict = dict(Counter(word for article in bernie_TWP_tokens for word in article))

biden_TWP_tokens_dict = dict(Counter(word for article in biden_TWP_tokens for word in article))

trump_TWP_tokens_dict = dict(Counter(word for article in trump_TWP_tokens for word in article))

In [27]:
# Create dictionaries with word frequency as value: Wall Street Journal
#
# Rebuilt from scratch on every run (no in-place +=), so re-running this
# cell does not double the counts. Key order is first occurrence.

bernie_WSJ_tokens_dict = dict(Counter(word for article in bernie_WSJ_tokens for word in article))

biden_WSJ_tokens_dict = dict(Counter(word for article in biden_WSJ_tokens for word in article))

trump_WSJ_tokens_dict = dict(Counter(word for article in trump_WSJ_tokens for word in article))

In [59]:
# create dictionary with word frequency value for each news outlet
#
# Rebuilt from scratch on every run (no in-place +=), so re-running this
# cell does not double the counts. Key order is first occurrence.
NYT_tokens_dict = dict(Counter(word for article in NYT_tokens for word in article))

TWP_tokens_dict = dict(Counter(word for article in TWP_tokens for word in article))

WSJ_tokens_dict = dict(Counter(word for article in WSJ_tokens for word in article))

In [28]:
# Take a look at the Sanders word -> frequency dictionary.
# NOTE(review): this displays the entire dictionary; consider showing only a
# sample to keep the notebook output small.
bernie_tokens_dict

{'hitas': 5,
 'primary': 1390,
 'race': 1068,
 'becomes': 42,
 'battle': 94,
 'field': 313,
 'suddenly': 26,
 'seems': 85,
 'constricting': 2,
 'divisiveness': 10,
 'negative': 61,
 'attack': 348,
 'tagged': 3,
 'rival': 358,
 'nickname': 6,
 'like': 950,
 'sleepy': 9,
 'liddle': 1,
 'push': 84,
 'come': 346,
 'shove': 5,
 'democratic': 2661,
 'candidate': 2284,
 'surrogate': 98,
 'descended': 8,
 'level': 113,
 'personal': 111,
 'animus': 2,
 'rare': 16,
 'era': 47,
 'president': 2156,
 'monday': 315,
 'joseph': 186,
 'launched': 33,
 'digital': 78,
 'ad': 355,
 'south': 730,
 'carolina': 521,
 'saying': 271,
 'bernie': 1193,
 'ca': 170,
 'trusted': 15,
 'weighing': 21,
 'barack': 138,
 'pete': 338,
 'buttigieg': 1118,
 'attacked': 49,
 'victory': 343,
 'nevada': 483,
 'mayor': 565,
 'bill': 157,
 'de': 148,
 'blasio': 7,
 'york': 736,
 'lectured': 1,
 'twitter': 163,
 'smug': 1,
 'got': 223,
 'kicked': 7,
 'lest': 1,
 'last': 749,
 'week': 817,
 'debate': 777,
 'recede': 1,
 'history

In [29]:
# Word -> frequency dictionary for Sanders articles in the Wall Street Journal.
# NOTE(review): displays the entire dictionary.
bernie_WSJ_tokens_dict

{'increasingly': 20,
 'strict': 3,
 'measure': 16,
 'designed': 7,
 'slow': 3,
 'spread': 16,
 'coronavirus': 93,
 'raising': 9,
 'concern': 76,
 'feasibility': 6,
 'holding': 32,
 'remaining': 12,
 'democratic': 962,
 'presidential': 258,
 'primary': 488,
 'contest': 191,
 'well': 123,
 'logistics': 4,
 'november': 95,
 'general': 126,
 'election': 487,
 'pandemic': 38,
 'still': 96,
 'nation': 30,
 'complicated': 18,
 'thicket': 2,
 'state': 777,
 'federal': 76,
 'law': 34,
 'process': 66,
 'selecting': 3,
 'nominee': 111,
 'fact': 15,
 'political': 176,
 'party': 610,
 'private': 32,
 'organization': 17,
 'set': 47,
 'internal': 10,
 'rule': 54,
 'depending': 5,
 'jurisdiction': 4,
 'official': 134,
 'leader': 85,
 'flexibility': 2,
 'altering': 2,
 'date': 17,
 'possibly': 7,
 'format': 2,
 'response': 31,
 'growing': 24,
 'expert': 36,
 'say': 220,
 'five': 35,
 'already': 57,
 'done': 30,
 'ohio': 41,
 'brief': 5,
 'struggle': 14,
 'governor': 33,
 'supreme': 16,
 'court': 34,
 '

In [60]:
# Word -> frequency dictionary for all Washington Post articles.
# NOTE(review): displays the entire dictionary.
TWP_tokens_dict

{'democrat': 1914,
 'michigan': 156,
 'scrolled': 2,
 'surge': 40,
 'nasty': 11,
 'facebook': 277,
 'meme': 44,
 'elizabeth': 246,
 'warren': 920,
 'last': 735,
 'week': 797,
 'fixing': 2,
 'captured': 10,
 'growing': 92,
 'dislike': 9,
 'candidate': 1774,
 'depicted': 7,
 'smiling': 4,
 'face': 260,
 'mask': 17,
 'behind': 241,
 'hillary': 214,
 'clinton': 487,
 'matt': 45,
 'walter': 37,
 'retired': 61,
 'factory': 10,
 'worker': 110,
 'supporter': 506,
 'bernie': 522,
 'shared': 54,
 'image': 66,
 'onward': 2,
 'eight': 55,
 'group': 502,
 'ten': 31,
 'thousand': 67,
 'potential': 193,
 'eyeball': 5,
 'tap': 12,
 'smartphone': 10,
 'popular': 88,
 'technique': 7,
 'allows': 30,
 'ordinary': 12,
 'american': 921,
 'operate': 15,
 'speed': 10,
 'reminiscent': 7,
 'russian': 196,
 'bot': 3,
 'troll': 8,
 'small': 121,
 'contributed': 202,
 'massive': 89,
 'wave': 35,
 'hostile': 13,
 'democratic': 1870,
 'rival': 266,
 'reflects': 33,
 'rising': 60,
 'divisiveness': 8,
 'party': 1549,


In [30]:
# Turn each candidate's word counts into a (word, frequency) frame,
# ordered by descending frequency and cut to the 40 most frequent words.
pd_bernie_tokens = (
    pd.DataFrame(list(bernie_tokens_dict.items()), columns=['bernie_word', 'bernie_freq'])
    .sort_values(by="bernie_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_biden_tokens = (
    pd.DataFrame(list(biden_tokens_dict.items()), columns=['biden_word', 'biden_freq'])
    .sort_values(by="biden_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_trump_tokens = (
    pd.DataFrame(list(trump_tokens_dict.items()), columns=['trump_word', 'trump_freq'])
    .sort_values(by="trump_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

In [31]:
# Top-40 (word, frequency) frames per candidate: New York Times.
pd_bernie_NYT_tokens = (
    pd.DataFrame(list(bernie_NYT_tokens_dict.items()), columns=['bernie_NYT_word', 'bernie_NYT_freq'])
    .sort_values(by="bernie_NYT_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_biden_NYT_tokens = (
    pd.DataFrame(list(biden_NYT_tokens_dict.items()), columns=['biden_NYT_word', 'biden_NYT_freq'])
    .sort_values(by="biden_NYT_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_trump_NYT_tokens = (
    pd.DataFrame(list(trump_NYT_tokens_dict.items()), columns=['trump_NYT_word', 'trump_NYT_freq'])
    .sort_values(by="trump_NYT_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

In [32]:
# Top-40 (word, frequency) frames per candidate: Washington Post.
pd_bernie_TWP_tokens = (
    pd.DataFrame(list(bernie_TWP_tokens_dict.items()), columns=['bernie_TWP_word', 'bernie_TWP_freq'])
    .sort_values(by="bernie_TWP_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_biden_TWP_tokens = (
    pd.DataFrame(list(biden_TWP_tokens_dict.items()), columns=['biden_TWP_word', 'biden_TWP_freq'])
    .sort_values(by="biden_TWP_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_trump_TWP_tokens = (
    pd.DataFrame(list(trump_TWP_tokens_dict.items()), columns=['trump_TWP_word', 'trump_TWP_freq'])
    .sort_values(by="trump_TWP_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

In [33]:
# Top-40 (word, frequency) frames per candidate: Wall Street Journal.
pd_bernie_WSJ_tokens = (
    pd.DataFrame(list(bernie_WSJ_tokens_dict.items()), columns=['bernie_WSJ_word', 'bernie_WSJ_freq'])
    .sort_values(by="bernie_WSJ_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_biden_WSJ_tokens = (
    pd.DataFrame(list(biden_WSJ_tokens_dict.items()), columns=['biden_WSJ_word', 'biden_WSJ_freq'])
    .sort_values(by="biden_WSJ_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_trump_WSJ_tokens = (
    pd.DataFrame(list(trump_WSJ_tokens_dict.items()), columns=['trump_WSJ_word', 'trump_WSJ_freq'])
    .sort_values(by="trump_WSJ_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

In [61]:
# Top-40 (word, frequency) frames, one per news outlet.

pd_NYT_tokens = (
    pd.DataFrame(list(NYT_tokens_dict.items()), columns=['NYT_word', 'NYT_freq'])
    .sort_values(by="NYT_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_TWP_tokens = (
    pd.DataFrame(list(TWP_tokens_dict.items()), columns=['TWP_word', 'TWP_freq'])
    .sort_values(by="TWP_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_WSJ_tokens = (
    pd.DataFrame(list(WSJ_tokens_dict.items()), columns=['WSJ_word', 'WSJ_freq'])
    .sort_values(by="WSJ_freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)


In [37]:
# Put the three candidates' top-word tables side by side and show the top 20.
candidate_word_freq = pd.concat(
    [pd_bernie_tokens, pd_biden_tokens, pd_trump_tokens],
    axis="columns",
)
candidate_word_freq.head(20)

Unnamed: 0,bernie_word,bernie_freq,biden_word,biden_freq,trump_word,trump_freq
0,campaign,2808,campaign,3178,president,5547
1,democratic,2661,president,3008,house,2475
2,state,2465,state,2457,republican,2003
3,candidate,2284,democratic,2193,impeachment,1774
4,president,2156,voter,2083,democrat,1715
5,voter,2045,candidate,1919,campaign,1672
6,party,1859,democrat,1717,state,1499
7,democrat,1740,former,1473,senate,1477
8,iowa,1422,election,1453,white,1407
9,former,1411,time,1339,time,1333


In [38]:
# Sanders: top 20 words per outlet, side by side (NYT, TWP, WSJ).
bernie_news = pd.concat(
    [
        pd_bernie_NYT_tokens.head(20),
        pd_bernie_TWP_tokens.head(20),
        pd_bernie_WSJ_tokens.head(20),
    ],
    axis="columns",
)
bernie_news

Unnamed: 0,bernie_NYT_word,bernie_NYT_freq,bernie_TWP_word,bernie_TWP_freq,bernie_WSJ_word,bernie_WSJ_freq
0,campaign,1134,campaign,974,democratic,962
1,state,949,candidate,826,state,777
2,democratic,901,democratic,798,campaign,700
3,candidate,861,president,780,party,610
4,voter,845,state,739,candidate,597
5,president,797,party,700,voter,591
6,time,741,democrat,678,president,579
7,iowa,622,voter,609,primary,488
8,warren,607,people,477,election,487
9,primary,580,warren,474,democrat,485


In [43]:
# Biden: top 20 words per outlet, side by side (NYT, TWP, WSJ).
biden_news = pd.concat(
    [
        pd_biden_NYT_tokens.head(20),
        pd_biden_TWP_tokens.head(20),
        pd_biden_WSJ_tokens.head(20),
    ],
    axis="columns",
)
biden_news

Unnamed: 0,biden_NYT_word,biden_NYT_freq,biden_TWP_word,biden_TWP_freq,biden_WSJ_word,biden_WSJ_freq
0,campaign,1190,campaign,1137,president,855
1,president,1046,president,1107,campaign,851
2,voter,907,state,862,democratic,711
3,state,890,candidate,727,state,705
4,democratic,766,democratic,716,voter,508
5,time,744,democrat,683,democrat,498
6,candidate,735,voter,668,election,482
7,senator,580,election,613,candidate,457
8,primary,544,former,540,former,408
9,democrat,536,party,521,primary,397


In [39]:
# Trump: top 20 words per outlet, side by side (NYT, TWP, WSJ).
trump_news = pd.concat(
    [
        pd_trump_NYT_tokens.head(20),
        pd_trump_TWP_tokens.head(20),
        pd_trump_WSJ_tokens.head(20),
    ],
    axis="columns",
)
trump_news

Unnamed: 0,trump_NYT_word,trump_NYT_freq,trump_TWP_word,trump_TWP_freq,trump_WSJ_word,trump_WSJ_freq
0,president,2143,president,1916,president,1488
1,republican,870,house,868,house,773
2,house,834,republican,591,impeachment,686
3,time,730,campaign,585,democrat,592
4,impeachment,595,official,582,republican,542
5,campaign,590,white,559,senate,500
6,democrat,570,democrat,553,campaign,497
7,state,562,state,533,democratic,444
8,senator,524,senate,506,trial,416
9,york,492,impeachment,493,state,404


In [62]:
# Put the three outlets' top-word tables side by side and show the top 20.
news_word_freq = pd.concat(
    [pd_NYT_tokens, pd_TWP_tokens, pd_WSJ_tokens],
    axis="columns",
)
news_word_freq.head(20)

Unnamed: 0,NYT_word,NYT_freq,TWP_word,TWP_freq,WSJ_word,WSJ_freq
0,president,3986,president,3803,president,2922
1,campaign,2914,campaign,2696,democratic,2117
2,state,2401,state,2134,campaign,2048
3,time,2215,democrat,1914,state,1886
4,democratic,2035,democratic,1870,democrat,1575
5,voter,2014,candidate,1774,election,1355
6,candidate,1801,house,1663,voter,1320
7,democrat,1683,voter,1588,house,1295
8,senator,1639,party,1549,candidate,1242
9,york,1498,republican,1463,party,1233


In [40]:
# Words that occur in one candidate's articles and in neither of the other
# two candidates' articles, with their frequency. The aim is to surface
# jargon or topics tied to a single candidate.

bernie_only = {
    word: count
    for word, count in bernie_tokens_dict.items()
    if word not in biden_tokens_dict and word not in trump_tokens_dict
}

biden_only = {
    word: count
    for word, count in biden_tokens_dict.items()
    if word not in bernie_tokens_dict and word not in trump_tokens_dict
}

trump_only = {
    word: count
    for word, count in trump_tokens_dict.items()
    if word not in biden_tokens_dict and word not in bernie_tokens_dict
}
     
     

In [41]:
# Top-40 frames of the candidate-exclusive words, by descending frequency.
pd_bernie_only = (
    pd.DataFrame(list(bernie_only.items()), columns=['bernie_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_biden_only = (
    pd.DataFrame(list(biden_only.items()), columns=['biden_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_trump_only = (
    pd.DataFrame(list(trump_only.items()), columns=['trump_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

In [63]:
# Candidate-exclusive words side by side. Author's verdict on the result:
# these are all garbage words.
only_candidates = pd.concat(
    [pd_bernie_only, pd_biden_only, pd_trump_only],
    axis="columns",
)
only_candidates.head()

Unnamed: 0,bernie_word_only,freq,biden_word_only,freq.1,trump_word_only,freq.2
0,flav,57,navin,26,modi,109
1,uygur,26,amana,20,klitschko,50
2,gadsden,25,jotzke,18,clemency,44
3,wilkie,24,zuberi,17,abinader,32
4,mfume,20,torner,17,richter,29


In [43]:
# For each candidate, words that occur in one news outlet's articles and in
# neither of the other two outlets' articles, with their frequency. The aim
# is to surface jargon or topics tied to a single outlet.

# SANDERS
bernie_NYT_only = {
    word: count
    for word, count in bernie_NYT_tokens_dict.items()
    if word not in bernie_TWP_tokens_dict and word not in bernie_WSJ_tokens_dict
}
bernie_TWP_only = {
    word: count
    for word, count in bernie_TWP_tokens_dict.items()
    if word not in bernie_NYT_tokens_dict and word not in bernie_WSJ_tokens_dict
}
bernie_WSJ_only = {
    word: count
    for word, count in bernie_WSJ_tokens_dict.items()
    if word not in bernie_TWP_tokens_dict and word not in bernie_NYT_tokens_dict
}

# BIDEN
biden_NYT_only = {
    word: count
    for word, count in biden_NYT_tokens_dict.items()
    if word not in biden_TWP_tokens_dict and word not in biden_WSJ_tokens_dict
}
biden_TWP_only = {
    word: count
    for word, count in biden_TWP_tokens_dict.items()
    if word not in biden_NYT_tokens_dict and word not in biden_WSJ_tokens_dict
}
biden_WSJ_only = {
    word: count
    for word, count in biden_WSJ_tokens_dict.items()
    if word not in biden_TWP_tokens_dict and word not in biden_NYT_tokens_dict
}

# TRUMP
trump_NYT_only = {
    word: count
    for word, count in trump_NYT_tokens_dict.items()
    if word not in trump_TWP_tokens_dict and word not in trump_WSJ_tokens_dict
}
trump_TWP_only = {
    word: count
    for word, count in trump_TWP_tokens_dict.items()
    if word not in trump_NYT_tokens_dict and word not in trump_WSJ_tokens_dict
}
trump_WSJ_only = {
    word: count
    for word, count in trump_WSJ_tokens_dict.items()
    if word not in trump_TWP_tokens_dict and word not in trump_NYT_tokens_dict
}

In [48]:
# Sanders: top-40 frames of the outlet-exclusive words, by descending frequency.
pd_bernie_NYT_only = (
    pd.DataFrame(list(bernie_NYT_only.items()), columns=['bernie_NYT_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_bernie_TWP_only = (
    pd.DataFrame(list(bernie_TWP_only.items()), columns=['bernie_TWP_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_bernie_WSJ_only = (
    pd.DataFrame(list(bernie_WSJ_only.items()), columns=['bernie_WSJ_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

In [49]:
# Sanders: outlet-exclusive words side by side (NYT, TWP, WSJ).
bernie_news_only = pd.concat(
    [pd_bernie_NYT_only, pd_bernie_TWP_only, pd_bernie_WSJ_only],
    axis="columns",
)
bernie_news_only.head()

Unnamed: 0,bernie_NYT_word_only,freq,bernie_TWP_word_only,freq.1,bernie_WSJ_word_only,freq.2
0,photograph,180,wp,150,eliza,72
1,erin,34,dawson,51,flav,57
2,hitthe,33,mcdowell,31,sabrina,36
3,grenell,28,evangelicals,30,jamerson,34
4,ember,27,tzintzãºn,28,siddiqui,34


In [44]:
# Biden: top-40 frames of the outlet-exclusive words, by descending frequency.
pd_biden_NYT_only = (
    pd.DataFrame(list(biden_NYT_only.items()), columns=['biden_NYT_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_biden_TWP_only = (
    pd.DataFrame(list(biden_TWP_only.items()), columns=['biden_TWP_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_biden_WSJ_only = (
    pd.DataFrame(list(biden_WSJ_only.items()), columns=['biden_WSJ_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

In [46]:
# Biden: outlet-exclusive words side by side (NYT, TWP, WSJ).
biden_news_only = pd.concat(
    [pd_biden_NYT_only, pd_biden_TWP_only, pd_biden_WSJ_only],
    axis="columns",
)
biden_news_only.head()

Unnamed: 0,biden_NYT_word_only,freq,biden_TWP_word_only,freq.1,biden_WSJ_word_only,freq.2
0,hitthe,29,wp,150,eliza,48
1,glover,25,firtash,63,siddiqui,41
2,glueck,24,doocy,33,sabrina,41
3,onpolitics,21,mcdowell,31,parti,38
4,paso,20,navin,26,tarini,38


In [50]:
# Trump: top-40 frames of the outlet-exclusive words, by descending frequency.
pd_trump_NYT_only = (
    pd.DataFrame(list(trump_NYT_only.items()), columns=['trump_NYT_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_trump_TWP_only = (
    pd.DataFrame(list(trump_TWP_only.items()), columns=['trump_TWP_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_trump_WSJ_only = (
    pd.DataFrame(list(trump_WSJ_only.items()), columns=['trump_WSJ_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

In [51]:
# Trump: outlet-exclusive words side by side (NYT, TWP, WSJ).
trump_news_only = pd.concat(
    [pd_trump_NYT_only, pd_trump_TWP_only, pd_trump_WSJ_only],
    axis="columns",
)
trump_news_only.head()

Unnamed: 0,trump_NYT_word_only,freq,trump_TWP_word_only,freq.1,trump_WSJ_word_only,freq.2
0,suleimani,61,wp,150,ballhaus,60
1,hitthe,41,klitschko,50,lucey,55
2,erin,35,abinader,32,restuccia,29
3,richter,29,assange,22,lindsay,27
4,anna,28,dominican,20,leary,26


In [64]:
# Words that occur in one news outlet's articles (all candidates combined)
# and in neither of the other two outlets' articles, with their frequency.
# The aim is to surface jargon or topics tied to a single outlet.

NYT_only = {
    word: count
    for word, count in NYT_tokens_dict.items()
    if word not in WSJ_tokens_dict and word not in TWP_tokens_dict
}

TWP_only = {
    word: count
    for word, count in TWP_tokens_dict.items()
    if word not in NYT_tokens_dict and word not in WSJ_tokens_dict
}

WSJ_only = {
    word: count
    for word, count in WSJ_tokens_dict.items()
    if word not in TWP_tokens_dict and word not in NYT_tokens_dict
}
     


In [65]:
# News outlets: top-40 frames of the outlet-exclusive words, by descending
# frequency.
pd_NYT_only = (
    pd.DataFrame(list(NYT_only.items()), columns=['NYT_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_TWP_only = (
    pd.DataFrame(list(TWP_only.items()), columns=['TWP_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

pd_WSJ_only = (
    pd.DataFrame(list(WSJ_only.items()), columns=['WSJ_word_only', 'freq'])
    .sort_values(by="freq", ascending=False)
    .reset_index(drop=True)
    .head(40)
)

In [66]:
# Outlet-exclusive words side by side (NYT, TWP, WSJ).
news_only = pd.concat(
    [pd_NYT_only, pd_TWP_only, pd_WSJ_only],
    axis="columns",
)
news_only

Unnamed: 0,NYT_word_only,freq,TWP_word_only,freq.1,WSJ_word_only,freq.2
0,hitthe,103,wp,450,eliza,139
1,suleimani,90,firtash,65,wsj,90
2,ember,49,mcdowell,62,lucey,88
3,glueck,43,dawson,52,sabrina,88
4,haberman,37,klitschko,50,siddiqui,86
5,qassim,36,doocy,34,ballhaus,86
6,sydney,32,abinader,32,parti,80
7,richter,29,tzintzãºn,29,tarini,80
8,onpolitics,29,wagner,27,jamerson,76
9,corasaniti,29,navin,26,flav,57
