In [1]:
import pandas as pd
import nltk.data
from os import listdir
from os.path import isfile, join
from nltk.util import bigrams 
from nltk.tokenize import TreebankWordTokenizer
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
treebank_tokenizer = TreebankWordTokenizer()
stop_words = nltk.corpus.stopwords.words('english')
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

In [6]:
# load in each candidate data
bernie = pd.read_csv("./Data/All_Candidates/Bernie_Sanders.csv")
biden = pd.read_csv("./Data/All_Candidates/Joe_Biden.csv")
trump = pd.read_csv("./Data/All_Candidates/Donald_Trump.csv")

In [7]:
bernie.head()

Unnamed: 0,title,text,media,word_count,candidate_name
0,A Sign of the Times? The Democratic Primary Ha...,Hide highlightingFull TextTranslateUndo Transl...,New York Times,1122.0,Bernie Sanders
1,"Tops in Iowa, Under Attack At Every Turn: [Nat...",Hide highlightingFull TextTranslateUndo Transl...,New York Times,1995.0,Bernie Sanders
2,Gender and War Dominate Debate by 6 Democrats:...,Hide highlightingFull TextTranslateUndo Transl...,New York Times,2154.0,Bernie Sanders
3,"Once Hawking Big-Ticket Ideas, Democrats Refoc...",Hide highlightingFull TextTranslateUndo Transl...,New York Times,1892.0,Bernie Sanders
4,Sanders Gets Endorsement Of Young Climate Grou...,Hide highlightingFull TextTranslateUndo Transl...,New York Times,1005.0,Bernie Sanders


In [8]:
# extract text
bernie_text = bernie.iloc[:,1]

biden_text = biden.iloc[:,1]

trump_text = trump.iloc[:,1]

In [9]:
# take a look at what is actually in the text
bernie_text.head()

0    Hide highlightingFull TextTranslateUndo Transl...
1    Hide highlightingFull TextTranslateUndo Transl...
2    Hide highlightingFull TextTranslateUndo Transl...
3    Hide highlightingFull TextTranslateUndo Transl...
4    Hide highlightingFull TextTranslateUndo Transl...
Name: text, dtype: object

In [10]:
# words I have found in the beggining & end of each text due to scraping
remove = ['hide','highlightingfull', 'texttranslateundo', 'translation', 'fromtotranslatetranslation', 'progress', 'missing', 'key',
 'loadinganimation', 'full', 'text', 'may', 'take', 'second', 'translate', 'larger', 'document', 'may', 'take', 'longer', 'cancel',
 'overlayendturn', 'search', 'term', 'navigationturn', 'navigation', 'jump', 'first', 'hit','article', 'write', 'julie', 'bykowicz',
 'credit', 'julie', 'bykowicz', 'word', 'count', 'lessyou', 'requested', 'machine', 'selected', 'content', 'database', 'functionality',
 'provided', 'solely', 'convenience', 'way', 'intended', 'replace', 'human', 'show', 'disclaimerneither', 'proquest', 'licensors',
 'make', 'representation', 'warranty', 'respect', 'automatically', 'generated', 'available', 'retained', 'system', 'proquest',
 'licensors', 'specifically', 'disclaim', 'express', 'implied', 'warranty', 'including', 'without', 'limitation', 'warranty', 'availability',
 'accuracy', 'timeliness', 'completeness', 'merchantability', 'fitness', 'particular', 'purpose', 'use', 'subject', 'use', 'restriction',
 'contained', 'electronic', 'product', 'license', 'agreement', 'using', 'functionality', 'agree', 'forgo', 'claim', 'proquest', 'licensors',
 'use', 'functionality', 'output' 'derived', 'disclaimer','rather','keep', 'waiting','translated', 'paragraph', 'click', 'button', 
 'want', 'rest', 'allcopyright', 'dow', 'jones', 'company', 'right', 'reserved', 'said', 'ha', 'wa', 'biden', 'trump', 'sander',
'would','new', 'two', 'one']

In [11]:
# include removed words in stop words
stopwords = stop_words + remove

In [12]:
# function to pre-process data

def pre_process_text(text):

    # split into tokens
    #tokens = treebank_tokenizer.tokenize(text)
    
    # take out puncutation and numbers
    words = [word for word in text if word.isalpha()]
    
    # convert to lower case
    words_l = [word.lower() for word in words]
    
    # get lemmas
    lemmatized_words = [wordnet_lemmatizer.lemmatize(word) for word in words_l]
    
    # taking out stop_words + remove
    output = [w for w in lemmatized_words if w not in stopwords]
    
    # the later function assumes you are returning a list of terms
    return output

In [14]:
# create list of tokenized articles for each candidate
import nltk
#nltk.download('wordnet')

bernie_tokens = []
for news in bernie_text[:]:
    content = nltk.word_tokenize(news)
    bernie_tokens.append(pre_process_text(content))

biden_tokens = []
for news in biden_text[:]:
    content = nltk.word_tokenize(news)
    biden_tokens.append(pre_process_text(content))

trump_tokens = []
for news in trump_text[:]:
    content = nltk.word_tokenize(news)
    trump_tokens.append(pre_process_text(content))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


In [15]:
# check out list of token articles
bernie_tokens[0:5]

[['hitas',
  'primary',
  'race',
  'becomes',
  'battle',
  'field',
  'suddenly',
  'seems',
  'constricting',
  'divisiveness',
  'negative',
  'attack',
  'tagged',
  'rival',
  'nickname',
  'like',
  'sleepy',
  'liddle',
  'push',
  'come',
  'shove',
  'democratic',
  'primary',
  'candidate',
  'surrogate',
  'descended',
  'level',
  'personal',
  'animus',
  'rare',
  'era',
  'president',
  'monday',
  'joseph',
  'launched',
  'digital',
  'ad',
  'south',
  'carolina',
  'saying',
  'bernie',
  'ca',
  'trusted',
  'weighing',
  'primary',
  'president',
  'barack',
  'pete',
  'buttigieg',
  'attacked',
  'victory',
  'nevada',
  'mayor',
  'bill',
  'de',
  'blasio',
  'york',
  'lectured',
  'buttigieg',
  'twitter',
  'saying',
  'smug',
  'got',
  'kicked',
  'lest',
  'last',
  'week',
  'democratic',
  'debate',
  'recede',
  'history',
  'five',
  'day',
  'ago',
  'memorable',
  'searing',
  'moment',
  'highly',
  'personal',
  'exchange',
  'buttigieg',
  'amy'

In [16]:
# create empty dictionaries for each canididate

bernie_tokens_dict = {}
biden_tokens_dict = {}
trump_tokens_dict = {}

In [17]:
# Create dictionaries with word frequency as value

for article in bernie_tokens:
    for words in article:
        if words in bernie_tokens_dict.keys():
            bernie_tokens_dict[words] += 1
        else:
            bernie_tokens_dict[words] = 1
        
for article in biden_tokens:
    for words in article:
        if words in biden_tokens_dict.keys():
            biden_tokens_dict[words] += 1
        else:
            biden_tokens_dict[words] = 1

for article in trump_tokens:
    for words in article:
        if words in trump_tokens_dict.keys():
            trump_tokens_dict[words] += 1
        else:
            trump_tokens_dict[words] = 1

In [18]:
# take a look at dictionary
bernie_tokens_dict

{'hitas': 5,
 'primary': 1390,
 'race': 1068,
 'becomes': 42,
 'battle': 94,
 'field': 313,
 'suddenly': 26,
 'seems': 85,
 'constricting': 2,
 'divisiveness': 10,
 'negative': 61,
 'attack': 348,
 'tagged': 3,
 'rival': 358,
 'nickname': 6,
 'like': 950,
 'sleepy': 9,
 'liddle': 1,
 'push': 84,
 'come': 346,
 'shove': 5,
 'democratic': 2661,
 'candidate': 2284,
 'surrogate': 98,
 'descended': 8,
 'level': 113,
 'personal': 111,
 'animus': 2,
 'rare': 16,
 'era': 47,
 'president': 2156,
 'monday': 315,
 'joseph': 186,
 'launched': 33,
 'digital': 78,
 'ad': 355,
 'south': 730,
 'carolina': 521,
 'saying': 271,
 'bernie': 1193,
 'ca': 170,
 'trusted': 15,
 'weighing': 21,
 'barack': 138,
 'pete': 338,
 'buttigieg': 1118,
 'attacked': 49,
 'victory': 343,
 'nevada': 483,
 'mayor': 565,
 'bill': 157,
 'de': 148,
 'blasio': 7,
 'york': 736,
 'lectured': 1,
 'twitter': 163,
 'smug': 1,
 'got': 223,
 'kicked': 7,
 'lest': 1,
 'last': 749,
 'week': 817,
 'debate': 777,
 'recede': 1,
 'history

In [50]:
# create dataframe with word and frequency as col
# sort them by frequency
pd_bernie_tokens=pd.DataFrame(list(bernie_tokens_dict.items()), columns=['bernie_word', 'bernie_freq'])
pd_bernie_tokens=pd_bernie_tokens.sort_values(by="bernie_freq" , ascending=False).reset_index(drop=True)

pd_biden_tokens=pd.DataFrame(list(biden_tokens_dict.items()), columns=['biden_word', 'biden_freq'])
pd_biden_tokens=pd_biden_tokens.sort_values(by="biden_freq" , ascending=False).reset_index(drop=True)

pd_trump_tokens=pd.DataFrame(list(trump_tokens_dict.items()), columns=['trump_word', 'trump_freq'])
pd_trump_tokens=pd_trump_tokens.sort_values(by="trump_freq" , ascending=False).reset_index(drop=True)

In [51]:
pd_bernie_tokens.head(20)
pd_biden_tokens.head(20)
pd_trump_tokens.head(20)

Unnamed: 0,trump_word,trump_freq
0,president,5547
1,house,2475
2,republican,2003
3,impeachment,1774
4,democrat,1715
5,campaign,1672
6,state,1499
7,senate,1477
8,white,1407
9,time,1333


In [101]:
# See how only threads in each candidates
print(f'There are {len(pd_bernie_tokens)} threads')
print(f'There are {len(pd_biden_tokens)} threads')
print(f'There are {len(pd_trump_tokens)} threads')

There are 16029 threads
There are 16526 threads
There are 16856 threads


In [32]:
# Doing Sentiment Analysis for each candidates
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...


In [69]:
# Test each token as positive, negative or neutral, and generate a score
# Bernie first
bernie_table = []
for i in pd_bernie_tokens['bernie_word']:
    bernie_scores = {}
    bernie_scores['sia_positive'] = sia.polarity_scores(i)['pos']
    bernie_scores['sia_negative'] = sia.polarity_scores(i)['neg']
    bernie_scores['sia_neutral'] = sia.polarity_scores(i)['neu']
    bernie_scores['sia_compound'] = sia.polarity_scores(i)['compound']
    
    bernie_table.append(bernie_scores)
    
#print(sia_table)

In [70]:
# See the results
bernie_table = pd.DataFrame(bernie_table)
bernie_table=bernie_table.sort_values(by='sia_compound' , ascending=False).reset_index(drop=True)

print(bernie_table)

       sia_positive  sia_negative  sia_neutral  sia_compound
0               1.0           0.0          0.0        0.6486
1               1.0           0.0          0.0        0.6369
2               1.0           0.0          0.0        0.6369
3               1.0           0.0          0.0        0.6369
4               1.0           0.0          0.0        0.6369
...             ...           ...          ...           ...
16024           0.0           1.0          0.0       -0.6908
16025           0.0           1.0          0.0       -0.6908
16026           0.0           1.0          0.0       -0.6908
16027           0.0           1.0          0.0       -0.6908
16028           0.0           1.0          0.0       -0.7096

[16029 rows x 4 columns]


In [91]:
# See what kinds of sentiment appear more
bernie_table[bernie_table > 0].count()

sia_positive      878
sia_negative     1139
sia_neutral     13999
sia_compound      878
dtype: int64

In [85]:
bernie_table[bernie_table == 0].count()

sia_positive    15151
sia_negative    14890
sia_neutral      2030
sia_compound    14012
dtype: int64

In [87]:
bernie_table[bernie_table < 0].count()

sia_positive       0
sia_negative       0
sia_neutral        0
sia_compound    1139
dtype: int64

In [None]:
# Bernie has 7.1% of negative sentiments, 5.5 % of positive sentiments, and 87.4% of neutral sentiments.

In [67]:
# Biden
biden_table = []
for i in pd_biden_tokens['biden_word']:
    biden_scores = {}
    biden_scores['sia_positive'] = sia.polarity_scores(i)['pos']
    biden_scores['sia_negative'] = sia.polarity_scores(i)['neg']
    biden_scores['sia_neutral'] = sia.polarity_scores(i)['neu']
    biden_scores['sia_compound'] = sia.polarity_scores(i)['compound']
    
    biden_table.append(biden_scores)
    
biden_table = pd.DataFrame(biden_table)
biden_table=biden_table.sort_values(by='sia_compound' , ascending=False).reset_index(drop=True)
print(biden_table)

       sia_positive  sia_negative  sia_neutral  sia_compound
0               1.0           0.0          0.0        0.6486
1               1.0           0.0          0.0        0.6369
2               1.0           0.0          0.0        0.6369
3               1.0           0.0          0.0        0.6369
4               1.0           0.0          0.0        0.6369
...             ...           ...          ...           ...
16521           0.0           1.0          0.0       -0.6908
16522           0.0           1.0          0.0       -0.6908
16523           0.0           1.0          0.0       -0.7003
16524           0.0           1.0          0.0       -0.7003
16525           0.0           1.0          0.0       -0.7096

[16526 rows x 4 columns]


In [92]:
biden_table[biden_table > 0].count()

sia_positive      895
sia_negative     1177
sia_neutral     14439
sia_compound      895
dtype: int64

In [94]:
biden_table[biden_table == 0].count()

sia_positive    15631
sia_negative    15349
sia_neutral      2087
sia_compound    14454
dtype: int64

In [95]:
biden_table[biden_table < 0].count()

sia_positive       0
sia_negative       0
sia_neutral        0
sia_compound    1177
dtype: int64

In [None]:
# Biden has 7.1% of negative sentiments, 5.4 % of positive sentiments, and 87.5% of neutral sentiments. 

In [68]:
# Trump
trump_table = []
for i in pd_trump_tokens['trump_word']:
    trump_scores = {}
    trump_scores['sia_positive'] = sia.polarity_scores(i)['pos']
    trump_scores['sia_negative'] = sia.polarity_scores(i)['neg']
    trump_scores['sia_neutral'] = sia.polarity_scores(i)['neu']
    trump_scores['sia_compound'] = sia.polarity_scores(i)['compound']
    
    trump_table.append(trump_scores)
    
trump_table = pd.DataFrame(trump_table)
trump_table=trump_table.sort_values(by='sia_compound' , ascending=False).reset_index(drop=True)
print(trump_table)

       sia_positive  sia_negative  sia_neutral  sia_compound
0               1.0           0.0          0.0        0.6369
1               1.0           0.0          0.0        0.6369
2               1.0           0.0          0.0        0.6369
3               1.0           0.0          0.0        0.6369
4               1.0           0.0          0.0        0.6369
...             ...           ...          ...           ...
16851           0.0           1.0          0.0       -0.6908
16852           0.0           1.0          0.0       -0.6908
16853           0.0           1.0          0.0       -0.6908
16854           0.0           1.0          0.0       -0.7003
16855           0.0           1.0          0.0       -0.7096

[16856 rows x 4 columns]


In [96]:
trump_table[trump_table > 0].count()

sia_positive      931
sia_negative     1207
sia_neutral     14703
sia_compound      931
dtype: int64

In [99]:
trump_table[trump_table == 0].count()

sia_positive    15925
sia_negative    15649
sia_neutral      2153
sia_compound    14718
dtype: int64

In [100]:
trump_table[trump_table < 0].count()

sia_positive       0
sia_negative       0
sia_neutral        0
sia_compound    1207
dtype: int64

In [None]:
# Trump has 7.16% of negative sentiments, 5.5% of positive sentiments, and 87.3% of neutral sentiments. 