In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import json

topics = ['trump_loss','biden_president','election_fraud','covid','anti_trump','trump_last_month']

cons = pd.read_csv("data/cons_25-27.csv")
poli = pd.read_csv("data/poli_25-27.csv")

#all posts
df = pd.concat([cons,poli],ignore_index=True)

In [2]:
# one df for each topic x cons/poli
cons_dict = dict()
for topic in topics:
    cons_dict[topic] = cons.loc[cons[topic] == 1]['data.title']
 
poli_dict = dict()
for topic in topics:
    poli_dict[topic] = poli.loc[poli[topic] == 1]['data.title']


In [3]:
all_post_dict = dict()
all_post_dict['cons'] = cons_dict
all_post_dict['poli'] = poli_dict


In [4]:
from collections import MutableMapping 
  
# Convert ini_dict to flattened dictionary 
# default seperater '_' 
def convert_flatten(d, parent_key ='', sep ='_'): 
    items = [] 
    for k, v in d.items(): 
        new_key = parent_key + sep + k if parent_key else k 
        if isinstance(v, MutableMapping): 
            items.extend(convert_flatten(v, new_key, sep = sep).items()) 
        else: 
            items.append((new_key, v)) 
    return dict(items) 
all_posts_dict = convert_flatten(all_post_dict)

  from collections import MutableMapping


In [5]:
posts = list(df['data.title'])
all_posts = '. '.join(posts)
all_posts = re.sub('[^A-Za-z]+', ' ', all_posts)
all_posts = all_posts.lower()


In [6]:
def word_count(posts, topic):
    '''
    Count word frequency for each word in all posts or the posts of the given topic
    
    Parameters
    ----------
    posts : str
        a long String containing all prosessed sentences
    topic : str
        name of the topic, or "total"
        
    Returns
    -------
    wordCounts : pandas.DataFrame
        contains the word frequencies
    '''
    counts = dict()
    words = posts.split()

    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
            
    wordCounts =  pd.DataFrame(counts.items(), columns=['word', topic])
    #sort = sort.sort_values(by=['topic1'], ascending = False)
    wordCounts = wordCounts[wordCounts[topic] >=5]
    wordCounts = wordCounts.reset_index(drop=True)

    return wordCounts


In [7]:
# idf
def idf(wordCountsAll): 
    '''
    Adds a column of idf to the dataframe
    '''
    wordCountsAll.loc['total']= wordCountsAll.sum(numeric_only=True)
    wordCountsAll['total'] = wordCountsAll.sum(numeric_only=True,axis=1)
    grandSum = wordCountsAll['total']['total']
    wordCountsAll['idf']=np.log(grandSum/wordCountsAll['total'])


In [8]:
word_counts = word_count(all_posts, 'total')
idf(word_counts)
word_counts

Unnamed: 0,word,total,idf
0,after,29.0,4.863548
1,trump,335.0,2.416713
2,supporters,12.0,5.745937
3,on,53.0,4.260552
4,his,39.0,4.567282
...,...,...,...
198,mocked,5.0,6.621406
199,ever,5.0,6.621406
200,talk,7.0,6.284933
201,wants,5.0,6.621406


In [9]:
# word counts for each df of party_topic
for party_topic in list(all_posts_dict.keys()):
    posts = list(all_posts_dict[party_topic])
    temp = '.'.join(posts)
    temp = re.sub('[^A-Za-z]+', ' ', temp)
    temp = temp.lower()
    all_posts_dict[party_topic] = str(temp)
    temp_counts = word_count(temp, party_topic)
    word_counts = pd.merge(left=word_counts, right=temp_counts,how='left', on='word')


In [10]:
word_counts

Unnamed: 0,word,total,idf,cons_trump_loss,cons_biden_president,cons_election_fraud,cons_covid,cons_anti_trump,cons_trump_last_month,poli_trump_loss,poli_biden_president,poli_election_fraud,poli_covid,poli_anti_trump,poli_trump_last_month
0,after,29.0,4.863548,,,,,,,6.0,8.0,,,13.0,6.0
1,trump,335.0,2.416713,27.0,7.0,24.0,5.0,14.0,31.0,95.0,39.0,53.0,11.0,103.0,105.0
2,supporters,12.0,5.745937,,,,,,,,,,,,
3,on,53.0,4.260552,,,5.0,,,,10.0,14.0,5.0,,9.0,11.0
4,his,39.0,4.567282,,,,,,,6.0,8.0,5.0,,12.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,mocked,5.0,6.621406,,,,,,,,,,,5.0,
199,ever,5.0,6.621406,,,,,,,,,,,,
200,talk,7.0,6.284933,,,,,,,,,,,,
201,wants,5.0,6.621406,,,,,,,,,,,,


In [11]:
word_counts['cons'] = word_counts['cons_trump_loss']+word_counts['cons_biden_president']+word_counts['cons_election_fraud']+word_counts['cons_covid']+word_counts['cons_anti_trump']+word_counts['cons_trump_last_month']
word_counts['poli'] = word_counts['poli_trump_loss']+word_counts['poli_biden_president']+word_counts['poli_election_fraud']+word_counts['poli_covid']+word_counts['poli_anti_trump']+word_counts['poli_trump_last_month']

word_counts['trump_loss']=word_counts['cons_trump_loss']+word_counts['poli_trump_loss']
word_counts['biden_president']=word_counts['cons_biden_president']+word_counts['poli_biden_president']
word_counts['election_fraud']=word_counts['cons_election_fraud']+word_counts['poli_election_fraud']
word_counts['covid']=word_counts['cons_covid']+word_counts['poli_covid']
word_counts['anti_trump']=word_counts['cons_anti_trump']+word_counts['poli_anti_trump']
word_counts['trump_last_month']=word_counts['cons_trump_last_month']+word_counts['poli_trump_last_month']
word_counts

Unnamed: 0,word,total,idf,cons_trump_loss,cons_biden_president,cons_election_fraud,cons_covid,cons_anti_trump,cons_trump_last_month,poli_trump_loss,...,poli_anti_trump,poli_trump_last_month,cons,poli,trump_loss,biden_president,election_fraud,covid,anti_trump,trump_last_month
0,after,29.0,4.863548,,,,,,,6.0,...,13.0,6.0,,,,,,,,
1,trump,335.0,2.416713,27.0,7.0,24.0,5.0,14.0,31.0,95.0,...,103.0,105.0,108.0,406.0,122.0,46.0,77.0,16.0,117.0,136.0
2,supporters,12.0,5.745937,,,,,,,,...,,,,,,,,,,
3,on,53.0,4.260552,,,5.0,,,,10.0,...,9.0,11.0,,,,,10.0,,,
4,his,39.0,4.567282,,,,,,,6.0,...,12.0,8.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,mocked,5.0,6.621406,,,,,,,,...,5.0,,,,,,,,,
199,ever,5.0,6.621406,,,,,,,,...,,,,,,,,,,
200,talk,7.0,6.284933,,,,,,,,...,,,,,,,,,,
201,wants,5.0,6.621406,,,,,,,,...,,,,,,,,,,


In [12]:
def tfidf(wordCountsAll):
    '''
    Calculates the TD-IDF for words in each topics
    
    Parameter
    ---------
    wordCountsAll : pandas.DataFrame
        df of word, topics, total, idf; entries: word frequency
    
    Returns
    -------
    tfidf : pandas.DataFrame
        df of word, topics; entries : tfidf
    '''
    tfidf = wordCountsAll.copy()
    for column in tfidf.columns:
        if column not in ['word','total','idf']:
            tfidf[column]=tfidf[column]*tfidf['idf']
            
    tfidf = tfidf.drop(['total','idf'], axis=1)

    return tfidf

tfidf_values = tfidf(word_counts)
tfidf_values

Unnamed: 0,word,cons_trump_loss,cons_biden_president,cons_election_fraud,cons_covid,cons_anti_trump,cons_trump_last_month,poli_trump_loss,poli_biden_president,poli_election_fraud,...,poli_anti_trump,poli_trump_last_month,cons,poli,trump_loss,biden_president,election_fraud,covid,anti_trump,trump_last_month
0,after,,,,,,,29.181286,38.908382,,...,63.226121,29.181286,,,,,,,,
1,trump,65.251252,16.916991,58.001113,12.083565,33.833982,74.918104,229.587738,94.251808,128.085791,...,248.921442,253.754868,261.005007,981.185491,294.83899,111.168799,186.086903,38.667409,282.755425,328.672972
2,supporters,,,,,,,,,,...,,,,,,,,,,
3,on,,,21.302758,,,,42.605517,59.647723,21.302758,...,38.344965,46.866068,,,,,42.605517,,,
4,his,,,,,,,27.403692,36.538255,22.836410,...,54.807383,36.538255,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,mocked,,,,,,,,,,...,33.107028,,,,,,,,,
199,ever,,,,,,,,,,...,,,,,,,,,,
200,talk,,,,,,,,,,...,,,,,,,,,,
201,wants,,,,,,,,,,...,,,,,,,,,,


In [13]:
# remove non-words and stop words
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
stop_words.extend(['s','ll','re','t','d','m'])
tfidf_values = tfidf_values[~tfidf_values['word'].isin(stop_words)]
tfidf_values

Unnamed: 0,word,cons_trump_loss,cons_biden_president,cons_election_fraud,cons_covid,cons_anti_trump,cons_trump_last_month,poli_trump_loss,poli_biden_president,poli_election_fraud,...,poli_anti_trump,poli_trump_last_month,cons,poli,trump_loss,biden_president,election_fraud,covid,anti_trump,trump_last_month
1,trump,65.251252,16.916991,58.001113,12.083565,33.833982,74.918104,229.587738,94.251808,128.085791,...,248.921442,253.754868,261.005007,981.185491,294.838990,111.168799,186.086903,38.667409,282.755425,328.672972
2,supporters,,,,,,,,,,...,,,,,,,,,,
11,president,21.594103,,,,,21.594103,56.144667,51.825847,,...,56.144667,43.188206,,,77.738770,,,,,64.782308
12,donald,24.493195,,,,,,44.087751,,,...,44.087751,48.986391,,,68.580947,,,,,
13,voters,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
198,mocked,,,,,,,,,,...,33.107028,,,,,,,,,
199,ever,,,,,,,,,,...,,,,,,,,,,
200,talk,,,,,,,,,,...,,,,,,,,,,
201,wants,,,,,,,,,,...,,,,,,,,,,


In [14]:
# for each topic, extract 10 words with highest tfidf
def word_list(tfidf_table):
    '''
    For each topic, extract 10 words with highest tfidf
    
    Parameter
    ---------
    tfidf_table : pandas.DataFrame
        df of word, topics; entries : tfidf
    
    Returns
    -------
    word_list : dict
        keys are the topics and values are lists containing the top 10 words
    '''
    sort = tfidf_table.copy()
    word_list = dict()
    for topic in list(tfidf_table.columns):
        if topic != 'word':
        #words = list()
            sort = sort.sort_values(by=[topic], ascending = False)
            words = list(sort['word'][:10])
            
            word_list[topic] = words

    return word_list
top_words = word_list(tfidf_values)
j = json.dumps(top_words)
j

'{"cons_trump_loss": ["trump", "biden", "says", "donald", "president", "election", "supporters", "voters", "joe", "first"], "cons_biden_president": ["biden", "trump", "says", "donald", "president", "election", "supporters", "voters", "joe", "first"], "cons_election_fraud": ["trump", "nevada", "court", "ballot", "could", "lets", "present", "huge", "overturn", "result"], "cons_covid": ["coronavirus", "trump", "nevada", "court", "ballot", "could", "lets", "present", "huge", "overturn"], "cons_anti_trump": ["trump", "coronavirus", "nevada", "court", "ballot", "could", "lets", "present", "huge", "overturn"], "cons_trump_last_month": ["trump", "flynn", "pardons", "michael", "security", "former", "national", "president", "coronavirus", "nevada"], "poli_trump_loss": ["trump", "says", "election", "biden", "leave", "college", "house", "electoral", "white", "president"], "poli_biden_president": ["biden", "trump", "president", "joe", "says", "administration", "thanksgiving", "milwaukee", "transiti

In [15]:
results = pd.DataFrame.from_dict(top_words)

In [16]:
results_cons = results[results.columns[0:6]] 
print(results_cons.to_latex(index=False))  

\begin{tabular}{llllll}
\toprule
cons\_trump\_loss & cons\_biden\_president & cons\_election\_fraud &   cons\_covid & cons\_anti\_trump & cons\_trump\_last\_month \\
\midrule
          trump &                biden &               trump &  coronavirus &           trump &                 trump \\
          biden &                trump &              nevada &        trump &     coronavirus &                 flynn \\
           says &                 says &               court &       nevada &          nevada &               pardons \\
         donald &               donald &              ballot &        court &           court &               michael \\
      president &            president &               could &       ballot &          ballot &              security \\
       election &             election &                lets &        could &           could &                former \\
     supporters &           supporters &             present &         lets &            lets &    

In [17]:
results_poli = results[results.columns[6:12]] 
print(results_poli.to_latex(index=False))  

\begin{tabular}{llllll}
\toprule
poli\_trump\_loss & poli\_biden\_president & poli\_election\_fraud &    poli\_covid & poli\_anti\_trump & poli\_trump\_last\_month \\
\midrule
          trump &                biden &               trump &  thanksgiving &           trump &                 trump \\
           says &                trump &            election &      pandemic &        election &                  says \\
       election &            president &               fraud &         trump &            says &               georgia \\
          biden &                  joe &               voter &         biden &           fraud &              election \\
          leave &                 says &               group &      election &       president &               college \\
        college &       administration &                 pro &         fraud &         georgia &                 leave \\
          house &         thanksgiving &               donor &         voter &           bid

In [18]:
results_state = results[results.columns[12:14]] 
print(results_state.to_latex(index=False))  

\begin{tabular}{ll}
\toprule
      cons &       poli \\
\midrule
     trump &      trump \\
      says &      biden \\
   georgia &       says \\
  election &    georgia \\
   college &   election \\
     leave &    college \\
 electoral &      leave \\
     house &  electoral \\
     white &      house \\
     biden &      white \\
\bottomrule
\end{tabular}



In [19]:
results_topic = results[results.columns[14:]] 
print(results_topic.to_latex(index=False))  

\begin{tabular}{llllll}
\toprule
trump\_loss & biden\_president & election\_fraud &      covid & anti\_trump & trump\_last\_month \\
\midrule
     trump &           biden &          trump &      trump &      trump &            trump \\
      says &           trump &       election &   election &   election &        president \\
     biden &            says &          court &      court &      court &         election \\
  election &        election &          biden &      biden &      biden &            court \\
 president &       president &           says &       says &       says &            biden \\
    donald &          donald &      president &  president &  president &             says \\
   georgia &         georgia &         donald &     donald &     donald &           donald \\
   college &         college &        georgia &    georgia &    georgia &          georgia \\
     leave &           leave &        college &    college &    college &          college \\
 electoral &