Import packages

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import re
import json

Load data and pre-process the format

In [2]:
topics = ['trump_loss','biden_president','election_fraud','covid','anti_trump','trump_last_month']

cons = pd.read_csv("data/cons_25-27.csv")
poli = pd.read_csv("data/poli_25-27.csv")

#all posts
df = pd.concat([cons,poli],ignore_index=True)

In [3]:
# one df for each topic x cons/poli
cons_dict = dict()
for topic in topics:
    cons_dict[topic] = cons.loc[cons[topic] == 1]['data.title']
 
poli_dict = dict()
for topic in topics:
    poli_dict[topic] = poli.loc[poli[topic] == 1]['data.title']


In [4]:
all_post_dict = dict()
all_post_dict['cons'] = cons_dict
all_post_dict['poli'] = poli_dict


In [5]:
from collections import MutableMapping 
  
# Convert ini_dict to flattened dictionary 
# default seperater '_' 
def convert_flatten(d, parent_key ='', sep ='_'): 
    items = [] 
    for k, v in d.items(): 
        new_key = parent_key + sep + k if parent_key else k 
        if isinstance(v, MutableMapping): 
            items.extend(convert_flatten(v, new_key, sep = sep).items()) 
        else: 
            items.append((new_key, v)) 
    return dict(items) 
all_posts_dict = convert_flatten(all_post_dict)

  from collections import MutableMapping


In [6]:
posts = list(df['data.title'])
all_posts = '. '.join(posts)
all_posts = re.sub('[^A-Za-z]+', ' ', all_posts)
all_posts = all_posts.lower()


Calculate the TD-IDF

In [7]:
def word_count(posts, topic):
    '''
    Count word frequency for each word in all posts or the posts of the given topic
    
    Parameters
    ----------
    posts : str
        a long String containing all prosessed sentences
    topic : str
        name of the topic, or "total"
        
    Returns
    -------
    wordCounts : pandas.DataFrame
        contains the word frequencies
    '''
    counts = dict()
    words = posts.split()
    temp = [w for w in words if w not in ['s','ll','re','t','d','m']]
    words = temp
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
            
    wordCounts =  pd.DataFrame(counts.items(), columns=['word', topic])
    #sort = sort.sort_values(by=['topic1'], ascending = False)
    wordCounts = wordCounts[wordCounts[topic] >=5]
    wordCounts = wordCounts.reset_index(drop=True)

    return wordCounts


In [8]:
# idf
def idf(wordCountsAll): 
    '''
    Adds a column of idf to the dataframe
    '''
    wordCountsAll.loc['total']= wordCountsAll.sum(numeric_only=True)
    wordCountsAll['total'] = wordCountsAll.sum(numeric_only=True,axis=1)
    grandSum = wordCountsAll['total']['total']
    wordCountsAll['idf']=np.log(grandSum/wordCountsAll['total'])


In [9]:
word_counts = word_count(all_posts, 'total')
idf(word_counts)
word_counts

Unnamed: 0,word,total,idf
0,after,29.0,4.812745
1,trump,335.0,2.365910
2,supporters,12.0,5.695134
3,on,53.0,4.209749
4,his,39.0,4.516479
...,...,...,...
192,mocked,5.0,6.570603
193,ever,5.0,6.570603
194,talk,7.0,6.234131
195,wants,5.0,6.570603


In [10]:
# word counts for each df of party_topic
for party_topic in list(all_posts_dict.keys()):
    posts = list(all_posts_dict[party_topic])
    temp = '.'.join(posts)
    temp = re.sub('[^A-Za-z]+', ' ', temp)
    temp = temp.lower()
    all_posts_dict[party_topic] = str(temp)
    temp_counts = word_count(temp, party_topic)
    word_counts = pd.merge(left=word_counts, right=temp_counts,how='left', on='word')


In [11]:
word_counts

Unnamed: 0,word,total,idf,cons_trump_loss,cons_biden_president,cons_election_fraud,cons_covid,cons_anti_trump,cons_trump_last_month,poli_trump_loss,poli_biden_president,poli_election_fraud,poli_covid,poli_anti_trump,poli_trump_last_month
0,after,29.0,4.812745,,,,,,,6.0,8.0,,,13.0,6.0
1,trump,335.0,2.365910,27.0,7.0,24.0,5.0,14.0,31.0,95.0,39.0,53.0,11.0,103.0,105.0
2,supporters,12.0,5.695134,,,,,,,,,,,,
3,on,53.0,4.209749,,,5.0,,,,10.0,14.0,5.0,,9.0,11.0
4,his,39.0,4.516479,,,,,,,6.0,8.0,5.0,,12.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,mocked,5.0,6.570603,,,,,,,,,,,5.0,
193,ever,5.0,6.570603,,,,,,,,,,,,
194,talk,7.0,6.234131,,,,,,,,,,,,
195,wants,5.0,6.570603,,,,,,,,,,,,


In [12]:
word_counts['cons'] = word_counts['cons_trump_loss']+word_counts['cons_biden_president']+word_counts['cons_election_fraud']+word_counts['cons_covid']+word_counts['cons_anti_trump']+word_counts['cons_trump_last_month']
word_counts['poli'] = word_counts['poli_trump_loss']+word_counts['poli_biden_president']+word_counts['poli_election_fraud']+word_counts['poli_covid']+word_counts['poli_anti_trump']+word_counts['poli_trump_last_month']

word_counts['trump_loss']=word_counts['cons_trump_loss']+word_counts['poli_trump_loss']
word_counts['biden_president']=word_counts['cons_biden_president']+word_counts['poli_biden_president']
word_counts['election_fraud']=word_counts['cons_election_fraud']+word_counts['poli_election_fraud']
word_counts['covid']=word_counts['cons_covid']+word_counts['poli_covid']
word_counts['anti_trump']=word_counts['cons_anti_trump']+word_counts['poli_anti_trump']
word_counts['trump_last_month']=word_counts['cons_trump_last_month']+word_counts['poli_trump_last_month']
word_counts

Unnamed: 0,word,total,idf,cons_trump_loss,cons_biden_president,cons_election_fraud,cons_covid,cons_anti_trump,cons_trump_last_month,poli_trump_loss,...,poli_anti_trump,poli_trump_last_month,cons,poli,trump_loss,biden_president,election_fraud,covid,anti_trump,trump_last_month
0,after,29.0,4.812745,,,,,,,6.0,...,13.0,6.0,,,,,,,,
1,trump,335.0,2.365910,27.0,7.0,24.0,5.0,14.0,31.0,95.0,...,103.0,105.0,108.0,406.0,122.0,46.0,77.0,16.0,117.0,136.0
2,supporters,12.0,5.695134,,,,,,,,...,,,,,,,,,,
3,on,53.0,4.209749,,,5.0,,,,10.0,...,9.0,11.0,,,,,10.0,,,
4,his,39.0,4.516479,,,,,,,6.0,...,12.0,8.0,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,mocked,5.0,6.570603,,,,,,,,...,5.0,,,,,,,,,
193,ever,5.0,6.570603,,,,,,,,...,,,,,,,,,,
194,talk,7.0,6.234131,,,,,,,,...,,,,,,,,,,
195,wants,5.0,6.570603,,,,,,,,...,,,,,,,,,,


In [13]:
def tfidf(wordCountsAll):
    '''
    Calculates the TD-IDF for words in each topics
    
    Parameter
    ---------
    wordCountsAll : pandas.DataFrame
        df of word, topics, total, idf; entries: word frequency
    
    Returns
    -------
    tfidf : pandas.DataFrame
        df of word, topics; entries : tfidf
    '''
    tfidf = wordCountsAll.copy()
    for column in tfidf.columns:
        if column not in ['word','total','idf']:
            tfidf[column]=tfidf[column]*tfidf['idf']
            
    tfidf = tfidf.drop(['total','idf'], axis=1)

    return tfidf

tfidf_values = tfidf(word_counts)
tfidf_values

Unnamed: 0,word,cons_trump_loss,cons_biden_president,cons_election_fraud,cons_covid,cons_anti_trump,cons_trump_last_month,poli_trump_loss,poli_biden_president,poli_election_fraud,...,poli_anti_trump,poli_trump_last_month,cons,poli,trump_loss,biden_president,election_fraud,covid,anti_trump,trump_last_month
0,after,,,,,,,28.876469,38.501959,,...,62.565684,28.876469,,,,,,,,
1,trump,63.879575,16.561371,56.781845,11.829551,33.122743,73.343216,224.761468,92.270497,125.393240,...,243.688750,248.420570,255.518301,960.559538,288.641043,108.831869,182.175085,37.854563,276.811492,321.763786
2,supporters,,,,,,,,,,...,,,,,,,,,,
3,on,,,21.048744,,,,42.097488,58.936483,21.048744,...,37.887739,46.307237,,,,,42.097488,,,
4,his,,,,,,,27.098874,36.131833,22.582395,...,54.197749,36.131833,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,mocked,,,,,,,,,,...,32.853014,,,,,,,,,
193,ever,,,,,,,,,,...,,,,,,,,,,
194,talk,,,,,,,,,,...,,,,,,,,,,
195,wants,,,,,,,,,,...,,,,,,,,,,


In [14]:
# remove stop words
import nltk
from nltk.corpus import stopwords
stop_words = stopwords.words("english")
tfidf_values = tfidf_values[~tfidf_values['word'].isin(stop_words)]
tfidf_values

Unnamed: 0,word,cons_trump_loss,cons_biden_president,cons_election_fraud,cons_covid,cons_anti_trump,cons_trump_last_month,poli_trump_loss,poli_biden_president,poli_election_fraud,...,poli_anti_trump,poli_trump_last_month,cons,poli,trump_loss,biden_president,election_fraud,covid,anti_trump,trump_last_month
1,trump,63.879575,16.561371,56.781845,11.829551,33.122743,73.343216,224.761468,92.270497,125.39324,...,243.688750,248.420570,255.518301,960.559538,288.641043,108.831869,182.175085,37.854563,276.811492,321.763786
2,supporters,,,,,,,,,,...,,,,,,,,,,
11,president,21.340089,,,,,21.340089,55.484230,51.216213,,...,55.484230,42.680177,,,76.824319,,,,,64.020266
12,donald,24.239181,,,,,,43.630526,,,...,43.630526,48.478362,,,67.869707,,,,,
13,voters,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
192,mocked,,,,,,,,,,...,32.853014,,,,,,,,,
193,ever,,,,,,,,,,...,,,,,,,,,,
194,talk,,,,,,,,,,...,,,,,,,,,,
195,wants,,,,,,,,,,...,,,,,,,,,,


Get the words with highest TF-IDF for each topics and subreddits

In [15]:
def word_list(tfidf_table):
    '''
    For each topic, extract 10 words with highest tfidf
    
    Parameter
    ---------
    tfidf_table : pandas.DataFrame
        df of word, topics; entries : tfidf
    
    Returns
    -------
    word_list : dict
        keys are the topics and values are lists containing the top 10 words
    '''
    sort = tfidf_table.copy()
    word_list = dict()
    for topic in list(tfidf_table.columns):
        if topic != 'word':
        #words = list()
            sort = sort.sort_values(by=[topic], ascending = False)
            words = list(sort['word'][:10])
            
            word_list[topic] = words

    return word_list
top_words = word_list(tfidf_values)

In [16]:
# writes to file
with open('top_words.json','w') as f:
    json.dump(top_words, f)

In [17]:
# for latex table
results = pd.DataFrame.from_dict(top_words)

In [18]:
# /r/conservative: topics
results_cons = results[results.columns[0:6]] 
print(results_cons.to_latex(index=False))  

\begin{tabular}{llllll}
\toprule
cons\_trump\_loss & cons\_biden\_president & cons\_election\_fraud &   cons\_covid & cons\_anti\_trump & cons\_trump\_last\_month \\
\midrule
          trump &                biden &               trump &  coronavirus &           trump &                 trump \\
          biden &                trump &              nevada &        trump &     coronavirus &                 flynn \\
           says &                 says &               court &       nevada &          nevada &               pardons \\
         donald &               donald &              ballot &        court &           court &               michael \\
      president &            president &               could &       ballot &          ballot &              security \\
       election &             election &                lets &        could &           could &                former \\
     supporters &           supporters &             present &         lets &            lets &    

In [19]:
# /r/politics: topics
results_poli = results[results.columns[6:12]] 
print(results_poli.to_latex(index=False))  

\begin{tabular}{llllll}
\toprule
poli\_trump\_loss & poli\_biden\_president & poli\_election\_fraud &    poli\_covid & poli\_anti\_trump & poli\_trump\_last\_month \\
\midrule
          trump &                biden &               trump &  thanksgiving &           trump &                 trump \\
           says &                trump &            election &      pandemic &        election &                  says \\
       election &            president &               fraud &         trump &            says &               georgia \\
          biden &                  joe &               voter &         biden &           fraud &               college \\
          leave &                 says &               group &      election &       president &              election \\
        college &       administration &                 pro &         fraud &         georgia &                 leave \\
          house &         thanksgiving &               donor &         voter &           bid

In [20]:
# /r/conservative vs /r/politics
results_state = results[results.columns[12:14]] 
print(results_state.to_latex(index=False))  

\begin{tabular}{ll}
\toprule
      cons &       poli \\
\midrule
     trump &      trump \\
      says &      biden \\
   georgia &       says \\
   college &    georgia \\
  election &    college \\
     leave &   election \\
 electoral &      leave \\
     house &  electoral \\
     white &      house \\
     biden &      white \\
\bottomrule
\end{tabular}



In [21]:
# two subreddits: topics
results_topic = results[results.columns[14:]] 
print(results_topic.to_latex(index=False))  

\begin{tabular}{llllll}
\toprule
trump\_loss & biden\_president & election\_fraud &      covid & anti\_trump & trump\_last\_month \\
\midrule
     trump &           biden &          trump &      trump &      trump &            trump \\
      says &           trump &       election &   election &   election &        president \\
     biden &            says &          court &      court &      court &         election \\
  election &        election &          biden &      biden &      biden &            court \\
 president &       president &           says &       says &       says &            biden \\
    donald &          donald &      president &  president &  president &             says \\
   georgia &         georgia &         donald &     donald &     donald &           donald \\
   college &         college &        georgia &    georgia &    georgia &          georgia \\
     leave &           leave &        college &    college &    college &          college \\
 electoral &