In [100]:
import pandas as pd
import numpy as np
from itertools import combinations
from collections import Counter

In [101]:
df_articles = pd.read_csv('../data/raw/candidate_articles.csv')

# Which candidates share articles?

In [102]:
def process_query(df, acc, candidate_names=None):
    df.groupby('url').apply(lambda grp: shared_query(grp, acc))

def shared_query(grp, acc):
    if len(grp['query']):
        cand_names = grp['query'].sort_values().values
        for combo in combinations(cand_names, 2):
            acc[combo] += 1
            
def process_description(df, acc, candidate_names):
    df.description.map(lambda d: shared_description(d, acc, candidate_names))
            
def shared_description(desc, acc, candidate_names):
    if len(desc):
        for name in candidate_names:
            for pair in [x for x in candidate_names if x != name]:
                desc = desc.lower()
                if name in desc and pair in desc:
                    acc[(name, pair)] += 1
        

def find_shared_articles(article_df, shared_property):
    shared_articles = Counter()
    candidates = ['biden', 'bloomberg', 'buttigieg', 'klobuchar', 'sanders', 'warren']
    process_fns = {'query': process_query, 'desc': process_description}
    
    process_fns[shared_property](article_df, shared_articles, candidates)
    
    results_df = pd.DataFrame(np.zeros((6,6)), columns=candidates, index=candidates)
    
    for (k, j), v in shared_articles.items():
        results_df.loc[k.lower(), j.lower()] = v
        results_df.loc[j.lower(), k.lower()] = v
    
    return results_df

In [103]:
shared_articles_summary_by_desc = find_shared_articles(df_articles, 'desc')
shared_articles_summary_by_desc

Unnamed: 0,biden,bloomberg,buttigieg,klobuchar,sanders,warren
biden,0.0,30.0,52.0,50.0,104.0,31.0
bloomberg,30.0,0.0,0.0,16.0,66.0,49.0
buttigieg,52.0,0.0,0.0,58.0,149.0,25.0
klobuchar,50.0,16.0,58.0,0.0,41.0,15.0
sanders,104.0,66.0,149.0,41.0,0.0,52.0
warren,31.0,49.0,25.0,15.0,52.0,0.0


In [104]:
shared_articles_summary_by_query = find_shared_articles(df_articles, 'query')
shared_articles_summary_by_query

Unnamed: 0,biden,bloomberg,buttigieg,klobuchar,sanders,warren
biden,239.0,295.0,490.0,495.0,509.0,367.0
bloomberg,295.0,172.0,249.0,323.0,312.0,277.0
buttigieg,490.0,249.0,268.0,601.0,499.0,493.0
klobuchar,495.0,323.0,601.0,272.0,487.0,490.0
sanders,509.0,312.0,499.0,487.0,246.0,383.0
warren,367.0,277.0,493.0,490.0,383.0,237.0


# Who's mentioned the most in the text?

In [105]:
candidates = ['biden', 'bloomberg', 'buttigieg', 'klobuchar', 'sanders', 'warren']

In [127]:
from nltk import word_tokenize
from collections import Counter
from functools import reduce

def count_mentions(text):
    if type(text) != str and np.isnan(text):
        return Counter()
    tokens = word_tokenize(text.lower())
    return Counter([w for w in tokens if w in candidates])

def sum_counts(counter1, counter2):
    for k, v in counter2.items():
        counter1[k] += v
    return counter1

def aggregate_counts(counts):
    return reduce(sum_counts, counts, Counter())

def get_total_mentions(article_texts):
    counts = [count_mentions(x) for x in article_texts]
    return aggregate_counts(counts)

In [153]:
df_articles['publishedAt'] = df_articles.publishedAt.map(pd.to_datetime)
df_articles['published_date'] = df_articles.publishedAt.map(lambda d: d.date())
grp = df_articles.drop_duplicates(subset=['url']).groupby('published_date').text.apply(get_total_mentions).reset_index()
mentions_by_day = grp.pivot(index='published_date', columns='level_1', values=['text'])
mentions_by_day

Unnamed: 0_level_0,text,text,text,text,text,text
level_1,biden,bloomberg,buttigieg,klobuchar,sanders,warren
published_date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
2020-02-03,38.0,6.0,14.0,17.0,18.0,15.0
2020-02-04,38.0,17.0,42.0,9.0,28.0,16.0
2020-02-05,35.0,3.0,9.0,1.0,15.0,2.0
2020-02-06,27.0,4.0,14.0,7.0,17.0,25.0
2020-02-07,14.0,6.0,42.0,22.0,45.0,5.0
2020-02-08,76.0,6.0,71.0,32.0,72.0,25.0
2020-02-09,84.0,2.0,57.0,17.0,49.0,27.0
2020-02-10,31.0,2.0,15.0,10.0,27.0,78.0
2020-02-11,92.0,32.0,55.0,60.0,92.0,42.0
2020-02-12,38.0,24.0,55.0,91.0,90.0,44.0


# By Candidate Query - Who is mentioned?

In [155]:
grp = df_articles.drop_duplicates(subset=['url', 'query']).groupby('query').text.apply(get_total_mentions).reset_index()
grp.pivot(index='query', columns='level_1', values='text')


level_1,biden,bloomberg,buttigieg,klobuchar,sanders,warren
query,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Biden,787,175,199,111,485,107
Bloomberg,230,778,82,75,366,164
Buttigieg,361,127,500,162,483,204
Klobuchar,334,183,322,447,460,260
Sanders,297,177,230,129,715,124
Warren,279,400,240,176,521,666


# By Candidate Query - Sentiment

## Article Description

In [157]:
df_articles.drop_duplicates(subset=['url', 'query']).groupby('query').agg({'desc_sentiment': 'mean'}).reset_index()

Unnamed: 0,query,desc_sentiment
0,Biden,0.125579
1,Bloomberg,0.061001
2,Buttigieg,0.148596
3,Klobuchar,0.111751
4,Sanders,0.208267
5,Warren,0.046759


In [173]:
df_articles.drop_duplicates(subset=['url', 'query'])\
    .groupby('query').desc_sentiment.apply(lambda s: len(s[s < 0])).reset_index()\
    .rename(columns={'desc_sentiment': 'perc_neg_desc'})

Unnamed: 0,query,perc_neg_desc
0,Biden,27
1,Bloomberg,32
2,Buttigieg,21
3,Klobuchar,21
4,Sanders,20
5,Warren,32


## Full Text

In [168]:
df_articles.drop_duplicates(subset=['url', 'query']).groupby('query').agg({'full_text_sentiment': 'mean'}).reset_index()

Unnamed: 0,query,full_text_sentiment
0,Biden,0.492979
1,Bloomberg,0.35616
2,Buttigieg,0.577375
3,Klobuchar,0.502355
4,Sanders,0.553658
5,Warren,0.495343


In [172]:
df_articles.drop_duplicates(subset=['url', 'query'])\
    .groupby('query').full_text_sentiment.apply(lambda s: len(s[s < 0])).reset_index()\
    .rename(columns={'full_text_sentiment': 'perc_neg_full_text'})

Unnamed: 0,query,perc_neg_full_text
0,Biden,17
1,Bloomberg,24
2,Buttigieg,8
3,Klobuchar,11
4,Sanders,15
5,Warren,15


# Tonality

## Description

In [175]:
tones = ['analytical', 'anger', 'confident', 'fear', 'joy', 'sadness', 'tentative']

for t in tones:
    print(df_articles.drop_duplicates(subset=['url', 'query'])\
        .groupby('query').agg({f'desc_{t}': 'mean'}).reset_index())

       query  desc_analytical
0      Biden         0.690624
1  Bloomberg         0.758291
2  Buttigieg         0.719061
3  Klobuchar         0.713031
4    Sanders         0.731496
5     Warren         0.699561
       query  desc_anger
0      Biden         NaN
1  Bloomberg    0.522450
2  Buttigieg    0.508238
3  Klobuchar         NaN
4    Sanders         NaN
5     Warren    0.508238
       query  desc_confident
0      Biden        0.630238
1  Bloomberg        0.626416
2  Buttigieg        0.726172
3  Klobuchar        0.717560
4    Sanders        0.543435
5     Warren        0.768576
       query  desc_fear
0      Biden        NaN
1  Bloomberg        NaN
2  Buttigieg   0.543700
3  Klobuchar   0.561740
4    Sanders   0.546448
5     Warren   0.552624
       query  desc_joy
0      Biden  0.602183
1  Bloomberg  0.597751
2  Buttigieg  0.616279
3  Klobuchar  0.610986
4    Sanders  0.654158
5     Warren  0.621573
       query  desc_sadness
0      Biden      0.589748
1  Bloomberg      0.523847
2 

In [178]:
tones = ['analytical', 'anger', 'confident', 'fear', 'joy', 'sadness', 'tentative']

for t in tones:
    print(df_articles.drop_duplicates(subset=['url', 'query'])\
        .groupby('query')['desc_' + t].apply(lambda s: len(s[s > 0])).reset_index()\
        .rename(columns={f'desc_{t}': f'perc_{t}'}))

       query  perc_analytical
0      Biden               24
1  Bloomberg               23
2  Buttigieg               30
3  Klobuchar               30
4    Sanders               30
5     Warren               22
       query  perc_anger
0      Biden           0
1  Bloomberg           1
2  Buttigieg           1
3  Klobuchar           0
4    Sanders           0
5     Warren           1
       query  perc_confident
0      Biden               3
1  Bloomberg               4
2  Buttigieg               3
3  Klobuchar               5
4    Sanders               2
5     Warren               5
       query  perc_fear
0      Biden          0
1  Bloomberg          0
2  Buttigieg          2
3  Klobuchar          1
4    Sanders          1
5     Warren          2
       query  perc_joy
0      Biden        17
1  Bloomberg        14
2  Buttigieg        19
3  Klobuchar        15
4    Sanders         9
5     Warren        11
       query  perc_sadness
0      Biden             4
1  Bloomberg             2
2 

## Full Text

In [179]:
tones = ['analytical', 'anger', 'confident', 'fear', 'joy', 'sadness', 'tentative']

for t in tones:
    print(df_articles.drop_duplicates(subset=['url', 'query'])\
        .groupby('query').agg({f'full_text_{t}': 'mean'}).reset_index())

       query  full_text_analytical
0      Biden              0.644415
1  Bloomberg              0.672291
2  Buttigieg              0.675362
3  Klobuchar              0.649617
4    Sanders              0.691609
5     Warren              0.642039
       query  full_text_anger
0      Biden         0.553408
1  Bloomberg         0.601881
2  Buttigieg         0.581138
3  Klobuchar         0.600582
4    Sanders         0.582045
5     Warren         0.548879
       query  full_text_confident
0      Biden             0.842017
1  Bloomberg             0.964027
2  Buttigieg             0.742772
3  Klobuchar                  NaN
4    Sanders             0.780650
5     Warren             0.532529
       query  full_text_fear
0      Biden        0.564674
1  Bloomberg        0.620495
2  Buttigieg        0.523748
3  Klobuchar        0.513644
4    Sanders        0.538991
5     Warren        0.579506
       query  full_text_joy
0      Biden       0.563696
1  Bloomberg       0.549207
2  Buttigieg       0

In [182]:
tones = ['analytical', 'anger', 'confident', 'fear', 'joy', 'sadness', 'tentative']

for t in tones:
    print(df_articles.drop_duplicates(subset=['url', 'query'])\
        .groupby('query')['full_text_' + t].apply(lambda s: len(s[s > 0])).reset_index()\
        .rename(columns={f'full_text_{t}': f'perc_{t}'}))
    print('---')

       query  perc_analytical
0      Biden               46
1  Bloomberg               44
2  Buttigieg               40
3  Klobuchar               39
4    Sanders               51
5     Warren               41
---
       query  perc_anger
0      Biden           3
1  Bloomberg           7
2  Buttigieg           3
3  Klobuchar           4
4    Sanders           1
5     Warren           6
---
       query  perc_confident
0      Biden               1
1  Bloomberg               1
2  Buttigieg               2
3  Klobuchar               0
4    Sanders               4
5     Warren               1
---
       query  perc_fear
0      Biden          4
1  Bloomberg          6
2  Buttigieg          1
3  Klobuchar          1
4    Sanders          4
5     Warren          4
---
       query  perc_joy
0      Biden        53
1  Bloomberg        47
2  Buttigieg        56
3  Klobuchar        53
4    Sanders        44
5     Warren        55
---
       query  perc_sadness
0      Biden            35
1  Bloomb

# By Sentences

In [183]:
# have the first 100 sentences of each article (ish)
# dedupe based on url
# if a candidate is mentioned in the sentence => add one to each tone for that candidate, also add count of sentences

In [192]:
df_sentences = pd.read_csv('../data/raw/articles_sentences.csv')

## Overall Sentence Counts

In [225]:
candidate_sent_counts = get_total_mentions(df_sentences.drop_duplicates(subset=['url', 'sentence_id']).text.values)
candidate_sent_counts

Counter({'klobuchar': 550,
         'buttigieg': 741,
         'sanders': 1399,
         'biden': 1127,
         'bloomberg': 1012,
         'warren': 807})

## Average Tone Score

In [226]:
df_tone_means = pd.DataFrame()
for cand_name in ['biden', 'sanders', 'warren', 'buttigieg', 'klobuchar', 'bloomberg']:
    res = df_sentences.drop_duplicates(subset=['url', 'sentence_id'])\
        [df_sentences.text.str.lower().str.contains(cand_name, na=False)]\
        .drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'sentence_id'])\
        .agg('mean')
    row = pd.DataFrame(res).transpose()
    row['candidate'] = cand_name
    df_tone_means = df_tone_means.append(row)
df_tone_means

  after removing the cwd from sys.path.


Unnamed: 0,analytical_score,anger_score,confident_score,fear_score,joy_score,sadness_score,tentative_score,candidate
0,0.69499,0.585095,0.695641,0.635482,0.634871,0.610058,0.726799,biden
0,0.704767,0.583485,0.687286,0.609721,0.619563,0.595516,0.703899,sanders
0,0.688087,0.605417,0.654411,0.627549,0.611048,0.598888,0.725236,warren
0,0.710062,0.619972,0.68982,0.658083,0.610898,0.58338,0.684643,buttigieg
0,0.693422,0.556284,0.692522,0.615017,0.626487,0.567478,0.731297,klobuchar
0,0.693179,0.599626,0.711902,0.625188,0.623037,0.606333,0.720134,bloomberg


## Count of Sentences

In [232]:
df_tone_counts = pd.DataFrame()
for cand_name in ['biden', 'sanders', 'warren', 'buttigieg', 'klobuchar', 'bloomberg']:
    res = df_sentences.drop_duplicates(subset=['url', 'sentence_id'])\
        [df_sentences.text.str.lower().str.contains(cand_name, na=False)]\
        .drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'sentence_id'])\
        .agg('count')
    row = pd.DataFrame(res).transpose()
    row['candidate'] = cand_name
    df_tone_counts = df_tone_counts.append(row)
df_tone_counts = df_tone_counts.drop(columns=['url', 'query']).rename(columns={'text': 'total_sentences'})
df_tone_counts

  after removing the cwd from sys.path.


Unnamed: 0,total_sentences,analytical_score,anger_score,confident_score,fear_score,joy_score,sadness_score,tentative_score,candidate
0,1089,292,21,61,17,203,117,135,biden
0,1335,428,8,72,20,170,116,153,sanders
0,781,197,7,30,10,117,78,89,warren
0,712,203,10,31,5,108,75,68,buttigieg
0,543,143,4,33,6,98,52,58,klobuchar
0,960,262,24,39,16,105,75,101,bloomberg


## Percentage of Sentences

In [233]:
df_tone_perc = df_tone_counts.copy()
for col in df_tone_perc.columns[1:-1]:
    df_tone_perc[col] = df_tone_perc[col] / df_tone_perc['total_sentences']
df_tone_perc

Unnamed: 0,total_sentences,analytical_score,anger_score,confident_score,fear_score,joy_score,sadness_score,tentative_score,candidate
0,1089,0.268136,0.019284,0.056015,0.015611,0.18641,0.107438,0.123967,biden
0,1335,0.320599,0.005993,0.053933,0.014981,0.127341,0.086891,0.114607,sanders
0,781,0.252241,0.008963,0.038412,0.012804,0.149808,0.099872,0.113956,warren
0,712,0.285112,0.014045,0.043539,0.007022,0.151685,0.105337,0.095506,buttigieg
0,543,0.263352,0.007366,0.060773,0.01105,0.180479,0.095764,0.106814,klobuchar
0,960,0.272917,0.025,0.040625,0.016667,0.109375,0.078125,0.105208,bloomberg


# Week Over Week

In [243]:
merged = df_sentences.merge(df_articles.drop_duplicates(subset=['url'])[['url', 'publishedAt']], on='url', how='left')
merged['publishedAt'] = merged['publishedAt'].map(pd.to_datetime)

In [254]:
df_tone_counts_wk = pd.DataFrame()
for cand_name in ['biden', 'sanders', 'warren', 'buttigieg', 'klobuchar', 'bloomberg']:
    res = merged.drop_duplicates(subset=['url', 'sentence_id'])\
        [merged.text.str.lower().str.contains(cand_name, na=False)]\
        .drop(columns=['Unnamed: 0', 'Unnamed: 0.1', 'sentence_id'])\
        .groupby(merged.publishedAt.dt.week)\
        .agg('count')
    row = pd.DataFrame(res)
    row['candidate'] = cand_name
    df_tone_counts_wk = df_tone_counts_wk.append(row)
df_tone_counts_wk = df_tone_counts_wk.drop(columns=['url', 'query']).rename(columns={'text': 'total_sentences'})
df_tone_counts_wk

  after removing the cwd from sys.path.


Unnamed: 0_level_0,total_sentences,analytical_score,anger_score,confident_score,fear_score,joy_score,sadness_score,tentative_score,publishedAt,candidate
publishedAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6,281,79,8,12,6,43,36,38,281,biden
7,216,55,6,9,3,45,29,25,216,biden
8,168,46,2,7,4,22,10,21,168,biden
9,344,93,5,27,4,74,36,46,344,biden
10,80,19,0,6,0,19,6,5,80,biden
6,221,59,1,9,0,25,17,24,221,sanders
7,217,60,0,12,6,42,22,26,217,sanders
8,392,139,2,23,5,49,18,50,392,sanders
9,445,155,5,27,6,43,55,51,445,sanders
10,60,15,0,1,3,11,4,2,60,sanders


In [255]:
df_tone_perc_wk = df_tone_counts_wk.copy()
for col in df_tone_perc_wk.columns[1:-1]:
    df_tone_perc_wk[col] = df_tone_perc_wk[col] / df_tone_perc_wk['total_sentences']
df_tone_perc_wk

Unnamed: 0_level_0,total_sentences,analytical_score,anger_score,confident_score,fear_score,joy_score,sadness_score,tentative_score,publishedAt,candidate
publishedAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
6,281,0.281139,0.02847,0.042705,0.021352,0.153025,0.128114,0.135231,1.0,biden
7,216,0.25463,0.027778,0.041667,0.013889,0.208333,0.134259,0.115741,1.0,biden
8,168,0.27381,0.011905,0.041667,0.02381,0.130952,0.059524,0.125,1.0,biden
9,344,0.270349,0.014535,0.078488,0.011628,0.215116,0.104651,0.133721,1.0,biden
10,80,0.2375,0.0,0.075,0.0,0.2375,0.075,0.0625,1.0,biden
6,221,0.266968,0.004525,0.040724,0.0,0.113122,0.076923,0.108597,1.0,sanders
7,217,0.276498,0.0,0.0553,0.02765,0.193548,0.101382,0.119816,1.0,sanders
8,392,0.354592,0.005102,0.058673,0.012755,0.125,0.045918,0.127551,1.0,sanders
9,445,0.348315,0.011236,0.060674,0.013483,0.096629,0.123596,0.114607,1.0,sanders
10,60,0.25,0.0,0.016667,0.05,0.183333,0.066667,0.033333,1.0,sanders
