In [25]:
import bokeh.palettes

In [94]:
import pandas as pd
import numpy as np
from tqdm import trange

import sys
import glob
import os

from bokeh.core.properties import value
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool, DataRange1d, Legend
from bokeh.palettes import brewer
output_notebook()

In [55]:
def headlines(search_directories = [''], filename_filter = "*"):

    # Get a list of matching files
    files = []

    for i in range(len(search_directories)):
        files.append(sorted(glob.glob(os.path.join(search_directories[i], filename_filter))))

    files = [item for items in files for item in items] # shorthand to flatten the list of lists

    # Read in files and concatenate
    data = []

    for i in trange(len(files)):
        new_frame = pd.read_csv(files[i], header=0, parse_dates=['publishedAt'])
        new_frame.columns = new_frame.columns.str.replace(".", "_")
        data.append(new_frame)

    return pd.concat(data, sort=False)

In [51]:
!ls ../datastore/processed/

abc-news	  cnn	    msnbc     the-new-york-times
associated-press  fox-news  nbc-news  the-washington-post


In [57]:
data = headlines(
    ['../datastore/processed/abc-news/',
     '../datastore/processed/associated-press/',
    '../datastore/processed/fox-news/',
    '../datastore/processed/msnbc/',
    '../datastore/processed/nbc-news/',
    '../datastore/processed/the-new-york-times/',
    '../datastore/processed/the-washington-post/'],
    '*-2020-*-*')

data = data.set_index('publishedAt', drop=True)
data = data.drop_duplicates()

100%|██████████| 1591/1591 [00:05<00:00, 270.10it/s]


In [58]:
len(data)

246241

In [59]:
def subset_by_text(_data, includes=None, excludes=None, search_field="title"):
    """Count the number of occurrences of text in a dataframe column.
    
    Arguments:
        includes (list): a list of strings for which to search. Each string is searched
        via regex and the result is subsetted in the order they are provided.
        
        excludes (list): a list of strings to exclude. These are executed after the includes
        list, in the order they are provided, and are exclusive. E.g., to keep occurrences
        that have both a word in the includes and in the excludes, you need to devise a
        a single regex-style string and provide it via the includes list.
        
        sources (list): list of elements from source_id to include.
        
        frequency (string): passed to pandas.DataFrame.resample
        
        search_field (string): the column name to search
        
    Returns:
        (pd.DataFrame): a new dataframe containing the counts of results from each source.
    """
    if includes is not None:
        for include_word in includes:
            result = _data[_data[search_field].str.contains(include_word, case=False, na=False)].copy()
        description_text = "Result includes " + ", ".join('"' + include + '"' for include in includes)
        
    else:
        return

    if excludes is not None:
        for exclude_word in excludes:
            result = result[~result[search_field].str.contains(exclude_word, case=False, na=False)]
        
        description_text = description_text + " and excludes " + ", ".join('"' + exclude + '"' for exclude in excludes)

    result.desc = description_text
            
    return result

In [122]:
def count_by_source(_data, freq="D"):
    """Count the number of rows attribtued to each source_name."""
    
    grouper = _data.groupby([pd.Grouper(freq=freq), 'source_id'])
    time_group = grouper['title'].count().unstack('source_id').fillna(0)
    
    time_group.desc = _data.desc
    time_group.freq = freq
    
    return time_group

In [187]:
# retrieved on 2020.09.04 from https://www.adfontesmedia.com/interactive-media-bias-chart-2/
media_config = {
    "fox-news": {"label": "Fox News", "media_bias":22.23, "reliability":26.07, "color": "#E97452"},
    "associated-press": {"label": "Associated Press", "media_bias":-1.20, "reliability":53.71, "color": "#D3CECC"},
    "abc-news": {"label": "ABC News", "media_bias":-3.64, "reliability":49.45, "color": "#b6C7D5"},
    "the-washington-post": {"label": "The Washington Post", "media_bias":-6.13, "reliability":45.49,"color": "#87CEFB"},
    "the-new-york-times": {"label": "New York Times", "media_bias":-7.07, "reliability":47.49, "color": "#659BDF"},
    "nbc-news": {"label": "NBC News", "media_bias":-17.17, "reliability":38.15, "color": "#2234A8"},
    "msnbc": {"label": "MSNBC", "media_bias":-17.17, "reliability":38.15, "color": "#00008C"}
}
media_df = pd.DataFrame(media_config).transpose()

In [188]:
def bokeh_bar(_counts):
    
    bar_width_config = {
        "M":2000000000,
        "W":500000000,
        "D":50000000
    }
    
    print_time_config = {
        "M":"%B %Y",
        "W":"Week of %B %d, %Y",
        "D":"%B %d, %Y"
    }
    
    to_plot = _counts.copy()
    to_plot['showtime'] = to_plot.index.strftime(print_time_config[_counts.freq])

    source = ColumnDataSource(to_plot)

    categories = media_df.index
    labels = media_df['label'].to_list()
    
    tooltips = [
        ('Period', '@showtime'),
        ('Source', '$name'),
        ('Count', '@$name')
    ]
    tools = "xpan, xwheel_zoom, reset"

    p = figure(plot_width=900, plot_height=500, x_axis_type='datetime',
               tooltips = tooltips, tools=tools, y_range=DataRange1d(start=0),
               title=counts.desc)

    v = p.vbar_stack(categories, x='publishedAt', color=media_df['color'].to_list(),
                 width=bar_width_config[_counts.freq], source=source)

    legend = Legend(items=[(x, [v[i]]) for i, x in enumerate(labels)], location=(5, 100))
    p.add_layout(legend)
    p.legend[0].items.reverse()
    p.legend.location = "top_left"
    p.legend.border_line_color = None
    
    show(p)

In [189]:
subset = subset_by_text(data, includes=["virus|pandemic"])
counts = count_by_source(subset, freq="D")
bokeh_bar(counts)

In [190]:
subset = subset_by_text(data, includes=["BLM|Black Lives Matter"])
counts = count_by_source(subset, freq="D")
bokeh_bar(counts)

In [181]:
subset = subset_by_text(data, includes=["Kamala|Harris"], excludes=["Harrison", "Harris Falkner", "Colton Harris"])
counts = count_by_source(subset, freq="D")
bokeh_bar(counts)

In [182]:
def print_headlines(_df):
    print(len(_df), "headlines")
    print("=============")
    for i in range(len(_df)):
        print(i, '-', _df.iloc[i].name.strftime("%b %d, %Y"), "-", _df.iloc[i].title)

In [183]:
print_headlines(subset[subset["source_id"] == "the-new-york-times"])

107 headlines
0 - Jan 24, 2020 - Kamala Harris Is Said to Be Weighing an Endorsement of Joe Biden
1 - Feb 16, 2020 - Brian Noviski, Paul Harris
2 - Feb 18, 2020 - Jeremy O. Harris: Brandon Taylor ‘Subjugates Us With the Deft Hand of a Dom’
3 - Mar 08, 2020 - Kamala Harris Endorses Joe Biden for President
4 - Mar 17, 2020 - Barbara Harris, First Woman Ordained an Episcopal Bishop, Dies at 89
5 - Mar 22, 2020 - Brittany Harris, Naseem Beauchman
6 - May 11, 2020 - Why Kamala Harris Isn’t Clamoring to Be Biden’s Running Mate
7 - Jun 11, 2020 - Kamala Harris Is Done Explaining Racism
8 - Jun 12, 2020 - Kamala Harris, Front-runner (Again)
9 - Jun 22, 2020 - Maurice Harris Is a Superstar. Can He Bend Capitalism to His Talents?
10 - Jul 31, 2020 - Kamala Harris Pushes Back on Criticism of ‘Ambition’
11 - Aug 04, 2020 - Harris Faulkner, Working From Home in Shades of Blue
12 - Aug 04, 2020 - ‘Things Don’t Have to Be Matchy-Matchy’ and Other Design Tips From Harris Faulkner’s Mom
13 - Aug 09, 20

In [184]:
print_headlines(subset[subset["source_id"] == "fox-news"])

293 headlines
0 - Jan 03, 2020 - RP Harris to join with Nats after losing Game 7
1 - Jan 31, 2020 - Chiefs superfan Harris Faulkner ready for Super Bowl LIV victory: ‘50 years is a long time’
2 - Jan 31, 2020 - Chuck Schumer scolds Kamala Harris for laughing with Sherrod Brown at impeachment presser, goes viral
3 - Feb 05, 2020 - Dem Sen. Kyrsten Sinema stands to applaud Trump as Gillibrand, Harris stay seated at SOTU
4 - Feb 11, 2020 - Harris, No. 1 South Carolina earn first win over UConn 70-52
5 - Mar 08, 2020 - Kamala Harris becomes latest former candidate to endorse Joe Biden
6 - Mar 09, 2020 - What Harris, Buttigieg, Booker and other rivals said about Biden before endorsing him
7 - Mar 10, 2020 - Trevor Noah mocks Kamala Harris’ ‘hostage-style video’ endorsing Joe Biden
8 - Mar 11, 2020 - Harris Faulkner & Jimmy Talk Parenting During The Coronavirus Outbreak
9 - Mar 16, 2020 - Chris Harris Jr. finally hitting unfettered free agency
10 - Mar 18, 2020 - Tolman and Harris: Coronavir

In [191]:
subset = subset_by_text(data, includes=["Belarus"])
counts = count_by_source(subset, freq="D")
bokeh_bar(counts)

In [192]:
subset = subset_by_text(data, includes=["rips"], excludes=["strips"])
counts = count_by_source(subset, freq="W")
bokeh_bar(counts)

In [177]:
subset

Unnamed: 0_level_0,author,title,description,url,urlToImage,content,source_id,source_name
publishedAt,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2020-01-19 13:51:06+00:00,BILL BARROW Associated Press,Biden rips Sanders campaign for Social Securit...,Biden called for Sanders’ campaign to “disown”...,https://abcnews.go.com/Politics/wireStory/bide...,https://s.abcnews.com/images/Politics/WireAP_d...,"INDIANOLA, Iowa -- \r\nJoe Biden has called fo...",abc-news,ABC News
2020-02-05 04:00:42+00:00,Ella Torres,Nancy Pelosi rips up copy of State of the Unio...,Nancy Pelosi ripped up Trump's State of the Un...,https://abcnews.go.com/Politics/nancy-pelosi-r...,https://s.abcnews.com/images/Politics/sotu-pel...,In a moment that bookended an already divided ...,abc-news,ABC News
2020-02-13 00:00:15+00:00,Annette Gibbs,Expert Advice for Healthy Trips to Disney Parks,"TV personality, best-selling author and physic...",https://disneyparks.disney.go.com/blog/2020/02...,https://cdn1.parksmedia.wdprapps.disney.com/me...,"By submitting this form, you are granting Disn...",abc-news,ABC News
2020-02-19 20:58:04+00:00,MICHAEL LIEDTKE AP Business Writer,Judge rips PG&E for poor safety record leading...,A U.S. judge says executives for a California ...,https://abcnews.go.com/Business/wireStory/judg...,https://s.abcnews.com/images/US/WireAP_eff6339...,SAN FRANCISCO -- \r\nA U.S. judge ripped into ...,abc-news,ABC News
2020-02-19 20:19:33+00:00,MICHAEL LIEDTKE AP Business Writer,Judge rips PG&E for ghastly safety record lead...,A U.S. judge says executives for a California ...,https://abcnews.go.com/US/wireStory/judge-rips...,https://s.abcnews.com/images/US/WireAP_eff6339...,SAN FRANCISCO -- \r\nA U.S. judge ripped into ...,abc-news,ABC News
...,...,...,...,...,...,...,...,...
2020-06-04 00:03:42+00:00,Kevin Freking | AP,Ex-defense chief Mattis rips Trump for dividin...,Former defense secretary James Mattis is denou...,https://www.washingtonpost.com/world/national-...,https://www.washingtonpost.com/wp-apps/imrs.ph...,The criticism was all the more remarkable beca...,the-washington-post,The Washington Post
2020-06-12 11:04:37+00:00,Timothy Bella,Rips media...,"Performing in front of an intimate, socially d...",https://www.washingtonpost.com/nation/2020/06/...,https://www.washingtonpost.com/wp-apps/imrs.ph...,"Performing in front of an intimate, socially d...",the-washington-post,The Washington Post
2020-08-11 12:00:00+00:00,"Brady Dennis, Jeremy Duda, Joel Achenbach","With no end to the pandemic in sight, coronavi...","With no end to the pandemic in sight, coronavi...",https://www.washingtonpost.com/health/with-no-...,https://www.washingtonpost.com/wp-apps/imrs.ph...,"But Arizonas economic reopening in May, urged ...",the-washington-post,The Washington Post
2020-08-11 15:07:37+00:00,"Brady Dennis, Jeremy Duda and Joel Achenbach, ...","With no end to pandemic in sight, coronavirus ...",Gabe Rice began sheltering in his suburban Pho...,http://www.washingtonpost.com/health/with-no-e...,https://s.hdnux.com/photos/01/13/40/64/1979524...,Gabe Rice began sheltering in his suburban Pho...,the-washington-post,The Washington Post
