In [1]:
import pandas as pd
import numpy as np
from tqdm import trange

import sys
import glob
import os

from bokeh.core.properties import value
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, HoverTool, DataRange1d, Legend
from bokeh.palettes import d3

# Render Bokeh figures inline in the notebook's output cells.
output_notebook()

In [2]:
def headlines(search_directories=('',), filename_filter="*"):
    """Load every CSV file matching a glob pattern into a single DataFrame.

    Arguments:
        search_directories (list): directory paths to search; glob wildcards are
        allowed. A single string is treated as one directory.

        filename_filter (string): glob pattern for the file names. It is joined
        onto each search directory with os.path.join.

    Returns:
        (pd.DataFrame): the rows of all matching files concatenated together
        (files sorted by path within each search directory). Any "." in a column
        name is replaced by "_" and the 'publishedAt' column is parsed as dates.

    Raises:
        ValueError: if no file matches any directory/filter combination.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(search_directories, str):
        search_directories = [search_directories]

    # Get a flat list of matching files, sorted within each search directory
    files = [
        path
        for directory in search_directories
        for path in sorted(glob.glob(os.path.join(directory, filename_filter)))
    ]

    # Fail with an actionable message rather than pd.concat's generic
    # "No objects to concatenate" (same exception type as before).
    if not files:
        raise ValueError(
            "No files matched filter {!r} in directories {!r}".format(
                filename_filter, list(search_directories)
            )
        )

    # Read in files and concatenate
    data = []

    for i in trange(len(files)):
        new_frame = pd.read_csv(files[i], header=0, parse_dates=['publishedAt'])
        # regex=False: the "." must be a literal dot, never the regex wildcard.
        new_frame.columns = new_frame.columns.str.replace(".", "_", regex=False)
        data.append(new_frame)

    return pd.concat(data, sort=False)

In [3]:
# Load all processed 2020 headline files, index the rows by publication time
# and drop repeated rows.
# NOTE(review): drop_duplicates compares column values only, so the
# publishedAt index is not part of the comparison once it has been set.
data = (
    headlines(['../datastore/processed/*/'], '*-2020-*-*')
    .set_index('publishedAt', drop=True)
    .drop_duplicates()
)

100%|██████████| 1696/1696 [00:07<00:00, 227.88it/s]


In [4]:
# Number of headline rows left after de-duplication.
len(data.index)

262498

In [5]:
def subset_by_text(_data, includes=None, excludes=None, search_field="title"):
    """Return the rows of a dataframe whose text column matches search terms.

    Arguments:
        _data (pd.DataFrame): the dataframe to filter. It is not modified.

        includes (list): a list of strings for which to search. Each string is
        treated as a case-insensitive regex, and the result is narrowed by each
        one in the order they are provided, so a row must match ALL of them.
        Use a single "a|b" pattern to match either of several words. A bare
        string is treated as a one-element list.

        excludes (list): a list of strings to exclude. These are executed after the includes
        list, in the order they are provided, and are exclusive. E.g., to keep occurrences
        that have both a word in the includes and in the excludes, you need to devise
        a single regex-style string and provide it via the includes list.

        search_field (string): the column name to search

    Returns:
        (pd.DataFrame): a copy of the matching rows, with a human-readable
        summary of the filter stored in its ``desc`` attribute. Returns None
        when no include terms are given (None or an empty list).
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(includes, str):
        includes = [includes]
    if isinstance(excludes, str):
        excludes = [excludes]

    # Nothing to search for: keep the existing "no includes -> None" contract
    # (an empty list used to crash on an unbound local instead).
    if not includes:
        return None

    # Narrow the running result with each include term; rows whose text is
    # missing (NaN) never match.
    result = _data
    for include_word in includes:
        result = result[result[search_field].str.contains(include_word, case=False, na=False)]
    description_text = "Result includes " + ", ".join('"' + include + '"' for include in includes)

    if excludes:
        for exclude_word in excludes:
            result = result[~result[search_field].str.contains(exclude_word, case=False, na=False)]

        description_text = description_text + " and excludes " + ", ".join('"' + exclude + '"' for exclude in excludes)

    # Copy so the ad-hoc attribute is set on an independent frame.
    result = result.copy()
    result.desc = description_text

    return result

In [6]:
def count_by_source(_data, freq="D"):
    """Count the number of rows attributed to each source_name per time period.

    Arguments:
        _data (pd.DataFrame): rows to count. The index is grouped with
        pd.Grouper(freq=freq), and the 'source_name' and 'title' columns are used.

        freq (string): passed to pandas.Grouper as the period length.

    Returns:
        (pd.DataFrame): one row per period and one column per source_name,
        holding the count of non-null titles (0 where a source has no rows in
        a period). The frame carries two ad-hoc attributes: ``desc`` (copied
        from ``_data.desc``, or "" if absent) and ``freq``.
    """

    grouper = _data.groupby([pd.Grouper(freq=freq), 'source_name'])
    time_group = grouper['title'].count().unstack('source_name').fillna(0)

    # pandas does not carry custom attributes through slicing/filtering, so a
    # frame derived from a subset may have lost .desc; fall back to an empty
    # description instead of raising AttributeError.
    time_group.desc = getattr(_data, "desc", "")
    time_group.freq = freq

    return time_group

In [21]:
def bokeh_bar(_counts):
    """Show a stacked bar chart of per-source counts over time.

    Arguments:
        _counts (pd.DataFrame): one row per time period and one column per
        source. It must carry two attributes: ``freq`` ("M", "W" or "D"), which
        selects the bar width and tooltip date format, and ``desc``, which is
        used as the plot title.

    Raises:
        KeyError: if ``_counts.freq`` is not one of "M", "W" or "D".
    """

    # Bar widths per frequency. On a datetime axis Bokeh measures widths in
    # milliseconds, so these are roughly 23 days, 5.8 days and 13.9 hours.
    bar_width_config = {
        "M":2000000000,
        "W":500000000,
        "D":50000000
    }

    # strftime formats for the period label shown in the hover tooltip.
    print_time_config = {
        "M":"%B %Y",
        "W":"Week of %B %d, %Y",
        "D":"%B %d, %Y"
    }

    # Work on a copy so the extra tooltip column never leaks into the caller's frame.
    to_plot = _counts.copy()
    to_plot['showtime'] = to_plot.index.strftime(print_time_config[_counts.freq])

    source = ColumnDataSource(to_plot)

    categories = list(_counts.keys())

    # The d3 palettes are only defined for sizes 3-10 (Category10) and 3-20
    # (Category20). Trim the smallest palette for 1-2 sources, and cycle the
    # largest for more than 20, instead of failing with a KeyError.
    n_categories = len(categories)
    if n_categories <= 10:
        colors = list(d3['Category10'][max(n_categories, 3)][:n_categories])
    elif n_categories <= 20:
        colors = list(d3['Category20'][n_categories])
    else:
        widest = d3['Category20'][20]
        colors = [widest[i % len(widest)] for i in range(n_categories)]

    # '$name' resolves to the hovered glyph's name (the source column), and
    # '@$name' to that column's value for the hovered bar.
    tooltips = [
        ('Period', '@showtime'),
        ('Source', '$name'),
        ('Count', '@$name')
    ]
    tools = "xpan, xwheel_zoom, reset"

    # Title comes from the frame that was passed in, not from a notebook global.
    # NOTE(review): plot_width/plot_height were renamed width/height in Bokeh 3;
    # kept as-is to match the Bokeh version this notebook was written against.
    p = figure(plot_width=900, plot_height=500, x_axis_type='datetime',
               tooltips = tooltips, tools=tools, y_range=DataRange1d(start=0),
               title=_counts.desc)

    # ColumnDataSource stores the DataFrame index under the index's name, or
    # under 'index' when the index is unnamed.
    x_field = _counts.index.name or 'index'

    v = p.vbar_stack(categories, x=x_field, color=colors,
                 width=bar_width_config[_counts.freq], source=source)

    legend = Legend(items=[(x, [v[i]]) for i, x in enumerate(categories)], location=(5, 100))
    p.add_layout(legend)
    p.legend.location = "top_left"
    p.legend.border_line_color = None

    show(p)

In [28]:
# Daily per-source counts of headlines mentioning "virus" or "pandemic".
subset = subset_by_text(_data=data, includes=["virus|pandemic"])
counts = count_by_source(_data=subset, freq="D")
bokeh_bar(_counts=counts)

In [18]:
# Daily per-source counts of headlines mentioning "BLM" or "Black Lives Matter".
subset = subset_by_text(_data=data, includes=["BLM|Black Lives Matter"])
counts = count_by_source(_data=subset, freq="D")
bokeh_bar(_counts=counts)

In [19]:
# Daily per-source counts of headlines about Kamala Harris. The excludes drop
# other people named Harris/Harrison. "Fau?lkner" matches the correct spelling
# of Fox News anchor Harris Faulkner as well as the earlier "Falkner" spelling,
# which on its own never matched her name.
subset = subset_by_text(data, includes=["Kamala|Harris"], excludes=["Harrison", "Harris Fau?lkner", "Colton Harris"])
counts = count_by_source(subset, freq="D")
bokeh_bar(counts)

In [10]:
def print_headlines(_df):
    """Print a numbered, dated list of the headlines in a dataframe.

    Arguments:
        _df (pd.DataFrame): rows to print. Each index label is formatted with
        strftime, and the text comes from the 'title' column.
    """
    total = len(_df)
    print(total, "headlines")
    print("=============")
    # iterrows yields (index label, row Series) pairs in positional order.
    for position, (stamp, row) in enumerate(_df.iterrows()):
        print(position, '-', stamp.strftime("%b %d, %Y"), "-", row.title)

In [11]:
# Build the Black Lives Matter subset explicitly rather than reusing the
# notebook-global `subset`, which holds whatever search ran last (the
# Kamala/Harris search when the notebook is run top to bottom).
blm_subset = subset_by_text(data, includes=["BLM|Black Lives Matter"])
print_headlines(blm_subset[blm_subset["source_id"] == "the-new-york-times"])

43 headlines
0 - May 31, 2020 - Corporate Voices Get Behind ‘Black Lives Matter’ Cause
1 - Jun 08, 2020 - BTS Fans Say They’ve Raised $1 Million for Black Lives Matter Groups
2 - Jun 09, 2020 - Poets Criticize Poetry Foundation’s Statement on Black Lives Matter
3 - Jun 09, 2020 - Dancing Bodies That Proclaim: Black Lives Matter
4 - Jun 10, 2020 - ‘Da 5 Bloods’ Review: Black Lives Mattered in Vietnam, Too
5 - Jun 10, 2020 - Premier League Captains Plan Show of Support for Black Lives Matter
6 - Jun 10, 2020 - Economics, Dominated by White Men, Is Roiled by Black Lives Matter
7 - Jun 12, 2020 - Starbucks Will Allow Employees to Wear Black Lives Matter Apparel
8 - Jun 13, 2020 - 5 Podcasts at the Intersection of Pride Month and the Black Lives Matter Movement
9 - Jun 13, 2020 - Cory Booker on Newark Pride, Black Lives Matter and ‘This Distraught Present’
10 - Jun 13, 2020 - How Black Lives Matter Reached Every Corner of America
11 - Jun 14, 2020 - Black Lives Matter, Reopening, Pride Mont

In [12]:
# Build the Black Lives Matter subset explicitly rather than reusing the
# notebook-global `subset`, which holds whatever search ran last (the
# Kamala/Harris search when the notebook is run top to bottom).
blm_subset = subset_by_text(data, includes=["BLM|Black Lives Matter"])
print_headlines(blm_subset[blm_subset["source_id"] == "fox-news"])

199 headlines
0 - Feb 12, 2020 - Black Lives Matter leader justifies rioting in interview with Tomi Lahren: 'Riot is the language of the unheard'
1 - Mar 02, 2020 - Los Angeles DA apologizes after husband points gun at Black Lives Matter protesters
2 - Jun 02, 2020 - Evangelical leaders support Black Lives Matter
3 - Jun 02, 2020 - Beauty blogger apologizes for blackface makeup to 'support' Black Lives Matter: report
4 - Jun 03, 2020 - Black Lives Matter plans armed 'peace officers' to deter police brutality, NY leader says
5 - Jun 04, 2020 - David Webb blasts Hollywood, Black Lives Matter's calls to defund police departments
6 - Jun 04, 2020 - Tomi Lahren spoke to Black Lives Matter leader calling for armed patrols to counter police brutality
7 - Jun 04, 2020 - Colts GM Chris Ballard asks why it's 'so freaking hard' for white people to say 'black lives matter'
8 - Jun 04, 2020 - Florida waitress writes uplifting 'Black Lives Matter' note on couple's receipt, pays bill
9 - Jun 05, 2020