In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import re
import requests
import html
from bs4 import BeautifulSoup
import time

import spacy
import en_core_web_sm
nlp = en_core_web_sm.load()

from nltk.sentiment.vader import SentimentIntensityAnalyzer as sia



# Scraping the data

We will collect the headlines by parsing the HTML from Politico's search results, which can be directly manipulated through the URL. We want to extract the headlines from the 66 weeks starting on August 3, 2015 (the week of the first presidential debate) and ending on November 7, 2016 (the day before the election). First, we create a function to generate the start and end dates of each week.

In [2]:
def generate_dates(w):
    '''
    Return the start and end dates of week w as a tuple of YYYYMMDD strings.

    w is the number of weeks after August 3, 2015, which was the week of the first presidential debate.
    For example, w = 1 returns the tuple ('20150803', '20150810').
    The end date of week w is the same day as the start date of week w + 1.
    '''
    first_day = pd.Timestamp('20150803')
    one_week = pd.Timedelta('7 days')
    week_start = first_day + (w-1)*one_week
    week_end = first_day + w*one_week
    return (week_start.strftime('%Y%m%d'), week_end.strftime('%Y%m%d'))

To verify that our function works as intended, we use it to generate the dates for week 66, which should end on November 7, 2016.

In [3]:
# Sanity check: week 66 is the final week, so its end date should be 20161107.
generate_dates(66)

('20161031', '20161107')

Next, we must compile the headlines for a given date range. The function below breaks up the strings returned from the date tuple generated by the previous function and inserts them into the URL. (It can also accept any other date range given as integers in YYYYMMDD format.) It then loops through every page of the search results, finding the headlines and converting them from HTML to raw text. Since most headlines do not include ending punctuation, periods are added so that we can more easily parse them as sentences later. Every headline in the given date range is concatenated and returned as a string.

In [4]:
def get_headlines(start_date, end_date, timeout=30):
    '''
    Scrape Politico's search results and return every headline in a date range.

    Enter start and end dates in YYYYMMDD format (strings or integers).
    timeout is the number of seconds to wait on each HTTP request before
    requests raises an exception instead of hanging indefinitely.

    Returns one string in which every headline ends with '.', '?' or '!',
    headlines are separated by a single space, and straight apostrophes are
    replaced by right single quotation marks. Returns '' if nothing is found.
    '''
    start, end = str(start_date), str(end_date)
    url = r'http://www.politico.com/search/{}?s=oldest&adv=true&start={}%2F{}%2F{}&end={}%2F{}%2F{}&c=0000014b-324d-d4f3-a3cb-f3ff415e0035'
    sentences = []
    # 20000 is only a safety cap; the loop normally stops at the first page without headlines
    for page in range(1, 20000):
        response = requests.get(
            url.format(page, start[4:6], start[6:], start[:4], end[4:6], end[6:], end[:4]),
            timeout=timeout)
        soup = BeautifulSoup(response.text, "lxml")
        tags = soup.find_all('h3')[:-10]  # finds HTML tags for headlines; omits the last ten, which are frequently searched terms
        if not tags:  # once we go beyond the last page of search results, we end the loop
            break
        for tag in tags:
            # splitting '<h3><a ...>text</a></h3>' on angle brackets leaves the text at index 4
            parts = re.split(r'[><]', str(tag))
            if len(parts) < 5:
                continue
            headline = html.unescape(parts[4].strip())  # converts HTML entities to raw text
            if not headline:
                # e.g. an <h3> without a nested link; there is no text to keep,
                # and checking its last character would raise an IndexError
                continue
            if headline.endswith(('?', '.', '!')):  # if the headline does have ending punctuation, we don't add a period
                sentences.append(headline)
            else:
                sentences.append(headline + '.')
    # straight apostrophes (chr 39) become right single quotation marks (chr 8217);
    # strip() removes any whitespace at either end
    return ' '.join(sentences).replace(chr(39), chr(8217)).strip()

To demonstrate the output, we scrape the headlines for November 7, the day before the election.

In [5]:
# Live request to politico.com: the start and end dates are both November 7, 2016.
get_headlines(20161107, 20161107)

'Dems losing voting challenges but winning on optics. Clinton looks poised to lock it up. Bloomberg’s 2016 tally: $65 million and counting. A guide to the 2016 New York elections. How the latest rules from CMS may be sending mixed signals. The great Queens health care overhaul. Real estate eyes Rikers. Florida Higher Ed Watch: FGCU to narrow presidential candidates; New College board chair on growth plan. Bondi’s press office silent on why she ended ’Hot Topics’ talking points memo. Former U.S. attorney general Janet Reno dies. In campaign’s twilight, Trump basks in the glow of his rallies. Baker counts the hours as a memorable campaign wraps up. Young Dem seeks Essex seat as incumbent cites need for ’balance’. Opposition to Question 3: A campaign that never was. Brown presses hard to stop Prop 53 as supporter cries foul. City’s charter school sector has clout but no candidate for 2017. New poll: Clinton leads Trump by 3 nationwide. Exit polls under siege. In first interview after Brid

We will create a list to keep track of the headlines for each week. In this list, each entry is the string of all headlines from that week. To keep the indexing consistent (so that week *w* is stored at index *w*), the first entry will be a `None` placeholder.

In [6]:
# Index 0 holds a None placeholder so that the headlines for week w can be stored at index w.
headline_list = [None]

The list will be updated with the following function.

In [7]:
def gather_headline_strings(headline_list, w):
    '''
    Scrape the headlines for week w and store them at headline_list[w].

    headline_list is modified in place; nothing is returned. On a list that
    currently holds weeks 1 through w-1 (plus the placeholder at index 0) this
    is the same as appending. Storing by index instead of always appending
    keeps week w at index w even when a week is gathered again or out of
    order; any skipped positions are filled with None.
    '''
    start_date, end_date = generate_dates(w)
    week_text = get_headlines(start_date, end_date)
    # grow the list only after the scrape succeeded, so a failed request leaves it untouched
    while len(headline_list) <= w:
        headline_list.append(None)
    headline_list[w] = week_text

The last step is to save the headlines as text files. Doing so will allow us to read the headlines from our local environment instead of having to scrape them each time we run our code. This saves a tremendous amount of time and resources (it initially took us almost 30 minutes to scrape all of the headlines).

In [8]:
def save_headlines(w):
    '''
    Write the headlines for week w to processed/week{w}.txt, encoded as UTF-8.

    The text is read from the module-level headline_list[w]. The "processed"
    directory is created if it does not exist, and an existing file with the
    same name is overwritten. A failure to create the directory or write the
    file is reported as a warning instead of being raised, so that one bad
    file does not abort a long scraping run.
    '''
    import os
    import warnings

    path = 'processed/week{}.txt'.format(w)
    try:
        os.makedirs('processed', exist_ok=True)
        with open(path, 'w', encoding='utf-8') as f:
            f.write(headline_list[w])  # writes string to text file
    except IOError as err:
        warnings.warn('Could not save {}: {}'.format(path, err))

In [9]:
def gather_and_save(w):
    '''
    Scrape and save the headlines for weeks 1 through w, one week at a time.

    Each week is gathered into the module-level headline_list with
    gather_headline_strings and then written to disk with save_headlines.
    '''
    for week in range(1, w + 1):
        gather_headline_strings(headline_list, week)
        save_headlines(week)
        # Pause for ten seconds after each week: too many requests in a short
        # period of time can cause the server to return an error.
        time.sleep(10)

The text files have been included in the "data" folder of this repository. If you wish to run the scraper and generate the text files yourself, uncomment and run the cell below. The files will be created in the "processed" directory. **Be careful: this will overwrite any files with the same names in the folder. The process may take around 30 minutes, depending on your hardware.**

In [10]:
#gather_and_save(66)