## Import libraries

In [1]:
import json
import requests
from bs4 import BeautifulSoup
import re
import os
import time
import csv
from collections import defaultdict
from datetime import date

## Define search parameters

In [2]:
# Base URL for scraping; candidate listing paths and speech paths are appended to it.
root = 'https://www.presidency.ucsb.edu'

# One entry per candidate per election cycle, keyed 1..10 in the order listed below.
# Each row is (first, last, title, start, end):
#   first/last - name parts used to build the candidate's page URL
#   title      - "President" selects the /people/president/ section, anything else /people/other/
#   start/end  - inclusive date window for speeches: from the day the candidate
#                receives the party nomination to election day
# NOTE(review): the 2000 rows end on Dec 12 and the 2008 rows end on Nov 3, which do not
# match the general election dates (Nov 7, 2000 / Nov 4, 2008) - confirm this is intended.
cand_dict = {
    key: dict(zip(('first', 'last', 'title', 'start', 'end'), row))
    for key, row in enumerate(
        [
            # 2000
            ('George-w', 'Bush', 'President', date(2000, 8, 3), date(2000, 12, 12)),
            ('Albert', 'Gore-jr', 'VP', date(2000, 8, 17), date(2000, 12, 12)),
            # 2004
            ('George-w', 'Bush', 'President', date(2004, 9, 2), date(2004, 11, 2)),
            ('John-f', 'Kerry', 'Senator', date(2004, 7, 29), date(2004, 11, 2)),
            # 2008
            ('Barack', 'Obama', 'President', date(2008, 8, 28), date(2008, 11, 3)),
            ('John', 'McCain', 'Senator', date(2008, 9, 4), date(2008, 11, 3)),
            # 2012
            ('Barack', 'Obama', 'President', date(2012, 9, 6), date(2012, 11, 6)),
            ('Mitt', 'Romney', 'Governor', date(2012, 8, 30), date(2012, 11, 6)),
            # 2016
            ('Donald-j', 'Trump', 'President', date(2016, 7, 21), date(2016, 11, 8)),
            ('Hillary', 'Clinton', 'Secretary', date(2016, 7, 28), date(2016, 11, 8)),
        ],
        start=1,
    )
}

## Scrape speeches, write text files and metadata

In [14]:
# Seconds to wait for the server on each HTTP request; without a timeout
# `requests.get` can block indefinitely on a stalled connection.
REQUEST_TIMEOUT = 30

# For every candidate/cycle in `cand_dict`:
#   1. walk the candidate's paginated listing and collect [link, title, date]
#      for each listed document,
#   2. download the documents that fall inside the campaign window and whose
#      title looks like a speech, writing one .txt file per speech,
#   3. write the metadata of the saved speeches to a per-cycle CSV.
for cand in cand_dict:
    info = cand_dict[cand]
    last_lower = info['last'].lower()

    # create candidate specific url string: title "President" maps to the
    # /people/president/ section, any other title to /people/other/
    section = 'president' if info['title'] == "President" else 'other'
    candidate = '/people/' + section + '/' + info['first'] + '-' + info['last'] + '?page='
    candidate_iter = 0  # index of the first listing page

    # output folder for this candidate's speech texts; the same folder is
    # reused when a candidate appears in more than one cycle
    speech_path = 'speeches_' + last_lower + '_ucsb'
    os.makedirs(speech_path, exist_ok=True)

    # first go into candidate page; this page must exist, so fail loudly on an
    # HTTP error instead of silently producing an empty result
    r = requests.get(root + candidate.lower() + str(candidate_iter), timeout=REQUEST_TIMEOUT)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # locate the final page: the pager's "last page" link carries the max page number
    fin = soup.find_all('a', title='Go to last page')
    try:
        max_p = int(re.findall(r'page=([0-9]+)', str(fin[0]))[0])
    except IndexError:
        # no pager link (or no page number in it): fall back to the original default
        max_p = 1

    # initialize metadata dict: running index -> [link, title, date]
    metadata = defaultdict(list)

    for page in range(candidate_iter, max_p + 1):
        # the first listing page was already downloaded above; reuse it
        if page != candidate_iter:
            # NOTE: not calling raise_for_status() here because the fallback
            # max_p above may point at a page that does not exist
            r = requests.get(root + candidate.lower() + str(page), timeout=REQUEST_TIMEOUT)
            soup = BeautifulSoup(r.text, 'html.parser')
        titles_raw = soup.find_all(class_="views-field views-field-title")
        dates_raw = soup.find_all(class_="date-display-single")

        links = [x.a['href'] for x in titles_raw]
        titles = [x.a.contents[0] for x in titles_raw]

        dates = []
        for x in dates_raw:
            date_raw = re.findall(r'content="([0-9]{4})-([0-9]{2})-([0-9]{2})', str(x))
            year, month, day = (int(part) for part in date_raw[0])
            # date elements before 1980 are dropped and do not count as document dates
            if year < 1980:
                continue
            dates.append(date(year, month, day))

        # every listed title must have exactly one date, otherwise rows would be misaligned
        assert len(links) == len(titles) == len(dates), (
            'listing page %d: %d links, %d titles, %d dates'
            % (page, len(links), len(titles), len(dates)))

        for link, title, speechdate in zip(links, titles, dates):
            metadata[len(metadata)] = [link, title, speechdate]

    ### start extracting speeches that fit our parameters
    speechnum = 0  # number of documents that matched the date/title filter

    meta = []  # also save meta of those speeches separately
    for i in range(len(metadata)):
        link, title, speechdate = metadata[i][:3]
        title_lower = title.lower()

        # only keep within the pre-specified dates (inclusive), and only if the
        # title says "remarks" or is a nomination "address"
        in_window = info['start'] <= speechdate <= info['end']
        is_speech = "remarks" in title_lower or ("address" in title_lower and "nomination" in title_lower)
        if not (in_window and is_speech):
            continue

        speechnum += 1
        r = requests.get(root + link, timeout=REQUEST_TIMEOUT)
        r.raise_for_status()
        soup = BeautifulSoup(r.text, 'html.parser')
        text = soup.find_all(class_='field-docs-content')[0].get_text()

        # speeches without a location field are skipped entirely
        spots = soup.find_all(class_="field-spot-state")
        if not spots:
            continue
        metadata[i].append(spots[0].get_text())

        # file name ends with the last three hyphen-separated words of the link;
        # links that do not end that way are skipped
        slug = re.findall(r'[\w]+-[\w]+-[\w]+$', link)
        if not slug:
            continue
        filename = last_lower + '-speech-' + str(speechdate) + '-' + slug[0]
        print(filename)

        # explicit UTF-8 so non-ASCII characters in a transcript cannot break
        # the write on platforms whose default encoding is narrower
        out_file = os.path.join(os.getcwd(), speech_path, filename + '.txt')
        with open(out_file, 'w', encoding='utf-8') as text_file:
            text_file.write(text)
        meta.append(metadata[i])

    # per-cycle metadata file, e.g. last name + two-digit start year; no header
    # row is written, columns are: link, title, date, location.
    # newline='' is required by the csv module to avoid blank rows on Windows.
    csv_name = "META" + str(info['last']) + str(info['start'])[2:4] + '.csv'
    with open(csv_name, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        # writing the data rows
        csvwriter.writerows(meta)

    


bush-speech-2000-08-03-national-convention-0
gore-jr-speech-2000-08-17-national-convention-los
gore-jr-speech-2000-09-29-chevy-chase-maryland
gore-jr-speech-2000-10-30-remarks-muskegon-michigan
gore-jr-speech-2000-11-01-remarks-kissimmee-florida
bush-speech-2004-09-02-convention-new-york
bush-speech-2004-09-03-west-allis-wisconsin
bush-speech-2004-09-03-remarks-moosic-pennsylvania
bush-speech-2004-09-03-rapids-iowa-1
bush-speech-2004-09-04-broadview-heights-ohio
bush-speech-2004-09-04-remarks-erie-pennsylvania
bush-speech-2004-09-04-remarks-kirtland-ohio
bush-speech-2004-09-05-parkersburg-west-virginia
bush-speech-2004-09-06-poplar-bluff-missouri
bush-speech-2004-09-07-remarks-columbia-missouri
bush-speech-2004-09-07-discussion-sedalia-missouri
bush-speech-2004-09-07-lees-summit-missouri
bush-speech-2004-09-08-remarks-intelligence-reform
bush-speech-2004-09-08-center-miami-florida
bush-speech-2004-09-09-remarks-johnstown-pennsylvania
bush-speech-2004-09-09-remarks-colmar-pennsylvania
b

obama-speech-2008-09-17-remarks-elko-nevada
obama-speech-2008-09-18-espanola-new-mexico
obama-speech-2008-09-19-coral-gables-florida
obama-speech-2008-09-20-beach-florida-1
obama-speech-2008-09-21-north-carolina-0
obama-speech-2008-09-22-green-bay-wisconsin
obama-speech-2008-09-23-tampa-florida-4
obama-speech-2008-09-23-president-ahmadinejads-remarks
obama-speech-2008-09-24-remarks-dunedin-florida
obama-speech-2008-09-25-new-york-city
obama-speech-2008-09-27-north-carolina-1
obama-speech-2008-09-28-remarks-detroit-michigan
obama-speech-2008-09-29-remarks-westminster-colorado
obama-speech-2008-09-30-reno-nevada-1
obama-speech-2008-10-01-crosse-wisconsin-1
obama-speech-2008-10-02-rapids-michigan-2
obama-speech-2008-10-03-remarks-abington-pennsylvania
obama-speech-2008-10-04-newport-news-virginia
obama-speech-2008-10-05-north-carolina-1
obama-speech-2008-10-08-indianapolis-indiana-5
obama-speech-2008-10-09-dayton-ohio-2
obama-speech-2008-10-10-chillicothe-ohio-1
obama-speech-2008-10-11-ph

romney-speech-2012-09-25-event-vandalia-ohio
romney-speech-2012-09-26-event-westerville-ohio
romney-speech-2012-09-26-event-toledo-ohio
romney-speech-2012-09-27-event-springfield-virginia
romney-speech-2012-09-28-event-wayne-pennsylvania
romney-speech-2012-10-01-denver-colorado-0
romney-speech-2012-10-04-committee-conference-denver
romney-speech-2012-10-04-event-fishersville-virginia
romney-speech-2012-10-05-event-abingdon-virginia
romney-speech-2012-10-05-st-petersburg-florida
romney-speech-2012-10-06-event-apopka-florida
romney-speech-2012-10-07-st-lucie-florida
romney-speech-2012-10-08-virginia-the-mantle
romney-speech-2012-10-08-newport-news-virginia
romney-speech-2012-10-09-van-meter-iowa
romney-speech-2012-10-09-event-cuyahoga-falls
romney-speech-2012-10-10-mount-vernon-ohio
romney-speech-2012-10-10-event-delaware-ohio
romney-speech-2012-10-10-event-sidney-ohio
romney-speech-2012-10-11-asheville-north-carolina
romney-speech-2012-10-12-event-richmond-virginia
romney-speech-2012-10

NameError: name 'directory' is not defined