## Import libraries

In [1]:
import json
import requests
from bs4 import BeautifulSoup
import re
import os
import time
import csv
from collections import defaultdict
from datetime import date

## Define search parameters

In [35]:
# Base URL of the site being scraped
root = 'https://www.presidency.ucsb.edu'

# Each candidate is looked up by name and title, and documents are kept only
# when dated inside the [start, end] window:
#   start = day the candidate receives the party nomination
#   end   = election day
#
# Windows used for earlier elections (kept for reference, currently disabled):
# cand_dict = {1: {'first': 'George-w', 'last': 'Bush', 'title': 'President', 'start': date(2000, 8, 3), 'end': date(2000, 12, 12)},#2000
#              2: {'first': 'Albert', 'last': 'Gore-jr', 'title': 'VP', 'start': date(2000, 8, 17), 'end': date(2000, 12, 12)},
#              3: {'first': 'George-w', 'last': 'Bush', 'title': 'President', 'start': date(2004, 9, 2), 'end': date(2004, 11, 2)}, #2004
#              4: {'first': 'John-f', 'last': 'Kerry', 'title': 'Senator', 'start': date(2004, 7, 29), 'end': date(2004, 11, 2)},
#              5: {'first': 'Barack', 'last': 'Obama', 'title': 'President', 'start': date(2008, 8, 28), 'end': date(2008, 11, 3)},#2008
#              6: {'first': 'John', 'last': 'McCain', 'title': 'Senator', 'start': date(2008, 9, 4), 'end': date(2008, 11, 3)},
#              7: {'first': 'Barack', 'last': 'Obama', 'title': 'President', 'start': date(2012, 9, 6), 'end': date(2012, 11, 6)},#2012
#              8: {'first': 'Mitt', 'last': 'Romney', 'title': 'Governor', 'start': date(2012, 8, 30), 'end': date(2012, 11, 6)},
#              9: {'first': 'Donald-j', 'last': 'Trump', 'title': 'President', 'start': date(2016, 7, 21), 'end': date(2016, 11, 8)}, #2016
#              10: {'first': 'Hillary', 'last': 'Clinton', 'title': 'Secretary', 'start': date(2016, 7, 28), 'end': date(2016, 11, 8)}}

# Active configuration: the 2020 election
cand_dict = {
    1: dict(
        first='Donald-j',
        last='Trump',
        title='President',
        start=date(2020, 3, 17),
        end=date(2020, 11, 3),
    ),
    2: dict(
        first='Joseph-r',
        last='Biden',
        title='President',
        start=date(2020, 8, 18),
        end=date(2020, 11, 3),
    ),
}

## Scrape speeches; write text files and metadata

In [39]:
def _get_soup(url, timeout=30):
    """Fetch ``url`` and return the page parsed with BeautifulSoup ('html.parser').

    The timeout keeps a stalled connection from hanging the whole run, and
    ``raise_for_status`` turns an HTTP error response into an exception
    instead of letting an error page be parsed as if it were real content.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP 4xx/5xx status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return BeautifulSoup(response.text, 'html.parser')


# For every configured candidate: walk all listing pages, collect
# (link, title, date) for each document, then download the documents that fall
# inside the candidate's date window and look like speeches. Each speech is
# written to its own .txt file and its metadata row to a per-candidate CSV.
for cand, cand_info in cand_dict.items():
    last_name = cand_info['last']

    # create candidate specific url string
    if cand_info['title'] == "President":
        section = '/people/president/'
    else:
        section = '/people/other/'
    candidate = section + cand_info['first'] + '-' + last_name + '?page='
    candidate_iter = 0

    speech_path = 'speeches_' + last_name.lower() + '_ucsb'
    # exist_ok tolerates a directory left over from a previous run while still
    # surfacing real failures (e.g. permissions) right away.
    os.makedirs(speech_path, exist_ok=True)

    # first go into candidate page and locate the link to the final page
    first_page_soup = _get_soup(root + candidate.lower() + str(candidate_iter))
    last_page_link = first_page_soup.find('a', title='Go to last page')
    if last_page_link is not None:
        page_match = re.search(r'page=([0-9]+)', str(last_page_link))
    else:
        page_match = None
    # Fallback of 1 is kept from the original code for listings without a
    # "last page" link.
    max_p = int(page_match.group(1)) if page_match else 1

    # one [link, title, date] entry per listed document, in listing order
    metadata = []

    for page in range(candidate_iter, max_p + 1):
        # The first listing page was already downloaded above; reuse it.
        if page == candidate_iter:
            soup = first_page_soup
        else:
            soup = _get_soup(root + candidate.lower() + str(page))
        titles_raw = soup.find_all(class_="views-field views-field-title")
        dates_raw = soup.find_all(class_="field-docs-start-date-time")

        # Titles and dates are paired by position, so they must line up.
        assert len(titles_raw) == len(dates_raw), (
            'listing page %d: %d titles but %d dates'
            % (page, len(titles_raw), len(dates_raw))
        )

        links = []
        titles = []
        dates = []
        for title_cell, date_cell in zip(titles_raw, dates_raw):
            date_match = re.search(r'content="([0-9]{4})-([0-9]{2})-([0-9]{2})', str(date_cell))
            if date_match is None:
                raise ValueError('no ISO date found in listing element: ' + str(date_cell))
            year, month, day = (int(part) for part in date_match.groups())
            # Drop documents older than 1980. The link and title are dropped
            # together with the date so the three lists stay aligned.
            if year < 1980:
                continue
            links.append(title_cell.a['href'])
            # get_text() also copes with titles that contain nested markup.
            titles.append(title_cell.a.get_text())
            dates.append(date(year, month, day))

        metadata.extend(
            [link, title, speechdate]
            for link, title, speechdate in zip(links, titles, dates)
        )

    # start extracting speeches that fit our parameters
    window_start = cand_info['start']
    window_end = cand_info['end']

    meta = []  # also save meta of those speeches separately
    for entry in metadata:
        link, title, speechdate = entry[0], entry[1], entry[2]
        title_lower = title.lower()
        # only keep within the pre-specified dates, and only if it comes with
        # "remarks" (or is a nomination address)
        looks_like_speech = "remarks" in title_lower or (
            "address" in title_lower and "nomination" in title_lower
        )
        if not (window_start <= speechdate <= window_end and looks_like_speech):
            continue

        try:
            soup = _get_soup(root + link)
        except requests.RequestException as err:
            # One unreachable document should not discard the rest of the run.
            print('skipped (download failed): ' + link + ' - ' + str(err))
            continue

        content_tag = soup.find(class_='field-docs-content')
        if content_tag is None:
            print('skipped (no document text found): ' + link)
            continue

        # Documents without a location element, or whose link does not end in
        # a three-part slug, are skipped silently, as in the original code.
        location_tag = soup.find(class_="field-spot-state")
        slug_match = re.search(r'[\w]+-[\w]+-[\w]+$', link)
        if location_tag is None or slug_match is None:
            continue

        filename = last_name.lower() + '-speech-' + str(speechdate) + '-' + slug_match.group(0)
        print(filename)

        # Explicit UTF-8 so non-ASCII characters in a transcript cannot fail
        # on platforms whose default encoding is narrower.
        with open(os.path.join(speech_path, filename + '.txt'), 'w', encoding='utf-8') as text_file:
            text_file.write(content_tag.get_text())

        entry.append(location_tag.get_text())
        meta.append(entry)

    csv_name = "META" + str(last_name) + str(cand_info['start'])[2:4] + '.csv'
    # newline='' is what the csv module requires to avoid blank rows on Windows.
    with open(csv_name, 'w', newline='', encoding='utf-8') as csvfile:
        csvwriter = csv.writer(csvfile)
        # No header row is written; rows are [link, title, date, location].
        csvwriter.writerows(meta)

    


trump-speech-2020-03-17-press-briefing-3
trump-speech-2020-03-17-federal-governments-coronavirus
trump-speech-2020-03-18-press-briefing-4
trump-speech-2020-03-18-governments-response-the
trump-speech-2020-03-19-press-briefing-5
trump-speech-2020-03-19-preparedness-and-response
trump-speech-2020-03-20-press-briefing-6
trump-speech-2020-03-21-press-briefing-7
trump-speech-2020-03-22-press-briefing-8
trump-speech-2020-03-23-press-briefing-9
trump-speech-2020-03-24-the-coronavirus-pandemic
trump-speech-2020-03-24-press-briefing-10
trump-speech-2020-03-25-press-briefing-11
trump-speech-2020-03-26-press-briefing-12
trump-speech-2020-03-27-and-exchange-with
trump-speech-2020-03-27-press-briefing-13
trump-speech-2020-03-28-for-norfolk-virginia
trump-speech-2020-03-28-andrews-maryland-5
trump-speech-2020-03-28-station-norfolk-virginia
trump-speech-2020-03-29-federal-governments-coronavirus
trump-speech-2020-03-29-press-briefing-14
trump-speech-2020-03-30-press-briefing-15
trump-speech-2020-03-3

trump-speech-2020-08-29-with-reporters-lake
trump-speech-2020-08-29-with-reporters-orange
trump-speech-2020-09-01-andrews-maryland-3
trump-speech-2020-09-01-damage-kenosha-wisconsin
trump-speech-2020-09-01-tour-kenosha-wisconsin
trump-speech-2020-09-01-with-reporters-kenosha
trump-speech-2020-09-02-city-wilmington-north
trump-speech-2020-09-03-rally-latrobe-pennsylvania
trump-speech-2020-09-03-andrews-maryland-0
trump-speech-2020-09-04-minister-avdullah-hoti
trump-speech-2020-09-08-salem-north-carolina
trump-speech-2020-09-08-andrews-maryland-4
trump-speech-2020-09-08-states-outer-continental
trump-speech-2020-09-09-exchange-with-reporters
trump-speech-2020-09-10-rally-freeland-michigan
trump-speech-2020-09-10-andrews-maryland-5
trump-speech-2020-09-11-september-11-2001
trump-speech-2020-09-11-the-kingdom-bahrain
trump-speech-2020-09-11-t-patrick-payne
trump-speech-2020-09-12-rally-minden-nevada
trump-speech-2020-09-12-arrival-reno-nevada
trump-speech-2020-09-13-rally-henderson-nevada


In [37]:
# Debug inspection: displays the date elements of the last listing page parsed
# by the scraping loop. Relies on `dates_raw` still being in the kernel after
# that loop has run, so it fails on a fresh kernel if run on its own.
dates_raw

[<span class="date-display-single" content="2020-12-23T00:00:00+00:00" datatype="xsd:dateTime" property="dc:date">December 23, 2020</span>,
 <span class="date-display-single" content="2020-12-23T00:00:00+00:00" datatype="xsd:dateTime" property="dc:date">December 23, 2020</span>,
 <span class="date-display-single" content="2020-12-23T00:00:00+00:00" datatype="xsd:dateTime" property="dc:date">December 23, 2020</span>,
 <span class="date-display-single" content="2020-12-23T00:00:00+00:00" datatype="xsd:dateTime" property="dc:date">December 23, 2020</span>,
 <span class="date-display-single" content="2020-12-23T00:00:00+00:00" datatype="xsd:dateTime" property="dc:date">December 23, 2020</span>,
 <span class="date-display-single" content="2020-12-24T00:00:00+00:00" datatype="xsd:dateTime" property="dc:date">December 24, 2020</span>,
 <span class="date-display-single" content="2020-12-25T00:00:00+00:00" datatype="xsd:dateTime" property="dc:date">December 25, 2020</span>,
 <span class="date-d

In [34]:
# Debug check: number of links collected from the last listing page parsed by
# the scraping loop. Relies on `links` still being in the kernel after that
# loop has run, so it fails on a fresh kernel if run on its own.
len(links)

35