## Import libraries

In [128]:
import json
import requests
from bs4 import BeautifulSoup
import re
import os
import time
import csv
from collections import defaultdict
from datetime import date

## Define search parameters

In [None]:
# base URL for scrapping
root = 'https://www.presidency.ucsb.edu'

# search based on name, title, and date
# date is between 1) receives party nomination and 2) election day 
cand_dict = {1: {'first': 'George-w', 'last': 'Bush', 'title': 'President', 'start': date(2000, 8, 3), 'end': date(2000, 12, 12)},#2000
             2: {'first': 'Albert', 'last': 'Gore-jr', 'title': 'VP', 'start': date(2000, 8, 17), 'end': date(2000, 12, 12)},
             3: {'first': 'George-w', 'last': 'Bush', 'title': 'President', 'start': date(2004, 9, 2), 'end': date(2004, 11, 2)}, #2004
             4: {'first': 'John-f', 'last': 'Kerry', 'title': 'Senator', 'start': date(2004, 7, 29), 'end': date(2004, 11, 2)},
             5: {'first': 'Barack', 'last': 'Obama', 'title': 'President', 'start': date(2008, 8, 28), 'end': date(2008, 11, 3)},#2008
             6: {'first': 'John', 'last': 'McCain', 'title': 'Senator', 'start': date(2008, 9, 4), 'end': date(2008, 11, 3)}, 
             7: {'first': 'Barack', 'last': 'Obama', 'title': 'President', 'start': date(2012, 9, 6), 'end': date(2012, 11, 6)},#2012
             8: {'first': 'Mitt', 'last': 'Romney', 'title': 'Governor', 'start': date(2012, 8, 30), 'end': date(2012, 11, 6)},
             9: {'first': 'Donald-j', 'last': 'Trump', 'title': 'President', 'start': date(2016, 7, 21), 'end': date(2016, 11, 8)}, #2016
             10: {'first': 'Hillary', 'last': 'Clinton', 'title': 'Secretary', 'start': date(2016, 7, 28), 'end': date(2016, 11, 8)}}

## Scrape, write txt and meta data

In [155]:
for cand in range(1,11):  
    #create candidate specific url string
    if cand_dict[cand]['title'] == "President":
        candidate = '/people/president/' + cand_dict[cand]['first'] + '-' + cand_dict[cand]['last'] + '?page=' 
    else:
        candidate = '/people/other/' + cand_dict[cand]['first'] + '-' + cand_dict[cand]['last'] + '?page=' 
    candidate_iter = 0

    speech_path = 'speeches_' + cand_dict[cand]['last'].lower() + '_ucsb'
    try:  
        os.mkdir(speech_path)
    except OSError:
        pass
    # first go into candidate page
    r = requests.get(root + candidate.lower() + str(candidate_iter))
    soup = BeautifulSoup(r.text, 'html.parser')
    # locate the final page
    try: 
        fin = soup.find_all('a', title = 'Go to last page')
     # find max page number
        max_p = int(re.findall(r'page=([0-9]+)', str(fin[0]))[0])
    except: 
        max_p = 1


    # initialize metadata dict
    metadata = defaultdict(list)

    for iter in range(candidate_iter, max_p + 1):

        r = requests.get(root + candidate.lower() + str(iter))
        soup = BeautifulSoup(r.text, 'html.parser')
        titles_raw = soup.findAll(class_="views-field views-field-title")
        dates_raw = soup.findAll(class_="date-display-single")

        
        links = []
        for x in titles_raw:
            links.append(x.a['href'])
        titles = []
        for x in titles_raw:
            titles.append(x.a.contents[0])
        dates = []
        for x in dates_raw: 
            date_raw = re.findall(r'content="([0-9]{4})-([0-9]{2})-([0-9]{2})', str(x))
#             print(int(date_raw[0][2]))
            if int(date_raw[0][0]) < 1980:
                continue
            speechdate = date(int(date_raw[0][0]), int(date_raw[0][1]), int(date_raw[0][2]))
            dates.append(speechdate)

        assert len(links) == len(titles) == len(dates)

        lengthbefore = len(metadata)

        for i in range(len(metadata), len(metadata) + len(links)):
            metadata[i] = [links[i - lengthbefore], titles[i - lengthbefore], dates[i - lengthbefore]]
            
    ### save all speeches in csv files
#     with open('metadata_all' + str(cand_dict[cand]['last']) + str(cand_dict[cand]['start'])[2:4] + '.csv', 'w') as csvfile:  
#     # creating a csv writer object  
#         csvwriter = csv.writer(csvfile)     
#     # writing the data rows  
#         csvwriter.writerows(metadata.values()) 

### start extracting speeches that fit our parameters
    speechnum = 0
    
    meta = [] # also save meta of those speeches separately 
    for i in range(0, len(metadata)):
        # only keep within the pre-specified dates, and only if it comes with "remarks"
        if metadata[i][2] >= cand_dict[cand]['start'] and metadata[i][2] <= cand_dict[cand]['end'] and any(re.match(regex, metadata[i][1]) for regex in ['[Rr]emarks','[Aa]ddress']):
            speechnum += 1
            meta.append(metadata[i])

            r = requests.get(root + metadata[i][0])
            soup = BeautifulSoup(r.text, 'html.parser')
            text_full = soup.findAll(class_='field-docs-content')[0].get_text()
            text = text_full
            try:
                filename = cand_dict[cand]['last'].lower() + '-speech-' + str(metadata[i][2]) + '-' + re.findall('[\w]+-[\w]+-[\w]+$', metadata[i][0])[0]
                print(filename)
            except:  
                continue
            with open(os.getcwd() + '/' + speech_path + '/' + filename + '.txt', 'w') as text_file:
                text_file.write(text)
        
    with open("META" + str(cand_dict[cand]['last']) + str(cand_dict[cand]['start'])[2:4] + '.csv', 'w') as csvfile:
        csvwriter = csv.writer(csvfile)   
        #write header
        csvwriter.wrtie
    # writing the data rows  
        csvwriter.writerows(meta) 

    


In [147]:
with open('metadata_allBush.csv') as f:
    reader = csv.reader(f)
    bush = list(reader)
    
with open('metadata_allGore-jr00.csv') as f:
    reader = csv.reader(f)
    gore = list(reader)    
    
with open('metadata_allObama.csv') as f:
    reader = csv.reader(f)
    obama = list(reader) 
    
with open('metadata_allKerry04.csv') as f:
    reader = csv.reader(f)
    kerry = list(reader) 
    
with open('metadata_allMcCain08.csv') as f:
    reader = csv.reader(f)
    mccain = list(reader) 

with open('metadata_allRomney12.csv') as f:
    reader = csv.reader(f)
    romney = list(reader) 
    
with open('metadata_allTrump16.csv') as f:
    reader = csv.reader(f)
    trump = list(reader)

with open('metadata_allClinton16.csv') as f:
    reader = csv.reader(f)
    clinton = list(reader)
    
meta_all = bush + gore+ obama + kerry + mccain + romney + trump + clinton

with open('metadata_all.csv', 'w') as csvfile:  
# creating a csv writer object  
    csvwriter = csv.writer(csvfile)     
# writing the data rows  
    csvwriter.writerows(meta_all) 


In [145]:
for cand in range(1,11):  
    

'00'

In [104]:
#re.findall('[Rr]emarks'|'[Aa]ddress', metadata[0][1])
#any (regex.match(metadata[0][1]) for regex in ['[Rr]emarks','[Aa]ddress'])
any(re.match(regex, metadata[0][1]) for regex in ['[Rr]emarks','[Aa]ddress'])


True

In [154]:
metadata[0]

['/documents/remarks-announcing-candidacy-for-the-republican-presidential-nomination-1',
 'Remarks Announcing Candidacy for the Republican Presidential Nomination',
 datetime.date(1999, 6, 12)]

In [152]:
metadata.values()



In [83]:
len(titles)

17

In [114]:
cand_dict[3]['last'].lower() + '-speech-' + str(metadata[14908][2]) + '-' + re.findall('[\w]+-[\w]+-[\w]+$', metadata[14908][0])[0]


'bush-speech-2016-02-16-charleston-south-carolina'

In [113]:
metadata[14908]

['/documents/remarks-campaign-rally-for-jeb-bush-charleston-south-carolina',
 'Remarks at a Campaign Rally for Jeb Bush in Charleston, South Carolina',
 datetime.date(2016, 2, 16)]

In [138]:
metadata.values()



In [135]:
x = csv.reader("metadata_all1.csv")

In [136]:
x


<_csv.reader at 0x143856e50>