In [None]:
import numpy as np
from bs4 import BeautifulSoup
from urllib2 import urlopen
import pandas as pd

## Mormon Pioneer Overland Travel Database

Pseudocode for the scraper:

- go to company list
- scrape all companies
- for each company:
    - record info (unique id, # of pioneers, departure date, arrival date)
    - go to company page
    - record more info (type, category, direction, captain)
    - then go to roster tab
    - scrape each person
    - for each person:
        - record info (name, age, birthdate, deathdate)
        - go to person page
        - scrape FamilySearch ID (if it's there)
        - scrape gender

### Example: Company List

In [None]:
# Site root; the hrefs scraped from the pages are appended to this.
baseurl = "https://history.lds.org"

In [None]:
# Download and parse the company index page.
companies_url = baseurl + '/overlandtravel/companies'
page = urlopen(companies_url)
soup = BeautifulSoup(page)

In [None]:
for row in soup.tbody.contents[1::2]:
    print row.a.text.strip()
    href = row.a.get('href')
    print href.split('/')[-2]
    print href
    print row.find(class_='sort-by-number-of-pioneers').get('data-value')
    print row.find(class_='sort-by-departure-date').get('data-value')
    print row.find(class_='sort-by-arrival-date').get('data-value') 
    print

### Example: Company page

In [None]:
# Download and parse a single company page.
company_path = '/overlandtravel/companies/402/horace-m-alexander-company'
page = urlopen(baseurl + company_path)
soup = BeautifulSoup(page)

In [None]:
for e in soup.find(class_='sidebar-block').find_all('dl'):
    if e.dt.text == 'Type':
        print 'Type:', e.dd.text
    elif e.dt.text == 'Category':
        print 'Category:', e.dd.text
    elif e.dt.text == 'Direction':
        print 'Direction:', e.dd.text
    elif e.dt.text == 'Captain':
        print 'Captain:', e.dd.text

In [None]:
for e in soup.find(id='roster').tbody.contents[1::2]:
    print e.prettify()
    print 'Name:', e.find(class_='sort-by-name').text.strip()
    print 'Age:', e.find(class_='sort-by-age').text.strip()
    print 'Birthdate:', e.find(class_='sort-by-birthdate').get('data-value')
    print 'Deathdate:', e.find(class_='sort-by-deathdate').get('data-value')
    print e.a.get('href')
    print

### Example: Person page with FamilySearch ID

In [None]:
# Download and parse one pioneer page for the "with FamilySearch ID" example.
# Only a single page is requested: fetching a second URL into the same `page`
# variable would discard the first response unparsed and waste a request.
# Another page that can be swapped in here:
#   baseurl + '/overlandtravel/pioneers/42724/horace-martin-alexander'
page = urlopen(baseurl + '/overlandtravel/pioneers/42735/newman-bulkley')
soup = BeautifulSoup(page)

In [None]:
print soup.find(class_='database-icon--container').a.get('href') 
print 'Gender:', soup.find(text='Gender').parent.parent.dd.text

### Example: Person page without FamilySearch ID

In [None]:
# Download and parse a pioneer page for the "without FamilySearch ID" example.
person_path = '/overlandtravel/pioneers/25436/brother-hamer'
page = urlopen(baseurl + person_path)
soup = BeautifulSoup(page)

In [None]:
print soup.find(title='Find this person in FamilySearch')
print 'Gender:', soup.find(text='Gender').parent.parent.dd.text

### Example: Person page with multiple links

In [None]:
# Download and parse a pioneer page for the "multiple links" example.
person_path = '/overlandtravel/pioneers/16046/thomas-colborn'
page = urlopen(baseurl + person_path)
soup = BeautifulSoup(page)

In [None]:
print soup.find(title='Find this person in FamilySearch').parent.get('href')

# Let's get scraping

In [None]:
# Site root for the full scrape below; scraped hrefs are appended to it.
baseurl = "https://history.lds.org"

In [None]:
total_ppl = 0

page = urlopen(baseurl + '/overlandtravel/companies')
soup = BeautifulSoup(page)
for row in soup.tbody.contents[1::2]:
    group_size = row.find(class_='sort-by-number-of-pioneers').get('data-value')
    if group_size:
        total_ppl += int(group_size)
    
print total_ppl

In [None]:
page = urlopen(baseurl + '/overlandtravel/companies')
soup = BeautifulSoup(page)

company_data = []
people_data = []
ppl = 0

for row in soup.tbody.contents[1::2]:
    c_name = row.a.text.strip()
    c_href = row.a.get('href')
    c_id = c_href.split('/')[-2]
    group_size = row.find(class_='sort-by-number-of-pioneers').get('data-value')
    departure_date = row.find(class_='sort-by-departure-date').get('data-value')
    arrival_date = row.find(class_='sort-by-arrival-date').get('data-value') 
    
    company_soup = BeautifulSoup(urlopen(baseurl + c_href))
    print 'scraping', c_name
    for e in company_soup.find(class_='sidebar-block').find_all('dl'):
        if e.dt.text == 'Type':
            c_type = e.dd.text
        elif e.dt.text == 'Category':
            c_category = e.dd.text
        elif e.dt.text == 'Direction':
            c_direction = e.dd.text
        elif e.dt.text == 'Captain':
            c_captain = e.dd.text
    
    c_row = [c_name, group_size, departure_date, arrival_date,
             c_type, c_category, c_direction, c_captain, c_id]
    
    company_data.append(c_row)
    
    # iterate through people
    for e in company_soup.find(id='roster').tbody.contents[1::2]:
        ppl += 1
        p_name = e.find(class_='sort-by-name').text.strip()
        p_age = e.find(class_='sort-by-age').text.strip()
        p_birthdate = e.find(class_='sort-by-birthdate').get('data-value')
        p_deathdate = e.find(class_='sort-by-deathdate').get('data-value')
        p_href = e.a.get('href')
        
        # follow p_href and scrape gender, familysearch ID
        while True:
            try:
                person_soup = BeautifulSoup(urlopen(baseurl+p_href))
                break
            except:
                print 'trying', p_name, 'again'
        print '\t', 100*ppl/total_ppl, '%', 'scraping', p_name, ':', 
        
        fs = person_soup.find(title='Find this person in FamilySearch')
        p_fs_href = fs.parent.get('href') if fs else ''
        
        p_gender = person_soup.find(text='Gender').parent.parent.dd.text
        
        
        p_row = [p_name, p_age, p_birthdate, p_deathdate, p_gender, p_fs_href, c_id]
        people_data.append(p_row)
        #print p_row
    
    

In [None]:
# Tabulate the scraped pioneers. Column names follow the order in which each
# row list was assembled during the scrape.
pd.DataFrame(
    people_data,
    columns=['name', 'age', 'birthdate', 'deathdate',
             'gender', 'familysearch_href', 'company_id'],
)

In [None]:
# URL of the most recent pioneer page the scrape loop built; handy for
# checking where a scrape stopped.
last_person_url = baseurl + p_href
last_person_url

In [None]:
# Tabulate the scraped companies. Column names follow the order in which each
# row list was assembled during the scrape.
pd.DataFrame(
    company_data,
    columns=['name', 'group_size', 'departure_date', 'arrival_date',
             'type', 'category', 'direction', 'captain', 'company_id'],
)