In [328]:
from bs4 import BeautifulSoup
import numpy as np
from string import printable, ascii_letters
import requests
import csv
import time
from glob import glob

## Helper Functions

In [412]:
BASE_URL = 'http://bahiker.com'

AREAS = {
    'northbayhikes': 'North Bay',
    'southbayhikes': 'South Bay',
    'sfhikes': 'San Francisco',
    'eastbayhikes': 'East Bay',
}

DEPARTMENTS = {
    'CCWD':'Contra Costa Water District',
    'CSMP':'County of San Mateo Parks Department ',
    'CSP':'California State Parks',
    'EBRPD':'East Bay Regional Park District',
    'GGNRA':'Golden Gate National Recreation Area',
    'MCOSD':'Marin County Open Space District',
    'MROSD':'Midpeninsula Regional Open Space District',
    'MMWD':'Marin Municipal Water District',
    'NPS':'National Park Service',
    'SCCP':'Santa Clara County Parks',
    'SLT':'Solano Land Trust',
}

HIKE_INFO_PATHS = {
    'title': '/title.html',
    'location': '/location.html',
    'sidebar': '/sidebar.html',
}


def clean_text(instr):
    acceptable_letters = ascii_letters + ' '
    return ''.join([x if x in acceptable_letters else '' for x in instr])


def get_county(inelem):
    if type(inelem) == str:
        if 'county' in [e.lower() for e in inelem.split()]:
            return inelem.strip()
    elif type(inelem) == list:
        for elem in inelem:
            if get_county(elem):
                return elem.strip()


def get_department(inelem):
    if type(inelem) == str:
        department = DEPARTMENTS.get(inelem.strip())
        if department:
            return department.strip()            
    elif type(inelem) == list:
        for elem in inelem:
            department = get_department(elem)
            if department:
                return department

## Process Raw
### All Trails

In [241]:
soup = ''
with open('raw/trails.html', 'rb') as trails:
    soup = BeautifulSoup(trails, 'lxml')

In [242]:
all_trails = []
failures = []
for trail in soup.findAll('a'):
    # Ignore homepage links
    if trail.attrs['href'] != '/':
        try:
            extra_info = [clean_text(e) for e in trail.next.next.split(',')][1:]
        except TypeError as t:
            failures.append(t)
        all_trails.append({
                'uri': trail.attrs['href'],
                'name': clean_text(trail.text),
                'area': trail.attrs['href'][:trail.attrs['href'].find('/')],
                'county': get_county(extra_info),
                'park_department': get_department(extra_info),
            })

In [243]:
all_trails

[{'area': 'northbayhikes',
  'county': 'Marin County',
  'name': 'Abbotts Lagoon Trail',
  'park_department': 'National Park Service',
  'uri': 'northbayhikes/abbottslagoon.html'},
 {'area': 'southbayhikes',
  'county': 'Santa Clara County',
  'name': 'Acorn Trail',
  'park_department': None,
  'uri': 'southbayhikes/arastradero.html'},
 {'area': 'southbayhikes',
  'county': 'Santa Clara County',
  'name': 'Baylands Preserve',
  'park_department': None,
  'uri': 'southbayhikes/baylands.html'},
 {'area': 'southbayhikes',
  'county': 'Santa Clara County',
  'name': 'Hidden Villa',
  'park_department': None,
  'uri': 'southbayhikes/hiddenvilla.html'},
 {'area': 'southbayhikes',
  'county': 'Wunderlich County Park',
  'name': 'Alambique Trail',
  'park_department': 'County of San Mateo Parks Department',
  'uri': 'southbayhikes/wunderlich.html'},
 {'area': 'eastbayhikes',
  'county': 'Alameda County',
  'name': 'Coyote Hills Regional Park',
  'park_department': None,
  'uri': 'eastbayhikes/

In [244]:
with open('all_trails.csv', 'w') as csvfile:
    fieldnames = all_trails[1].keys()
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for trail in all_trails:
        writer.writerow(trail)

### Fetch Hike Sidebars

In [259]:
trails_paths = [t['uri'] for t in all_trails]
trails_paths

['northbayhikes/abbottslagoon.html',
 'southbayhikes/arastradero.html',
 'southbayhikes/baylands.html',
 'southbayhikes/hiddenvilla.html',
 'southbayhikes/wunderlich.html',
 'eastbayhikes/coyotehills.html',
 'eastbayhikes/quarrylakes.html',
 'eastbayhikes/diablofoothills.html',
 'southbayhikes/uvas.html',
 'northbayhikes/invalley.html',
 'southbayhikes/skylinealpine.html',
 'southbayhikes/coal.html',
 'southbayhikes/cmr.html',
 'northbayhikes/foothill.html',
 'northbayhikes/altobowl.html',
 'southbayhikes/russianridge.html',
 'southbayhikes/whsrhg.html',
 'southbayhikes/anonuevo2.html',
 'southbayhikes/butano.html',
 'southbayhikes/elsereno.html',
 'eastbayhikes/rockville.html',
 'eastbayhikes/mlkshoreline.html',
 'northbayhikes/hputnam.html',
 'northbayhikes/austincreek.html',
 'northbayhikes/rockspring.html',
 'northbayhikes/deerpark.html',
 'southbayhikes/purisimabald.html',
 'southbayhikes/sierraazul/sierraazul.html',
 'northbayhikes/sugarloaf.html',
 'northbayhikes/sptbarnabe.html

### Process Hike

In [372]:
def construct_url(hikepath, content='sidebar.html'):
    url_part_1 = hikepath[:hikepath.find('/')]
    url_part_2 = hikepath[hikepath.find('/')+1:hikepath.find('.')]
    url_part_3 = content
    return '/'.join([BASE_URL, url_part_1, url_part_2, url_part_3])


def fetch(item):
    print('fetching', item)
    response = requests.get(item)
    if response.status_code == 200:
        return response


def gen_filename(url_path):
    step_1 = url_path[url_path.find('.com/') + 5:]
    return step_1.replace('/', '_')


def filename_to_path(filename):
    step_1 = filename.replace('_sidebar', '')
    return step_1.replace('_', '/')


def find_by_text(soup, item):
    try:
        return soup.find(lambda x: x.text == item).nextSibling.nextSibling.nextSibling.strip()
    except Exception:
        pass



print(filename_to_path('northbayhikes_abbottslagoon_sidebar.html'))

northbayhikes/abbottslagoon.html


In [336]:
for trail in trails_paths[:2]:
    trail_url = construct_url(trail)
    try:
        res = fetch(trail_url)
        with open('raw/trails/' + gen_filename(trail_url), 'wb') as outfile:
            outfile.write(res.content)
    except Exception as e:
        print(e)
        
    time.sleep(2)

fetching http://bahiker.com/northbayhikes/abbottslagoon/sidebar.html
fetching http://bahiker.com/southbayhikes/arastradero/sidebar.html


In [337]:
glob('raw/trails/*.html')

['raw/trails/northbayhikes_abbottslagoon_sidebar.html',
 'raw/trails/southbayhikes_arastradero_sidebar.html']

In [397]:
trail_info = []
for file_name in glob('raw/trails/*.html')[:1]:
    with open(file_name, 'rb') as trail:
        sidebar = BeautifulSoup(trail, 'lxml')
        latitude = sidebar.findAll('kml')[0].text.strip()
        latitude = latitude[:latitude.find('"') + 2]
        longitude = sidebar.findAll('kml')[0].nextSibling.strip()
        traffic = find_by_text(sidebar, 'Trail traffic')
        hiking_time = find_by_text(sidebar, 'Hiking time')
        exposure = find_by_text(sidebar, 'Exposure')
        trail_info.append({
                'uri': filename_to_path(file_name),
                'longitude': longitude,
                'latitude': latitude,
                'traffic': traffic,
                'hiking_time': hiking_time,
                'exposure': exposure,
            })

trail_info

[{'exposure': 'Full sun.',
  'hiking_time': None,
  'latitude': '38° 7\'24.59"N',
  'longitude': '122°56\'8.49"W',
  'traffic': None,
  'uri': 'raw/trails/northbayhikes/abbottslagoon.html'}]

In [398]:
print(sidebar.find(lambda x: x.text == 'Exposure').nextSibling.nextSibling.nextSibling.strip())
print(find_by_text(sidebar, 'Exposure'))

Full sun.
Full sun.


In [431]:
FIELDS = ('exposure', 'trail surfaces',\
          'trail traffic', 'hiking time',\
          'season', 'gps coordinates* for trailhead',\
          'gas, food, and lodging', 'rules',\
          'the official story', 'trailhead details',\
          'map choices', 'distance, category, and difficulty')

for i in sidebar.findAll('i'):
    i.text.strip().lower()
    if i.text.strip().lower() in FIELDS:
        print(i.nextSibling.nextSibling.nextSibling.strip())

Very easy. This
Full sun.
Moderate.
Dirt trail that shifts to loose sand as the route reaches the turn-around 
point.
1 hour.
Good anytime, although often muddy in winter and early spring.
Latitude
Pay phone, stores, and restaurants back on Sir Francis Drake in Inverness. 
Gas in Point Reyes Station. There are overnight accommodations available 
on the eastern fringes of the park, including a handful of motels in Inverness, 
and numerous bed and breakfasts just off Sir Francis Drake. Point Reyes 
has several hike-in campgrounds -- inquire at the Point Reyes Ranger Station 
in Bear Valley, or read more about the options
No entrance or parking fees. Plenty of parking in a dirt lot. Pit toilets 
on site. There's a map under glass at the information signboard, but none 
to take with you. There are 2 designated handicapped parking spots, and 
the first 0.4 mile of the trail is wheelchair accessible. There is no direct 
public transit to this trailhead.
No dogs. No bikes. No horses

In [396]:
print(sidebar.find(lambda tag: tag.text == 'Hiking time').nextSibling.nextSibling.nextSibling.strip())
print(find_by_text(sidebar, 'Hiking time'))

1 1/2 hours.
1 1/2 hours.


In [403]:
sidebar

<html><body><i><font size="-1">In 
brief</font></i><font size="-1">:<br/>
2.3 mile out and back hike along a lagoon, leading to a sandy beach.<br/>
<br/>
<i>Distance, category, and difficulty</i>:<br/>
Very easy. This <b>2.3 mile</b> <b>out and back hike</b> with about 50 feet 
in elevation change is a good outing for beginners.<br/>
<br/>
<i>Exposure</i>: <br/>
Full sun.<br/>
<i><br/>
Trail traffic</i>:<br/>
Moderate.<i><br/>
<br/>
Trail surfaces</i>: <br/>
Dirt trail that shifts to loose sand as the route reaches the turn-around 
point.<br/>
<i><br/>
Hiking time</i>: <br/>
1 hour.<br/>
<i><br/>
Season</i>: <br/>
Good anytime, although often muddy in winter and early spring.<br/>
<br/>
<i>Getting there</i>:<br/>
<div class="google-maps">
<iframe allowfullscreen="" frameborder="0" height="240" src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d699.06832973644!2d-122.9360652417733!3d38.12366538038735!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x0000000000000000%3A0xbaaca1