In [481]:
from bs4 import BeautifulSoup, Tag
import numpy as np
from string import printable, ascii_letters, punctuation, digits
import requests
import csv
import time
from glob import glob

## Helper Functions

In [467]:
BASE_URL = 'http://bahiker.com'

AREAS = {
    'northbayhikes': 'North Bay',
    'southbayhikes': 'South Bay',
    'sfhikes': 'San Francisco',
    'eastbayhikes': 'East Bay',
}

DEPARTMENTS = {
    'CCWD':'Contra Costa Water District',
    'CSMP':'County of San Mateo Parks Department ',
    'CSP':'California State Parks',
    'EBRPD':'East Bay Regional Park District',
    'GGNRA':'Golden Gate National Recreation Area',
    'MCOSD':'Marin County Open Space District',
    'MROSD':'Midpeninsula Regional Open Space District',
    'MMWD':'Marin Municipal Water District',
    'NPS':'National Park Service',
    'SCCP':'Santa Clara County Parks',
    'SLT':'Solano Land Trust',
}

HIKE_INFO_PATHS = {
    'title': '/title.html',
    'location': '/location.html',
    'sidebar': '/sidebar.html',
}


def clean_text(instr):
    acceptable_letters = ascii_letters + ' ' + punctuation + digits
    return ''.join([x if x in acceptable_letters else '' for x in instr])


def get_county(inelem):
    if type(inelem) == str:
        if 'county' in [e.lower() for e in inelem.split()]:
            return inelem.strip()
    elif type(inelem) == list:
        for elem in inelem:
            if get_county(elem):
                return elem.strip()


def get_department(inelem):
    if type(inelem) == str:
        department = DEPARTMENTS.get(inelem.strip())
        if department:
            return department.strip()            
    elif type(inelem) == list:
        for elem in inelem:
            department = get_department(elem)
            if department:
                return department

## Process Raw
### All Trails

In [241]:
soup = ''
with open('raw/trails.html', 'rb') as trails:
    soup = BeautifulSoup(trails, 'lxml')

In [242]:
all_trails = []
failures = []
for trail in soup.findAll('a'):
    # Ignore homepage links
    if trail.attrs['href'] != '/':
        try:
            extra_info = [clean_text(e) for e in trail.next.next.split(',')][1:]
        except TypeError as t:
            failures.append(t)
        all_trails.append({
                'uri': trail.attrs['href'],
                'name': clean_text(trail.text),
                'area': trail.attrs['href'][:trail.attrs['href'].find('/')],
                'county': get_county(extra_info),
                'park_department': get_department(extra_info),
            })

In [243]:
all_trails

[{'area': 'northbayhikes',
  'county': 'Marin County',
  'name': 'Abbotts Lagoon Trail',
  'park_department': 'National Park Service',
  'uri': 'northbayhikes/abbottslagoon.html'},
 {'area': 'southbayhikes',
  'county': 'Santa Clara County',
  'name': 'Acorn Trail',
  'park_department': None,
  'uri': 'southbayhikes/arastradero.html'},
 {'area': 'southbayhikes',
  'county': 'Santa Clara County',
  'name': 'Baylands Preserve',
  'park_department': None,
  'uri': 'southbayhikes/baylands.html'},
 {'area': 'southbayhikes',
  'county': 'Santa Clara County',
  'name': 'Hidden Villa',
  'park_department': None,
  'uri': 'southbayhikes/hiddenvilla.html'},
 {'area': 'southbayhikes',
  'county': 'Wunderlich County Park',
  'name': 'Alambique Trail',
  'park_department': 'County of San Mateo Parks Department',
  'uri': 'southbayhikes/wunderlich.html'},
 {'area': 'eastbayhikes',
  'county': 'Alameda County',
  'name': 'Coyote Hills Regional Park',
  'park_department': None,
  'uri': 'eastbayhikes/

In [244]:
with open('all_trails.csv', 'w') as csvfile:
    fieldnames = all_trails[1].keys()
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for trail in all_trails:
        writer.writerow(trail)

### Fetch Hike Sidebars

In [259]:
trails_paths = [t['uri'] for t in all_trails]
trails_paths

['northbayhikes/abbottslagoon.html',
 'southbayhikes/arastradero.html',
 'southbayhikes/baylands.html',
 'southbayhikes/hiddenvilla.html',
 'southbayhikes/wunderlich.html',
 'eastbayhikes/coyotehills.html',
 'eastbayhikes/quarrylakes.html',
 'eastbayhikes/diablofoothills.html',
 'southbayhikes/uvas.html',
 'northbayhikes/invalley.html',
 'southbayhikes/skylinealpine.html',
 'southbayhikes/coal.html',
 'southbayhikes/cmr.html',
 'northbayhikes/foothill.html',
 'northbayhikes/altobowl.html',
 'southbayhikes/russianridge.html',
 'southbayhikes/whsrhg.html',
 'southbayhikes/anonuevo2.html',
 'southbayhikes/butano.html',
 'southbayhikes/elsereno.html',
 'eastbayhikes/rockville.html',
 'eastbayhikes/mlkshoreline.html',
 'northbayhikes/hputnam.html',
 'northbayhikes/austincreek.html',
 'northbayhikes/rockspring.html',
 'northbayhikes/deerpark.html',
 'southbayhikes/purisimabald.html',
 'southbayhikes/sierraazul/sierraazul.html',
 'northbayhikes/sugarloaf.html',
 'northbayhikes/sptbarnabe.html

### Process Hike

In [501]:
def construct_url(hikepath, content='sidebar.html'):
    url_part_1 = hikepath[:hikepath.find('/')]
    url_part_2 = hikepath[hikepath.find('/')+1:hikepath.find('.')]
    url_part_3 = content
    return '/'.join([BASE_URL, url_part_1, url_part_2, url_part_3])


def fetch(item):
    print('fetching', item)
    response = requests.get(item)
    if response.status_code == 200:
        return response


def gen_filename(url_path):
    step_1 = url_path[url_path.find('.com/') + 5:]
    return step_1.replace('/', '_')


def filename_to_path(filename):
    step_1 = filename.replace('_sidebar', '')
    return step_1.replace('_', '/')


def find_by_text(soup, item):
    try:
        return soup.find(lambda x: x.text == item).nextSibling.nextSibling.nextSibling.strip()
    except Exception:
        pass

    
def get_this_childs_text(bs):
    text = ''
    for child in bs:
        if isinstance(child, Tag):
            text += ' ' + child.text
    
    return text
    

print(filename_to_path('northbayhikes_abbottslagoon_sidebar.html'))

northbayhikes/abbottslagoon.html


In [519]:
logger = []
for trail in trails_paths:
    trail_url = construct_url(trail)
    try:
        res = fetch(trail_url)
        with open('raw/trails/' + gen_filename(trail_url), 'wb') as outfile:
            logger.append(trail_url)
            outfile.write(res.content)
    except Exception as e:
        print(e)
        
    time.sleep(10)

fetching http://bahiker.com/northbayhikes/abbottslagoon/sidebar.html
fetching http://bahiker.com/southbayhikes/arastradero/sidebar.html
fetching http://bahiker.com/southbayhikes/baylands/sidebar.html
fetching http://bahiker.com/southbayhikes/hiddenvilla/sidebar.html
fetching http://bahiker.com/southbayhikes/wunderlich/sidebar.html
fetching http://bahiker.com/eastbayhikes/coyotehills/sidebar.html
fetching http://bahiker.com/eastbayhikes/quarrylakes/sidebar.html
fetching http://bahiker.com/eastbayhikes/diablofoothills/sidebar.html
fetching http://bahiker.com/southbayhikes/uvas/sidebar.html
fetching http://bahiker.com/northbayhikes/invalley/sidebar.html
fetching http://bahiker.com/southbayhikes/skylinealpine/sidebar.html
fetching http://bahiker.com/southbayhikes/coal/sidebar.html
fetching http://bahiker.com/southbayhikes/cmr/sidebar.html
fetching http://bahiker.com/northbayhikes/foothill/sidebar.html
fetching http://bahiker.com/northbayhikes/altobowl/sidebar.html
fetching http://bahiker.c

In [520]:
glob('raw/trails/*.html')

['raw/trails/eastbayhikes_bcbriones_sidebar.html',
 'raw/trails/eastbayhikes_bdm_sidebar.html',
 'raw/trails/eastbayhikes_bishop_sidebar.html',
 'raw/trails/eastbayhikes_bortmeadow_sidebar.html',
 'raw/trails/eastbayhikes_briones_sidebar.html',
 'raw/trails/eastbayhikes_carquinez_sidebar.html',
 'raw/trails/eastbayhikes_carquinezwest_sidebar.html',
 'raw/trails/eastbayhikes_ccct_sidebar.html',
 'raw/trails/eastbayhikes_coyotehills_sidebar.html',
 'raw/trails/eastbayhikes_delvalle_sidebar.html',
 'raw/trails/eastbayhikes_dewf_sidebar.html',
 'raw/trails/eastbayhikes_diablofoothills_sidebar.html',
 'raw/trails/eastbayhikes_garin_sidebar.html',
 'raw/trails/eastbayhikes_hiddenbrooke_sidebar.html',
 'raw/trails/eastbayhikes_hrs_sidebar.html',
 'raw/trails/eastbayhikes_huckleberry_sidebar.html',
 'raw/trails/eastbayhikes_joaquinmiller_sidebar.html',
 'raw/trails/eastbayhikes_kengrove_sidebar.html',
 'raw/trails/eastbayhikes_lafres_sidebar.html',
 'raw/trails/eastbayhikes_lastrampas_sidebar.

In [547]:
FIELDS = ('exposure', 'trail surfaces',\
          'trail traffic', 'hiking time',\
          'season', 'gas, food, and lodging', 'rules',\
          'trailhead details', 'map choices',\
          'distance, category, and difficulty')
trail_info = []
for file_name in glob('raw/trails/*.html')[:1]:
    with open(file_name, 'rb') as trail:
        sidebar = BeautifulSoup(trail, 'lxml')
        row = {
            'uri': filename_to_path(file_name.replace('raw/trails/', '')),
        }
        for i in sidebar.findAll('i'):
            i.text.strip().lower()
            if i.text.strip().lower() in FIELDS:
                print([s for s in i.strings])
                try:
                    val = clean_text(i.nextSibling.nextSibling.nextSibling)
                except Exception:
                    val = ''
                row[i.text.strip().lower()] = val
        trail_info.append(row)
        
trail_info

['Distance, category, and difficulty']
['Exposur', 'e']
['Exposur', 'e']
['Exposur', 'e']
['Exposur', 'e']
['\n', '\r\nTrail traffic']
['\n', '\r\nTrail traffic']
['\n', '\r\nTrail traffic']
['\n', '\r\nTrail traffic']
['\r\nTrail traffic']
['\n', '\r\nTrail surfaces']
['\n', '\r\nTrail surfaces']
['\n', '\r\nTrail surfaces']
['\n', '\r\nTrail surfaces']
['\n', '\r\nTrail surfaces']
['\n', '\r\nHiking time']
['\r\nHiking time']
['\r\nHiking time']
['\r\nHiking time']
['\r\nHiking time']
['\n', '\r\nSeason']
['\n', '\r\nSeason']
['\n', '\r\nSeason']
['\n', '\r\nSeason']
['\n', '\r\nSeason']
['Gas, food, and lodging']
['Trailhead details']
['Rules']


[{'distance, category, and difficulty': 'This ',
  'exposure': '',
  'gas, food, and lodging': 'Pay phone, stores, gas, and restaurants back near CA 24 in Orinda. No individual camp sites in the park, although there is a group camp. Nearest campgrounds are in ',
  'hiking time': '',
  'rules': 'Most trails are multi-use. A few are open to equestrians and hikers only, and one trail is designated hiking only. Dogs are permitted. Park is open from 8 a.m. to 10 p.m.',
  'season': '',
  'trail surfaces': '',
  'trail traffic': '',
  'trailhead details': 'Parking fee of $5 charged when entrance kiosk is staffed. $2 dog fee. Lots of parking. Pit toilets on site, but no drinking water. Maps available at the information signboard near the start of the trail. There is no direct public transportation to this trailhead, but you can walk (or cycle) into the park from BART: visit ',
  'uri': 'eastbayhikes/bcbriones.html'}]