# BFRO Site Scraper

This notebook scrapes Bigfoot sighting data from the Bigfoot Field Researchers Organization's (BFRO) report database, found at http://www.bfro.net/gdb/.

In [332]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random

In [198]:
# Fetch the BFRO geographic database landing page.
# A timeout keeps a stalled connection from hanging the notebook forever.
response = requests.get("http://www.bfro.net/gdb", timeout=30)
# Fail fast on an HTTP error instead of silently parsing an error page.
assert response.ok, "HTTP %s for the gdb landing page" % response.status_code
soup = BeautifulSoup(response.content, 'html.parser')

# Every anchor on the landing page that carries an href.
links = soup.find_all('a', href=True)

# Collect the state (US) and province (Canada) listing links.
# They are stored separately because US state pages link to county pages,
# and only those county pages link to the sighting reports.
canada_links = []
us_state_links = []

base_url = "http://www.bfro.net"

for a in links:
    href = a['href']
    if "state=ca-" in href:
        canada_links.append(base_url + href)
    elif "state=int" in href:
        # Deliberately skipped (presumably the international listing --
        # TODO confirm); must be tested before the generic "state" match.
        continue
    elif "state" in href:
        us_state_links.append(base_url + href)

# Guard against site layout changes: the rest of the notebook expects
# exactly this many state and province listing links.
assert len(us_state_links) == 49
assert len(canada_links) == 9

In [199]:
# Visit every US state page and gather the links to its county pages.
us_county_links = []

for url in us_state_links:

    # A timeout keeps one unresponsive page from stalling the whole crawl.
    response = requests.get(url, timeout=30)
    # Name the failing page so a broken run is easy to diagnose.
    assert response.ok, "HTTP %s for %s" % (response.status_code, url)

    soup = BeautifulSoup(response.content, 'html.parser')
    # Keep only anchors whose href mentions "county".
    anchor_tags = soup.find_all('a', href=lambda href: href and "county" in href)

    # Each href is appended to the gdb directory prefix to form the full URL.
    # Extending with an empty result is a no-op, so no emptiness guard is needed.
    us_county_links.extend(
        "http://www.bfro.net/gdb/" + a['href'] for a in anchor_tags
    )
        
    


In [323]:
# pull out links for the report pages
def get_report_urls(urls, base="http://www.bfro.net/gdb/", timeout=30):
    """Collect the sighting-report links found on each of the given pages.

    Args:
        urls: Iterable of page URLs to fetch and scan.
        base: Prefix prepended to every matching href to build the report
            URL. Defaults to the BFRO gdb directory.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of hanging the crawl.

    Returns:
        A list with one URL per anchor whose href contains
        ``show_report.asp?id``, ordered by input page and then by position
        in the document. Duplicates are not removed. An empty ``urls``
        yields an empty list without making any request.

    Raises:
        AssertionError: If a page responds with an HTTP error status.
        requests.exceptions.RequestException: If a request fails or times out.
    """
    report_urls = []
    for url in urls:
        response = requests.get(url, timeout=timeout)
        # Name the failing page so a broken run is easy to diagnose.
        assert response.ok, "HTTP %s for %s" % (response.status_code, url)
        soup = BeautifulSoup(response.content, 'html.parser')
        anchor_tags = soup.find_all('a', href=lambda href: href and 'show_report.asp?id' in href)

        # Extending with an empty result is a no-op, so no guard is needed.
        report_urls.extend(base + a['href'] for a in anchor_tags)
    return report_urls

# US reports are reached through the county pages; Canadian reports are
# linked straight from the province pages. Scan both sets in one pass,
# US first, so the resulting order matches two separate calls.
report_urls = get_report_urls(us_county_links + canada_links)


In [424]:
# Display how many report URLs were collected.
len(report_urls)

5345

In [422]:
def scrape_report_data(url, timeout=30):
    """Fetch one sighting-report page and pull its fields into a dict.

    Args:
        url: Address of the report page.
        timeout: Seconds passed to ``requests.get`` so that a stalled
            connection raises instead of hanging the crawl.

    Returns:
        A dict of strings. ``reportheader`` and ``reportclassification``
        hold the stripped text of the spans with those CSS classes, or
        ``"did not find "`` when the span is missing. Every other key
        (``year`` ... ``location_details``) holds the text that accompanies
        the matching field label, or ``""`` when the label is not on the page.

    Raises:
        SystemExit: If the request fails or times out.
    """
    # (result key, label text of the <span class="field"> element), listed
    # in the order the keys are inserted into the result.
    labelled_fields = (
        ('year', 'YEAR:'),
        ('season', 'SEASON:'),
        ('month', 'MONTH:'),
        ('state', 'STATE:'),
        ('county', 'COUNTY:'),
        ('nearest_town', 'NEAREST TOWN:'),
        ('observed', 'OBSERVED:'),
        ('also_noticed', 'ALSO NOTICED:'),
        ('other_witnesses', 'OTHER WITNESSES:'),
        ('other_stories', 'OTHER STORIES:'),
        ('time_and_conditions', 'TIME AND CONDITIONS:'),
        ('environment', 'ENVIRONMENT:'),
        ('country', 'COUNTRY:'),
        ('province', 'PROVINCE:'),
        ('location_details', 'LOCATION DETAILS:'),
    )

    report_dict = {}

    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # NOTE(review): this aborts the whole run on a single failed request;
        # confirm that is intended before scraping thousands of pages.
        raise SystemExit(e)

    # The HTTP status is not checked: an error page simply yields the
    # "not found" placeholders below.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Header information is stored in spans identified by their CSS class.
    for c in ('reportheader', 'reportclassification'):
        element = soup.find('span', {'class': c})
        if element:
            report_dict[c] = element.text.strip()
        else:
            report_dict[c] = "did not find "

    def extract(text):
        """Return the value next to the field label ``text``, or "" if absent."""
        element = soup.find('span', {'class': 'field'}, string=text)
        if not element:
            return ""
        # The label span's parent holds label and value together; remove the
        # label text and surrounding whitespace to leave only the value.
        return element.parent.text.replace(text, "").strip()

    for key, label in labelled_fields:
        report_dict[key] = extract(label)

    return report_dict

In [None]:
# Scrape every sighting report page. Each page yields one dict of fields;
# the dicts are accumulated in a list in the same order as report_urls.
# Appending inside an explicit loop keeps the records gathered so far
# available if the run is interrupted.
report_data = []
for report_url in report_urls:
    record = scrape_report_data(report_url)
    report_data.append(record)

4
