# BFRO Site Scraper

This notebook scrapes Bigfoot sighting data from the HTML report pages of the Bigfoot Field Researchers Organization (BFRO), found at http://www.bfro.net/gdb/. We first request the HTML of the '/gdb' page, which is the main page for the database of records, and pull all anchor tags that have an href. For Canada this page has links for the provinces, and for the US it has links for every state except Hawaii. We then loop through all the state links to get the links for each county in each state, and loop through each county to pull out all report links. For Canada the report links are right on the province pages and are not broken down further. The Canadian and US report links are combined, then we loop through them, request the page for each report, and pull out the data fields from the HTML.
Data is pulled from every report and stored in a list of dictionaries which is converted to a Pandas DataFrame then stored as a CSV file for further cleaning and EDA.

In [1]:
# Import libraries
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random

In [198]:
# Root of the BFRO site; the relative hrefs scraped below are appended to it
base_url = "http://www.bfro.net"

# Fetch and parse the geographical database index page
response = requests.get(base_url + "/gdb")
soup = BeautifulSoup(response.content, 'html.parser')

# Every anchor tag on the page that carries an href attribute
links = soup.find_all('a', href=True)

# Split the region links into Canadian province links and US state links.
# They are kept apart because the US links need one more round of scraping
# (state -> county) than the Canadian ones. Links containing "state=int"
# are deliberately left out of both lists.
hrefs = [a['href'] for a in links]

canada_links = [base_url + href for href in hrefs if "state=ca-" in href]
us_state_links = [
    base_url + href
    for href in hrefs
    if "state" in href
    and "state=ca-" not in href
    and "state=int" not in href
]

# Sanity checks on the number of region links found on the index page
assert len(us_state_links) == 49
assert len(canada_links) == 9

In [199]:
# Visit every US state page and collect the links to its county pages
us_county_links = []

for url in us_state_links:
    response = requests.get(url)
    assert response.ok

    soup = BeautifulSoup(response.content, 'html.parser')

    # Keep anchors whose href is present and contains "county", and turn
    # each relative href into a full URL under /gdb/
    us_county_links.extend(
        "http://www.bfro.net/gdb/" + a['href']
        for a in soup.find_all('a', href=lambda value: value and "county" in value)
    )
        
    


In [323]:
# Collect the links to individual report pages from a set of listing pages
def get_report_urls(urls):
    """Return the full URLs of all report pages linked from the given pages.

    Each URL in ``urls`` is requested and parsed; every anchor whose href
    contains 'show_report.asp?id' is turned into a full URL under
    "http://www.bfro.net/gdb/". Results are returned in page order, then
    in the order the anchors appear on each page.

    Raises AssertionError if any request does not come back with an OK status.
    """
    prefix = "http://www.bfro.net/gdb/"

    def is_report_href(href):
        # href is None for anchors without the attribute, hence the guard
        return href and 'show_report.asp?id' in href

    collected = []
    for page_url in urls:
        page = requests.get(page_url)
        assert page.ok
        parsed = BeautifulSoup(page.content, 'html.parser')
        collected.extend(
            prefix + tag['href']
            for tag in parsed.find_all('a', href=is_report_href)
        )
    return collected

# Gather report links from the US county pages first, then from the Canadian
# province pages, giving a single list of URLs for every report that was found
report_urls = [
    *get_report_urls(us_county_links),
    *get_report_urls(canada_links),
]


In [424]:
# Number of report URLs collected from the US county and Canadian pages
len(report_urls)

5345

In [422]:
def scrape_report_data(url):
    """Scrape a single sighting report page into a flat dictionary.

    Parameters
    ----------
    url : str
        URL of the report page to request.

    Returns
    -------
    dict
        * 'reportheader' and 'reportclassification': stripped text of the
          first <span> with that class, or "did not find " when the page
          has no such span.
        * 'year', 'season', 'month', 'state', 'county', 'nearest_town',
          'observed', 'also_noticed', 'other_witnesses', 'other_stories',
          'time_and_conditions', 'environment', 'country', 'province',
          'location_details': text of the element containing the matching
          field label, with the label removed and whitespace stripped, or
          "" when the label is not on the page.
    """
    # NOTE(review): the response status is not checked here, so a failed
    # request is parsed like any other page - confirm this best-effort
    # behaviour is intended.
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    report_dict = {}

    # Header information lives in span elements with a class of either
    # reportheader or reportclassification.
    for c in ('reportheader', 'reportclassification'):
        element = soup.find('span', {'class': c})
        report_dict[c] = element.text.strip() if element else "did not find "

    def extract(text):
        """Return the value that follows the field label ``text``, or ""."""
        element = soup.find('span', {'class': 'field'}, string=text)
        if not element:
            return ""
        # The label span sits inside the element holding the value, so take
        # the parent's text and drop the label itself.
        return element.parent.text.replace(text, "").strip()

    # Output key -> label as it appears on the report page. The order here
    # fixes the key order of the returned dictionary.
    fields = (
        ('year', 'YEAR:'),
        ('season', 'SEASON:'),
        ('month', 'MONTH:'),
        ('state', "STATE:"),
        ('county', "COUNTY:"),
        ('nearest_town', "NEAREST TOWN:"),
        ('observed', "OBSERVED:"),
        ('also_noticed', "ALSO NOTICED:"),
        ('other_witnesses', "OTHER WITNESSES:"),
        ('other_stories', "OTHER STORIES:"),
        ('time_and_conditions', "TIME AND CONDITIONS:"),
        ('environment', "ENVIRONMENT:"),
        ('country', "COUNTRY:"),
        ('province', "PROVINCE:"),
        ('location_details', "LOCATION DETAILS:"),
    )
    for key, label in fields:
        report_dict[key] = extract(label)

    return report_dict

In [425]:
# Request every sighting report page and scrape it into a dictionary,
# building one list with a result per report URL.
# This takes a while, since each report is a separate request.
report_data = [scrape_report_data(report_url) for report_url in report_urls]

In [427]:
# Number of scraped report dictionaries (one is produced per report URL)
len(report_data)

5345

In [429]:
# Build a DataFrame with one row per scraped report, then preview 10 random rows
df = pd.DataFrame(report_data)
df.sample(10)

In [435]:
# Persist the unprocessed scrape results as a CSV file
raw_csv_path = 'data/bfro_raw.csv'
df.to_csv(raw_csv_path)