# BFRO Site Scraper

This notebook scrapes Bigfoot sighting reports from the Bigfoot Field Researchers Organization (BFRO) Geographic Database, found at http://www.bfro.net/gdb/.

In [332]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random

In [198]:
# Fetch the BFRO geographic database landing page
response = requests.get("http://www.bfro.net/gdb")
# Fail fast on a bad response (as the later cells do) instead of parsing an error page
assert response.ok, f"landing page request failed with status {response.status_code}"
soup = BeautifulSoup(response.content, 'html.parser')

# Every anchor on the landing page that carries an href
links = soup.find_all('a', href=True)

# Collect the state and province links for the US and Canada.
# They are stored separately because US states have county pages before report pages
canada_links = []
us_state_links = []

base_url = "http://www.bfro.net"

for a in links:
    href = a['href']
    if "state=ca-" in href:
        canada_links.append(base_url + href)
    elif "state=int" in href:
        # "state=int" entries (presumably international reports) are skipped
        continue
    elif "state" in href:
        us_state_links.append(base_url + href)

# Sanity-check the page layout: these are the link counts the rest of the
# notebook expects (presumably what the site listed when this was written)
assert len(us_state_links) == 49, f"expected 49 US state links, found {len(us_state_links)}"
assert len(canada_links) == 9, f"expected 9 Canadian province links, found {len(canada_links)}"

In [199]:
# Visit every US state page and gather the links to its county pages

us_county_links = []

for url in us_state_links:
    response = requests.get(url)
    assert response.ok

    soup = BeautifulSoup(response.content, 'html.parser')
    anchor_tags = soup.find_all('a', href=lambda href: href and "county" in href)

    # Each matching href is appended to the /gdb/ base to form the full URL
    us_county_links.extend("http://www.bfro.net/gdb/" + a['href'] for a in anchor_tags)
        
    


In [323]:
# Collect the report-page links found on a list of listing pages
def get_report_urls(urls):
    """Return the URLs of every report linked from the given pages.

    Each page in ``urls`` is downloaded and scanned for anchors whose href
    contains ``show_report.asp?id``; every such href is prefixed with
    ``http://www.bfro.net/gdb/``.

    Raises AssertionError if any page response is not ok.
    """
    def is_report_href(href):
        # Anchors without an href reach the filter as None
        return href and 'show_report.asp?id' in href

    found = []
    for page_url in urls:
        page = requests.get(page_url)
        assert page.ok
        parsed = BeautifulSoup(page.content, 'html.parser')
        found.extend(
            "http://www.bfro.net/gdb/" + tag['href']
            for tag in parsed.find_all('a', href=is_report_href)
        )
    return found

# US reports are reached through the county pages; the Canadian pages are scanned directly
report_urls = [*get_report_urls(us_county_links), *get_report_urls(canada_links)]


In [326]:
# Spot-check one entry of the collected report URLs
report_urls[300]

'http://www.bfro.net/gdb/show_report.asp?id=12266'

In [390]:
def scrape_report_data(url, timeout=30):
    """Scrape one BFRO report page into a flat dict of strings.

    Parameters
    ----------
    url : str
        Address of the report page to download.
    timeout : float, optional
        Seconds to wait for the server before giving up, so that a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    dict
        ``reportheader`` and ``reportclassification`` hold the stripped text of
        the spans with those CSS classes, or ``"did not find "`` when the span
        is absent. Every other key holds the text that accompanies the
        matching ``LABEL:`` field span, or ``""`` when the label is not on
        the page.

    Raises
    ------
    SystemExit
        If the HTTP request raises any ``requests`` exception (including a
        timeout).
    """
    # Output key -> label text of the <span class="field"> that introduces it.
    # Order here is the key order of the returned dict.
    field_labels = (
        ('year', 'YEAR:'),
        ('season', 'SEASON:'),
        ('month', 'MONTH:'),
        ('state', 'STATE:'),
        ('county', 'COUNTY:'),
        ('nearest_town', 'NEAREST TOWN:'),
        ('observed', 'OBSERVED:'),
        ('also_noticed', 'ALSO NOTICED:'),
        ('other_witnesses', 'OTHER WITNESSES:'),
        ('other_stories', 'OTHER STORIES:'),
        ('time_and_conditions', 'TIME AND CONDITIONS:'),
        ('environment', 'ENVIRONMENT:'),
        ('country', 'COUNTRY:'),
        ('province', 'PROVINCE:'),
        ('location_details', 'LOCATION DETAILS:'),
    )

    try:
        response = requests.get(url, timeout=timeout)
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

    soup = BeautifulSoup(response.content, 'html.parser')

    report_dict = {}

    # Header information lives in spans identified by their CSS class
    for css_class in ('reportheader', 'reportclassification'):
        element = soup.find('span', {'class': css_class})
        report_dict[css_class] = element.text.strip() if element else "did not find "

    def extract(label):
        """Return the parent element's text with the label removed, or ""."""
        element = soup.find('span', {'class': 'field'}, string=label)
        if not element:
            return ""
        # The label span sits inside the element that also holds the value,
        # so take the parent's text and remove the label itself
        return element.parent.text.replace(label, "").strip()

    for key, label in field_labels:
        report_dict[key] = extract(label)

    return report_dict

In [391]:
# Smoke-test the scraper on a small sample of report pages.
# A dedicated seeded generator keeps the sample identical across
# Restart & Run All without altering the global random state.
test_urls = random.Random(42).sample(report_urls, 3)
test_data = [scrape_report_data(url) for url in test_urls]

In [393]:
# Display one of the sampled reports to eyeball the scraped fields
test_data[1]

{'reportheader': 'Report # 55554',
 'reportclassification': '(Class B)',
 'year': '2016',
 'season': 'Fall',
 'month': 'September',
 'state': 'West Virginia',
 'county': 'Pocahontas County',
 'nearest_town': 'Marlinton, WV',
 'observed': 'My wife and I were taking an evening drive on Highland Scenic Highway 150, from US 219 near Slatyfork, WV, to the Cranberry Glades area on Tuesday evening 9/6/16.  We had been stopping at the scenic overlooks taking pictures, and had just left the Williams River overlook, and headed south on 150.  There was a stretch of highway that was very straight and a downslope of about 4-6% about 1-2 miles south of the Williams River overlook.  As we were driving down, we both noticed something in the weeds between the road edge and the bank leading up to the woods on the west side.  At first I thought it was a deer, with its head and neck stretched up above the weeds.  It was about 300-400 feet ahead, and while we continued towards it, it spun around, similar t

In [290]:
# NOTE(review): `report` is not assigned in any cell visible here, so this cell
# presumably relies on leftover kernel state and would raise NameError on a
# fresh Restart & Run All — confirm, then remove or replace it.
report

{'report_number': '40345',
 'classification': '(Class B)',
 'location_details': 'LOCATION DETAILS:',
 'year': '2013',
 'season': 'Winter',
 'month': 'March',
 'state': '',
 'county': '',
 'nearest_town': 'Campbell River, BC',
 'observed': "At approx 5:00 pm March 12th 2013, Campbell River, B.C. my husband and I were out getting a load of firewood on a logging road North of the city. We had an experience in the woods neither of us are able to explain. We are both very familiar with the sounds and local animals as we have grown up on the West Coast. My husband being a West Cost logger, knowing the woods well, and myself a camper all my life. My husband started off by chopping wood lower down on the logging road. It was overcast and raining lightly at the time. We moved our way up the road gathering firewood.  When we got to the top my husband turned the truck around and once again got out to cut wood, when we heard a whooping two-toned call from the woods to one side of us.  It was sound

['http://www.bfro.net/GDB/show_county_reports.asp?state=ca-al',
 'http://www.bfro.net/GDB/show_county_reports.asp?state=ca-bc',
 'http://www.bfro.net/GDB/show_county_reports.asp?state=ca-mn',
 'http://www.bfro.net/GDB/show_county_reports.asp?state=ca-nb',
 'http://www.bfro.net/GDB/show_county_reports.asp?state=ca-ns',
 'http://www.bfro.net/GDB/show_county_reports.asp?state=ca-on',
 'http://www.bfro.net/GDB/show_county_reports.asp?state=ca-qu',
 'http://www.bfro.net/GDB/show_county_reports.asp?state=ca-sk',
 'http://www.bfro.net/GDB/show_county_reports.asp?state=ca-yu']

Unnamed: 0,fish,veg
0,salmon,kale
1,cod,
