In [5]:
# Import Dependencies
import requests
from bs4 import BeautifulSoup
import logging
import pandas as pd
import json
import matplotlib.pyplot as plt
import googlemaps
import pandas as pd
from dotenv import load_dotenv
import os

In [6]:
# Obtain environment variables
load_dotenv()
GOOGLE_MAPS_API_KEY = os.getenv('GOOGLE_MAPS_API_KEY')

# Initalize the Google Maps API Client (Replace SECRET with the actual key)
gmaps = googlemaps.Client(key=GOOGLE_MAPS_API_KEY)

In [7]:
# Load the data we have
bigfoot_data = pd.read_json('../data/raw_scraping_data.json')
bigfoot_data.head()

Unnamed: 0,Report Number,Report Class,year,season,month,state,county,location details,nearest town,nearest road,observed,also noticed,other witnesses,other stories,time and conditions,environment,date,a & g references
0,Report # 13038,(Class A),2004,Winter,February,Alaska,Anchorage County,Up near powerline clearings east of Potter Mar...,Anchorage / Hillside,No real roads in the area,I and two of my friends were bored one night s...,"Some tracks in the snow, and a clearing in the...",My two friends were snowmachining behind me bu...,I have not heard of any other incidents in Anc...,Middle of the night. The only light was the he...,"In the middle of the woods, in a clearing cove...",,
1,Report # 8792,(Class B),2003,Winter,December,Alaska,Anchorage County,"Few houses on the way, a power relay station. ...",Anchorage,Dowling,"Me and a couple of friends had been bored, whe...","We smelled of colonge and after shave, and one...","4. Me, w-man, warren and sean. We were at my h...",no,"Started at 11, ended at about 3-3:30. Weather ...","A pine forest, with a bog or swamp on the righ...",Friday night,
2,Report # 1255,(Class B),1998,Fall,September,Alaska,Bethel County,"45 miles by air west of Lake Iliamna, Alaska i...",,,My hunting buddy and I were sitting on a ridge...,nothing unusual,Scouting for caribou with high quality binoculars,,,Call Iliamna Air taxi for lat & Long of Long L...,3,
3,Report # 11616,(Class B),2004,Summer,July,Alaska,Bristol Bay County,"Approximately 95 miles east of Egegik, Alaska....",Egegik,,"To whom it may concern, I am a commercial fish...",Just these foot prints and how obvious it was ...,"One other witness, and he was fishing prior to...","I've only heard of one other story, from an ol...","Approximately 12:30 pm, partially coudy/sunny.","Lake front,creek spit, gravel and sand, alder ...",20,
4,Report # 637,(Class A),2000,Summer,June,Alaska,Cordova-McCarthy County,"On the main trail toward the glacier, before t...","Kennikot, Alaska",not sure,My hiking partner and I arrived late to the Ke...,I did hear what appeared to be grunting in the...,"I was the only witness, there was one other in...",,About 12:00 Midnight / full moon / clear / dim...,This sighting was located at approximately 1 t...,16,


In [13]:


# Configure logging
logging.basicConfig(
    filename='logs/bigfoot_updates.log',  # Log file name
    filemode='a',  # Append mode
    level=logging.DEBUG,  # Log everything from DEBUG and above
    format='%(asctime)s - %(levelname)s - %(message)s'
)

# Bring over Functions 
def soupify_website(url):
    """
    Fetches the HTML content of a given URL and parses it into a BeautifulSoup object.

    Parameters:
        url (str): The URL of the webpage to be scraped.

    Returns:
        BeautifulSoup: A BeautifulSoup object representing the parsed HTML of the webpage.
    
    Raises:
        ValueError: If the HTTP response status is not 200(OK)
    """
    
    # test connection
    response = requests.get(url)

    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        return soup
    else:
        raise ValueError(f"Failed to fetch {url}: Status code {response.status_code}")

def get_report_links():
    """
    Extracts all report numbers from links from the Sighting Reports Recently Added page of the BFRO
    database. 

    Parameters:
        report_soup: A list of BeautifulSoup objects to search for links.

    Returns:
        list: A list of hyperlink strings (`href` values) extracted from the provided BeautifulSoup objects.
    
    Notes:
        - Only links with an `href` attribute will be included.
        - Duplicate links are not removed; the returned list may contain duplicates.
    """
    link_list = []
    report_soup = soupify_website('https://www.bfro.net/GDB/newadd.asp?Show=AB')
    links = report_soup.find_all('a')
    for link in links:
        if link.get('href') and 'show_report.asp?id=' in link.get('href') :
            url = link.get('href').split('show_report.asp?id=')[1]
            link_list.append(url)
    return link_list

def create_sighting_dictionary(url):
    """
    Extracts information from an individual report page and returns it as a dictionary.

    Parameters:
        url (str): The URL of the report page.

    Returns:
        dict: A dictionary with the extracted information, or None if the page cannot be parsed.
    """
    logging.info(f"Fetching report from: {url}")
    report = soupify_website(url)

    if not report:
        logging.error(f"Failed to parse report page: {url}")
        return None

    try:
        report_dict = {}

        # Extract 'Report Number'
        report_header = report.find('span', class_='reportheader')
        report_dict['Report Number'] = report_header.text.strip() if report_header else 'N/A'

        # Extract 'Report Classification'
        report_class = report.find('span', class_='reportclassification')
        report_dict['Report Class'] = report_class.text.strip() if report_class else 'N/A'

        # Extract additional fields
        fields = report.find_all('span', class_='field')
        for field in fields:
            # Get the full text of the parent element
            text = field.parent.text.strip()

            # Only process fields in the format "Header: Value"
            if ':' in text:
                # Split into field name and value
                field_name, value = text.split(':', 1)

                # Clean up field name and value
                field_name = field_name.strip().lower()
                value = value.strip()

                # Validate input (ensure no line breaks in the value)
                if len(value.split('\n')) == 1:
                    # Store the field and value in the dictionary
                    report_dict[field_name] = value

        return report_dict

    except Exception as e:
        logging.error(f"Error processing report {url}: {e}")
        return None
    
def geocode_with_fallback(row):
    try:
        # Try nearest_town, state
        location = gmaps.geocode(f"{row['nearest_town']}, {row['state']}")
        if location:
            return location[0]['geometry']['location']['lat'], location[0]['geometry']['location']['lng']

        # Fallback to county, state
        location = gmaps.geocode(f"{row['county']}, {row['state']}")
        if location:
            return location[0]['geometry']['location']['lat'], location[0]['geometry']['location']['lng']

        # If neither works, return None
        return None, None
    except Exception as e:
        print(f"Error geocoding {row['nearest_town']}, {row['state']} or {row['county']}, {row['state']}: {e}")
        return None, None
# Load raw data json 
bigfoot_data = pd.read_json('../data/raw_scraping_data.json')
new_sightings_url = 'https://www.bfro.net/GDB/newadd.asp?Show=AB'

In [14]:
latest_reports = get_report_links()
# Check our database for these reports
old_reports = bigfoot_data['Report Number'].str.split('Report # ').apply(lambda x: x[1] if len(x) > 1 else None).tolist()
# Ensure latest_reports are comparable
latest_reports = [str(x) for x in latest_reports]

new_reports = [x for x in latest_reports if x not in old_reports]

In [15]:
new_report_dicts = []
for report in new_reports:
    new_report_dicts.append(create_sighting_dictionary(f'https://www.bfro.net/GDB/show_report.asp?id={report}'))
new_report_df = pd.DataFrame(new_report_dicts)
new_report_df.head()

Unnamed: 0,Report Number,Report Class,year,season,month,date,state,county,location details,nearest town,nearest road,observed,also noticed,other witnesses,other stories,time and conditions,environment
0,Report # 77879,(Class A),2024,Fall,November,8,Virginia,Buchanan County,a well traveled road that is near the coal min...,no very far from town,page rd,I saw this at around 200-230 am I was running ...,As stated above there were 2 deer nearby that ...,one (myself),I've heard stories but I have never spoken to ...,roughly 200 am very clear and bright night was...,Wooded saddle of the mountain. It was near a s...
1,Report # 76281,(Class B),2023,Summer,July,18,Massachusetts,Worcester County,Mt Wachusett is next to Leominster State Forest,Mt Wachusett,On the ski slope of Mt Wachusett,"First, I am a skeptic but I am willing to be o...",,no,"After I mentioned my experience, a woman told ...",dusk,slope of the ski slope. Mixed type of forest t...
2,Report # 77818,(Class A),2024,Fall,October,20th,Montana,Hill County,Was on the Rocky Boy Reservation on Sandy Cree...,Box Elder,Sandy creek rd,"Sunday night my brother, his wife, my girlfrie...",We seen eyes in the bushes about ten minutes b...,"My brother. His wife, my girlfriend and myself.","Yes, there was an incident that happened back ...","It was about 11 or 11:30 pm maybe, but it was ...",Forest. Not too thick of vegetation tho. It wa...
3,Report # 77933,(Class A),2024,Fall,October,10/26/2024,Tennessee,Monroe County,Specific directions omitted to allow a follow-...,Tellico Plains,,My report for my visual sighting of a possible...,Right after the sighting I remember hearing a ...,No other witnesses,This area has reportedly had a history of acti...,2:15pm in the afternoon. It was sunny day with...,"Lush forest mixed with pines, vines and leafy ..."


In [16]:
updated_raw = pd.concat([bigfoot_data, new_report_df], ignore_index=True)

In [17]:
updated_raw.to_json('raw_scraping_data.json', orient='records')

We have a dataframe of our new entries, now to clean them

In [18]:
new_report_df.columns = new_report_df.columns.str.replace(' ','_').str.lower()
new_report_df['report_number'] = new_report_df.loc[:, 'report_number'].apply(
  lambda x: pd.to_numeric(x.split('Report # ')[1]) if isinstance(x, str) and 'Report # ' in x else x)

new_report_df['report_class'] = new_report_df.loc[:, 'report_class'].apply(
    lambda x: x[6:-1].strip() if len(x) > 1 else x
)

# Standardize dates
new_report_df['date'] = pd.to_datetime(
    new_report_df['date'], errors='coerce', format='%Y-%m-%d'
)

# Clean up environment
new_report_df['environment'] = new_report_df['environment'].str.lower()

# Check for appropriate year
new_report_df = new_report_df[new_report_df['year'].str.match(r'^\d{4}$', na=False)]

# Handle missing values 
new_report_df.fillna({'nearest_town': 'Unknown', 'nearest_road': 'Unknown'}, inplace=True)

Finally, we add the geocoding

In [19]:
new_report_df[['latitude', 'longitude']] = new_report_df.apply(
    lambda row: pd.Series(geocode_with_fallback(row)), axis=1
)

In [21]:
bigfoot_df = pd.read_json('../data/bigfoot_coords_df.json')
bigfoot_df.head()

Unnamed: 0,report_number,report_class,year,season,month,state,county,location_details,nearest_town,nearest_road,observed,also_noticed,other_witnesses,other_stories,time_and_conditions,environment,date,a_&_g_references,latitude,longitude
0,13038,A,2004,Winter,February,Alaska,Anchorage County,Up near powerline clearings east of Potter Mar...,Anchorage / Hillside,No real roads in the area,I and two of my friends were bored one night s...,"Some tracks in the snow, and a clearing in the...",My two friends were snowmachining behind me bu...,I have not heard of any other incidents in Anc...,Middle of the night. The only light was the he...,"In the middle of the woods, in a clearing cove...",,,61.119996,-149.74543
1,8792,B,2003,Winter,December,Alaska,Anchorage County,"Few houses on the way, a power relay station. ...",Anchorage,Dowling,"Me and a couple of friends had been bored, whe...","We smelled of colonge and after shave, and one...","4. Me, w-man, warren and sean. We were at my h...",no,"Started at 11, ended at about 3-3:30. Weather ...","A pine forest, with a bog or swamp on the righ...",Friday night,,61.217576,-149.899678
2,1255,B,1998,Fall,September,Alaska,Bethel County,"45 miles by air west of Lake Iliamna, Alaska i...",,,My hunting buddy and I were sitting on a ridge...,nothing unusual,Scouting for caribou with high quality binoculars,,,Call Iliamna Air taxi for lat & Long of Long L...,3,,63.588753,-154.493062
3,11616,B,2004,Summer,July,Alaska,Bristol Bay County,"Approximately 95 miles east of Egegik, Alaska....",Egegik,,"To whom it may concern, I am a commercial fish...",Just these foot prints and how obvious it was ...,"One other witness, and he was fishing prior to...","I've only heard of one other story, from an ol...","Approximately 12:30 pm, partially coudy/sunny.","Lake front,creek spit, gravel and sand, alder ...",20,,58.213737,-157.374253
4,637,A,2000,Summer,June,Alaska,Cordova-McCarthy County,"On the main trail toward the glacier, before t...","Kennikot, Alaska",not sure,My hiking partner and I arrived late to the Ke...,I did hear what appeared to be grunting in the...,"I was the only witness, there was one other in...",,About 12:00 Midnight / full moon / clear / dim...,This sighting was located at approximately 1 t...,16,,61.486389,-142.886389


In [22]:
coordinates_df = pd.concat([bigfoot_df, new_report_df],ignore_index=True)
coordinates_df.tail()

Unnamed: 0,report_number,report_class,year,season,month,state,county,location_details,nearest_town,nearest_road,observed,also_noticed,other_witnesses,other_stories,time_and_conditions,environment,date,a_&_g_references,latitude,longitude
4778,77933,A,2024,Fall,October,Tennessee,Monroe County,Specific directions omitted to allow a follow-...,Tellico Plains,Unknown,My report for my visual sighting of a possible...,Right after the sighting I remember hearing a ...,No other witnesses,This area has reportedly had a history of acti...,2:15pm in the afternoon. It was sunny day with...,"lush forest mixed with pines, vines and leafy ...",,,35.362855,-84.294087
4779,77879,A,2024,Fall,November,Virginia,Buchanan County,a well traveled road that is near the coal min...,no very far from town,page rd,I saw this at around 200-230 am I was running ...,As stated above there were 2 deer nearby that ...,one (myself),I've heard stories but I have never spoken to ...,roughly 200 am very clear and bright night was...,wooded saddle of the mountain. it was near a s...,,,37.431573,-78.656894
4780,76281,B,2023,Summer,July,Massachusetts,Worcester County,Mt Wachusett is next to Leominster State Forest,Mt Wachusett,On the ski slope of Mt Wachusett,"First, I am a skeptic but I am willing to be o...",,no,"After I mentioned my experience, a woman told ...",dusk,slope of the ski slope. mixed type of forest t...,,,42.489172,-71.887042
4781,77818,A,2024,Fall,October,Montana,Hill County,Was on the Rocky Boy Reservation on Sandy Cree...,Box Elder,Sandy creek rd,"Sunday night my brother, his wife, my girlfrie...",We seen eyes in the bushes about ten minutes b...,"My brother. His wife, my girlfriend and myself.","Yes, there was an incident that happened back ...","It was about 11 or 11:30 pm maybe, but it was ...",forest. not too thick of vegetation tho. it wa...,,,48.317208,-110.013263
4782,77933,A,2024,Fall,October,Tennessee,Monroe County,Specific directions omitted to allow a follow-...,Tellico Plains,Unknown,My report for my visual sighting of a possible...,Right after the sighting I remember hearing a ...,No other witnesses,This area has reportedly had a history of acti...,2:15pm in the afternoon. It was sunny day with...,"lush forest mixed with pines, vines and leafy ...",,,35.362855,-84.294087


In [24]:
coordinates_df.to_json('../data/bigfoot_coords_df.json', orient='records')

In [25]:
final_cols = ['report_number', 'report_class', 'state', 'county', 'latitude',
       'nearest_town', 'longitude', 'season', 'month', 'observed', 'year']

coordinates_df = coordinates_df[final_cols]

coordinates_df.to_json('../data/bigfoot_coordinates_clean_cols.json', orient='records')
