In [1]:
import polars as pl
import datapane as dp
import altair as alt

In [2]:
# Build a lazy query over the geocoded BFRO reports CSV; rows are only
# materialized when a later cell calls .collect().
REPORTS_CSV = "../pipeline/data/processed/bfro_reports_geocoded.csv"
reports = pl.scan_csv(REPORTS_CSV)

In [3]:
# Preview the first five rows (the default for head()) to inspect the columns.
reports.head(5).collect()

observed,location_details,county,state,season,title,latitude,longitude,date,number,classification,hexid,temperature_high,temperature_mid,temperature_low,dew_point,humidity,cloud_cover,moon_phase,precip_intensity,precip_probability,precip_type,pressure,summary,conditions,uv_index,visibility,wind_bearing,wind_speed
str,str,str,str,str,str,f64,f64,str,i64,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,str,str,f64,f64,f64,f64
"""hello. some ti…","""Off of rt 2 in…","""Ohio County""","""West Virginia""","""Winter""","""Report 10006: …",40.1333,-80.68192,"""2004-12-12""",10006,"""Class A""","""8a2a86362d47ff…",39.3,34.7,32.1,29.5,81.5,97.5,0.02,0.031,100.0,"""[""rain""]""",1006.3,"""Cloudy skies t…","""Rain, Overcast…",,9.1,231.4,18.3
"""I may have alr…","""The place we c…","""Grant County""","""Oregon""","""Summer""","""Report 10324: …",44.7751,-118.7717,"""1997-07-04""",10324,"""Class B""","""8a28aa64998fff…",89.7,68.6,46.5,45.4,52.7,25.0,0.0,0.472,100.0,"""[""rain""]""",1015.9,"""Partly cloudy …","""Rain, Partiall…",,20.1,42.6,8.1
"""My husband and…","""There is a bac…","""Coffey County""","""Kansas""","""Fall""","""Report 10660: …",38.3908,-95.6491,"""1974-11-09""",10660,"""Class A""","""8a26e528ac4fff…",49.5,46.5,39.6,44.2,91.7,83.6,0.85,0.283,100.0,"""[""rain""]""",1017.5,"""Partly cloudy …","""Rain, Partiall…",,5.5,159.7,17.2
"""I live in hend…","""we are off of …","""Henderson Coun…","""Tennessee""","""Winter""","""Report 10740: …",35.80762,-88.30945,"""2005-02-09""",10740,"""Class B""","""8a2648545ba7ff…",47.9,43.3,35.0,41.3,92.9,100.0,0.03,0.012,100.0,"""[""rain""]""",1018.1,"""Cloudy skies t…","""Rain, Overcast…",,5.9,312.4,18.3
"""Around five ye…","""Longitude-87.9…","""Jasper County""","""Illinois""","""Spring""","""Report 10879: …",38.94564,-87.97622,"""2000-05-06""",10879,"""Class B""","""8a264596a177ff…",80.7,70.3,60.9,62.1,77.4,16.3,0.09,,0.0,,1017.8,"""Clear conditio…","""Clear""",,7.7,186.1,21.9


In [4]:
# Number of reports in each classification, materialized eagerly so later
# cells can iterate over the rows.
report_classifications = (
    reports
    .groupby("classification")
    .count()
    .collect()
)
report_classifications

classification,count
str,u32
"""Class C""",33
"""Class A""",2579
"""Class B""",2668


In [26]:
from typing import Dict
# One headline BigNumber per classification, keyed by the classification
# label so the layout cell can pick each one out by name.
classification_numbers: Dict[str, dp.BigNumber] = {
    label: dp.BigNumber(
        heading=f"Number of {label} Sightings",
        value=total,
    )
    for label, total in report_classifications.iter_rows()
}

In [27]:
# Classification summary: the three headline counts followed by the definition
# of each class, passed in the same A/B/C order so that (assuming the
# 3-column grid fills row by row -- TODO confirm) each definition sits
# beneath its own number.
dp.Group(
    classification_numbers["Class A"],
    classification_numbers["Class B"],
    classification_numbers["Class C"],
    dp.Text("""
    Class A reports involve clear sightings in circumstances where misinterpretation or misidentification of other animals can be ruled out with greater confidence.
    """),
    dp.Text("""
    Incidents where a possible sasquatch was observed at a great distance or in poor lighting conditions and incidents in any other circumstance that did not afford a clear view of the subject are considered Class B reports.
    """),
    # Fixed the number disagreement "an untraceable sources" in the displayed text.
    dp.Text("""
    Most second-hand reports, and any third-hand reports, or stories with an untraceable source, are considered Class C, because of the high potential for inaccuracy.
    """),
    columns=3
)

In [7]:
from vega_datasets import data
# Base layer: light-gray US state outlines drawn from the vega_datasets
# us_10m TopoJSON, using the Albers USA projection.
states = alt.topo_feature(data.us_10m.url, feature="states")
background = (
    alt.Chart(states)
    .mark_geoshape(fill="lightgray", stroke="white")
    .properties(height=300, width=500)
    .project("albersUsa")
)

# Only reports that have both coordinates and a state are plotted.
has_location = (
    pl.col("latitude").is_not_null()
    & pl.col("longitude").is_not_null()
    & pl.col("state").is_not_null()
)
report_locations = (
    reports
    .filter(has_location)
    .select(["latitude", "longitude", "classification"])
    .collect()
    .to_pandas()
)

# NOTE(review): Altair's default data transformer rejects embedded frames
# larger than 5,000 rows -- confirm the filtered frame stays under that limit.
points = (
    alt.Chart(report_locations)
    .mark_circle(size=10)
    .encode(
        longitude="longitude:Q",
        latitude="latitude:Q",
        color="classification",
    )
)

# Overlay the sighting points on the state outlines.
bfro_map = alt.layer(background, points).properties(title="Bigfoot Sightings")

bfro_map

In [24]:
# Count sightings per (year, classification).
#
# The "date" column is read as text, so only values that are exactly
# YYYY-MM-DD are kept before parsing. The pattern is a raw string so that
# "\d" reaches the regex engine instead of being an invalid Python escape
# sequence, and it is anchored with ^...$ so a value that merely *contains*
# a date cannot slip through to the "%Y-%m-%d" parse below.
reports_per_year = reports.filter(
    pl.col("date").str.contains(r"^\d{4}-\d{2}-\d{2}$")
).with_columns(
    sighting_year=pl.col("date").str.to_datetime("%Y-%m-%d").dt.year()
).groupby(
    "sighting_year",
    "classification"
).count().filter(
    # Only years from 2003 onward are charted.
    pl.col("sighting_year") >= 2003
).collect()

# Bar chart of yearly sighting counts, colored by classification, with the
# count shown on hover.
reports_per_year_chart = alt.Chart(
    reports_per_year.to_pandas()
).mark_bar().encode(
    x=alt.X("sighting_year:O", title="Year"),
    y=alt.Y("count:Q", title="Number of Sightings"),
    color=alt.Color("classification:N", title="Classification"),
    tooltip=alt.Tooltip("count:Q")
).properties(
    title="Sightings by Year"
)

reports_per_year_chart

In [25]:
# Tally sightings for every (state, classification) pair, skipping reports
# with no state, and order the result from the largest count down.
state_counts = (
    reports
    .filter(pl.col("state").is_not_null())
    .groupby("state", "classification")
    .count()
)
sightings_by_state = state_counts.sort(by="count", descending=True).collect()


# Bar chart of counts per state, colored by classification; the x axis is
# sorted by descending y value and the count is shown on hover.
state_encodings = dict(
    x=alt.X("state:N", title="State", sort="-y"),
    y=alt.Y("count:Q", title="Number of Sightings"),
    color=alt.Color("classification:N", title="Classification"),
    tooltip=alt.Tooltip("count:Q"),
)
sightings_by_state_chart = (
    alt.Chart(sightings_by_state.to_pandas())
    .mark_bar()
    .encode(**state_encodings)
    .properties(title="Sightings by State")
)

sightings_by_state_chart


In [28]:
# Number of sightings per season, ignoring reports with no recorded season.
season_counts = (
    reports
    .filter(pl.col("season").is_not_null())
    .groupby("season")
    .count()
)
report_seasons = season_counts.collect()

# One BigNumber per season, keyed by the season name.
report_seasons_dict: Dict[str, dp.BigNumber] = {
    season_name: dp.BigNumber(
        value=total,
        heading=f"Sightings in {season_name}",
    )
    for season_name, total in report_seasons.iter_rows()
}

# Show the four seasons side by side in calendar order.
dp.Group(
    *(
        report_seasons_dict[season_name]
        for season_name in ("Spring", "Summer", "Fall", "Winter")
    ),
    columns=4
)