In [None]:
# -----------------------------------------------------------
# QC - emission_2004_with_fire_events
# -----------------------------------------------------------
#   • Verifies basic coverage
#   • Confirms each detection id maps to exactly one fire_event_id (many detections may share an event)
#   • Flags spatial / temporal outliers that break your clustering thresholds
# -----------------------------------------------------------

from google.cloud import bigquery
import pandas as pd

# Fully-qualified BigQuery location of the table under test; the queries in
# this cell read `{PROJECT}.{DATASET}.{TABLE}`.
PROJECT   = "code-for-planet"
DATASET   = "emission_db"
TABLE     = "emission_2004_with_fire_events"
# NOTE(review): RAW_TABLE is not referenced by any query in this cell —
# presumably the pre-clustering source table; confirm or remove.
RAW_TABLE = "emission_2004"
# Outlier cut-offs: an event is flagged when its bounding-box diagonal exceeds
# THRESH_KM kilometres, or when the gap between its earliest and latest
# fire_date exceeds THRESH_D days.
THRESH_KM = 20
THRESH_D  = 3

# One client, bound to PROJECT, shared by every query in this cell.
client = bigquery.Client(project=PROJECT)

# coverage & basic sanity
#   n_rows        - all rows in the table
#   n_assigned    - rows whose fire_event_id is a real event (!= -1)
#   n_unassigned  - rows in the -1 (noise) bucket
#   n_fire_events - distinct real fire_event_ids; the -1 bucket is never counted
coverage_sql = f"""
SELECT
  COUNT(*) AS n_rows,
  COUNTIF(fire_event_id != -1) AS n_assigned,
  COUNTIF(fire_event_id = -1)  AS n_unassigned,
  -- Count real events directly: subtracting 1 from the overall distinct
  -- count is off by one whenever the table contains no -1 rows.
  COUNT(DISTINCT IF(fire_event_id != -1, fire_event_id, NULL)) AS n_fire_events
FROM `{PROJECT}.{DATASET}.{TABLE}`
"""
coverage = client.query(coverage_sql).to_dataframe()
print("COVERAGE CHECK")
display(coverage)

#  detection -> fire_event mapping test
#  every original detection ID should map to exactly one fire_event_id
mapping_sql = f"""
SELECT
  COUNT(*) AS total_ids,
  COUNTIF(cnt > 1) AS ids_with_multiple_events,
  -- COUNT(DISTINCT ...) skips NULLs, so cnt = 0 means the id has no event at all
  COUNTIF(cnt = 0) AS ids_without_event
FROM (
  SELECT id, COUNT(DISTINCT fire_event_id) AS cnt
  FROM `{PROJECT}.{DATASET}.{TABLE}`
  GROUP BY id
)
"""
mapping = client.query(mapping_sql).to_dataframe()
n_multi_event_ids = int(mapping["ids_with_multiple_events"].iat[0])
n_eventless_ids = int(mapping["ids_without_event"].iat[0])

# Raise explicitly instead of using `assert`, so the check still runs when
# Python is started with -O; AssertionError is kept as the failure type.
if n_multi_event_ids != 0:
    raise AssertionError("Some ids map to >1 fire_event_id")
if n_eventless_ids != 0:
    raise AssertionError("Some ids have no fire_event_id (NULL)")
print("MAPPING CHECK passed: each id belongs to one fire_event_id")

# Spatial cohesion: for each real event (the -1 bucket is excluded) take the
# diagonal of its longitude/latitude bounding box, in km, and list the ten
# largest diagonals that exceed THRESH_KM.
spatial_sql = f"""
WITH perimeters AS (
  SELECT
    fire_event_id,
    -- Build two opposite corners of the bounding box
    ST_DISTANCE(
      ST_GEOGPOINT(MIN(longitude), MIN(latitude)),
      ST_GEOGPOINT(MAX(longitude), MAX(latitude))
    ) / 1000 AS max_km          -- metres → kilometres
  FROM `{PROJECT}.{DATASET}.{TABLE}`
  WHERE fire_event_id != -1
  GROUP BY fire_event_id
)
SELECT *
FROM perimeters
WHERE max_km > {THRESH_KM}
ORDER BY max_km DESC
LIMIT 10
"""
# Submit the query first, then materialise its result as a DataFrame.
spatial_job = client.query(spatial_sql)
spatial_outliers = spatial_job.to_dataframe()
print("SPATIAL OUTLIERS (max_km > threshold)")
display(spatial_outliers)

# Temporal cohesion: for each real event (the -1 bucket is excluded) compute
# the number of days between its earliest and latest fire_date, and list the
# ten longest spans that exceed THRESH_D.
temporal_sql = f"""
WITH spans AS (
  SELECT
    fire_event_id,
    DATE_DIFF(MAX(fire_date), MIN(fire_date), DAY) AS span_days
  FROM `{PROJECT}.{DATASET}.{TABLE}`
  WHERE fire_event_id != -1
  GROUP BY fire_event_id
)
SELECT *
FROM spans
WHERE span_days > {THRESH_D}
ORDER BY span_days DESC
LIMIT 10
"""
# Submit the query first, then materialise its result as a DataFrame.
temporal_job = client.query(temporal_sql)
temporal_outliers = temporal_job.to_dataframe()
print("TEMPORAL OUTLIERS (span_days > threshold)")
display(temporal_outliers)


# sample a suspect fire_event_id
# Takes the first row of spatial_outliers and shows up to 20 of that event's
# detections, earliest first. Nothing is printed when there are no spatial
# outliers.
if not spatial_outliers.empty:
    # int() converts the DataFrame scalar to a plain Python int before it is
    # interpolated into the SQL text.
    suspect_id = int(spatial_outliers["fire_event_id"].iat[0])
    print(f"EXAMPLE POINTS FROM fire_event_id = {suspect_id}")
    sample_sql = f"""
      SELECT id, fire_date, longitude, latitude, ECO2, area_burned
      FROM `{PROJECT}.{DATASET}.{TABLE}`
      WHERE fire_event_id = {suspect_id}
      -- id breaks ties between rows sharing a fire_date, so the 20-row
      -- sample is the same on every run
      ORDER BY fire_date, id
      LIMIT 20
    """
    sample = client.query(sample_sql).to_dataframe()
    display(sample)


COVERAGE CHECK


Unnamed: 0,n_rows,n_assigned,n_unassigned,n_fire_events
0,239663,239195,468,1209


MAPPING CHECK passed: each id belongs to one fire_event_id
SPATIAL OUTLIERS (max_km > threshold)


Unnamed: 0,fire_event_id,max_km
0,1060,1455.987962
1,1072,1429.963243
2,994,1341.211518
3,973,1261.509604
4,1006,1251.656934
5,980,1239.489823
6,981,1206.150793
7,542,1176.260325
8,995,1157.369454
9,1071,1138.006721


TEMPORAL OUTLIERS (span_days > threshold)


Unnamed: 0,fire_event_id,span_days
0,1060,230
1,56,225
2,542,175
3,973,149
4,980,121
5,29,98
6,682,80
7,1072,78
8,544,67
9,981,64


EXAMPLE POINTS FROM fire_event_id = 1060


Unnamed: 0,id,fire_date,longitude,latitude,ECO2,area_burned
0,100675,2004-01-01,-80.8883,26.4711,0.0,0.0
1,100599,2004-01-01,-80.8887,26.4689,0.0,0.0
2,100753,2004-01-01,-80.8879,26.4734,0.0,0.0
3,100676,2004-01-01,-80.8858,26.4708,0.0,0.0
4,101932,2004-01-01,-80.7858,26.4952,0.0,0.0
5,102041,2004-01-01,-80.7878,26.4978,0.0,0.0
6,101726,2004-01-01,-80.7964,26.4922,0.0,0.0
7,101826,2004-01-01,-80.796,26.4944,0.0,0.0
8,101825,2004-01-01,-80.7985,26.4948,0.0,0.0
9,101931,2004-01-01,-80.7882,26.4956,0.0,0.0
