In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from google.cloud import bigquery
from datetime import datetime, timedelta
import folium
from folium import plugins

class FireClusteringValidator:
    def __init__(self, project_id="code-for-planet"):
        """Create a validator bound to a BigQuery project.

        Args:
            project_id: Project handed to ``bigquery.Client``; also kept on
                the instance as ``project_id``.
        """
        self.project_id = project_id
        self.client = bigquery.Client(project=self.project_id)

    def load_data(self):
        """Fetch the fire-event rows and the per-event summaries.

        Runs two queries through ``self.client``, stores the resulting
        DataFrames on the instance as ``fire_events`` and ``fire_stats``,
        prints the row count of each, and returns them.

        Returns:
            tuple: ``(fire_events, fire_stats)``.
        """
        # individual fire records, ordered by event and then by date
        sql_events = """
        SELECT * FROM `code-for-planet.emission_db.fire_events_2004`
        ORDER BY fire_event_id, fire_date
        """

        # one summary row per fire event
        sql_stats = """
        SELECT * FROM `code-for-planet.emission_db.fire_event_stats_2004`
        ORDER BY fire_event_id
        """

        run_query = self.client.query
        self.fire_events = events_df = run_query(sql_events).to_dataframe()
        self.fire_stats = stats_df = run_query(sql_stats).to_dataframe()

        print(f"Loaded {len(events_df)} fire event records")
        print(f"Loaded {len(stats_df)} fire event summaries")

        return events_df, stats_df

    def test_1_basic_sanity_checks(self):
        """Test 1: Basic data consistency checks"""
        print("=== TEST 1: Basic Sanity Checks ===")

        # check if all original data points are accounted for
        total_points_in_events = self.fire_stats['num_points'].sum()
        print(f"Total points in fire events: {total_points_in_events:,}")
        print(f"Expected from original: 239,612")
        print(f"Difference: {239612 - total_points_in_events}")

        # check for duplicate fire event IDs in events table
        duplicate_check = len(self.fire_events) == len(self.fire_events['id'].unique())
        print(f"No duplicate original IDs: {duplicate_check}")

        # check date ranges make sense
        min_date = self.fire_events['fire_date'].min()
        max_date = self.fire_events['fire_date'].max()
        print(f"Date range: {min_date} to {max_date}")

        # check coordinate ranges
        lon_range = f"{self.fire_events['longitude'].min():.2f} to {self.fire_events['longitude'].max():.2f}"
        lat_range = f"{self.fire_events['latitude'].min():.2f} to {self.fire_events['latitude'].max():.2f}"
        print(f"Longitude range: {lon_range}")
        print(f"Latitude range: {lat_range}")

        return total_points_in_events == 239612

    def test_2_temporal_consistency(self):
        """Test 2: Check if fire events have reasonable temporal patterns"""
        print("\n=== TEST 2: Temporal Consistency ===")

        issues = []

        for event_id in self.fire_stats['fire_event_id'].head(20):  # Test first 20 events
            event_data = self.fire_events[self.fire_events['fire_event_id'] == event_id]

            if len(event_data) == 0:
                continue

            # check temporal clustering - should not have huge gaps
            dates = pd.to_datetime(event_data['fire_date']).sort_values()
            date_gaps = (dates.shift(-1) - dates).dt.days.dropna()

            max_gap = date_gaps.max() if len(date_gaps) > 0 else 0
            duration = (dates.max() - dates.min()).days + 1

            # flag suspicious events
            if max_gap > 30:  # gap longer than 30 days
                issues.append(f"Event {event_id}: Large gap of {max_gap} days")

            if event_id < 10:  # details
                print(f"Event {event_id}: Duration {duration} days, Max gap {max_gap} days, {len(event_data)} points")

        if issues:
            print("Potential temporal issues:")
            for issue in issues[:5]:  # first 5 issues
                print(f"   {issue}")
        else:
            print("No major temporal inconsistencies found")

        return len(issues) == 0

    def test_3_spatial_consistency(self):
        """Test 3: Check if fire events are spatially coherent"""
        print("\n=== TEST 3: Spatial Consistency ===")

        def haversine_distance(lat1, lon1, lat2, lon2):
            """Calculate distance between two points in km"""
            R = 6371  # earths radius in km
            lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
            dlat, dlon = lat2 - lat1, lon2 - lon1
            a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
            return 2 * R * np.arcsin(np.sqrt(a))

        issues = []

        for event_id in self.fire_stats['fire_event_id'].head(20):
            event_data = self.fire_events[self.fire_events['fire_event_id'] == event_id]

            if len(event_data) <= 1:
                continue

            # calc pairwise distances within the fire event
            coords = event_data[['latitude', 'longitude']].values
            max_distance = 0

            for i in range(len(coords)):
                for j in range(i+1, len(coords)):
                    dist = haversine_distance(coords[i][0], coords[i][1],
                                            coords[j][0], coords[j][1])
                    max_distance = max(max_distance, dist)

            # check against the reported spatial extent
            reported_extent = self.fire_stats[self.fire_stats['fire_event_id'] == event_id]['spatial_extent_km'].iloc[0]

            if event_id < 10:
                print(f"Event {event_id}: Calculated max distance {max_distance:.1f} km, Reported {reported_extent:.1f} km")

            # flag if calcd and reported differ
            if abs(max_distance - reported_extent) > max_distance * 0.1:  # more than 10% difference
                issues.append(f"Event {event_id}: Distance mismatch")

        if issues:
            print("Potential spatial issues:")
            for issue in issues[:5]:
                print(f"   {issue}")
        else:
            print("Spatial calculations appear consistent")

        return len(issues) == 0

    def test_4_examine_suspicious_events(self):
        """Test 4: Look at potentially problematic fire events"""
        print("\n=== TEST 4: Examining Suspicious Events ===")

        # very large fires (might be multiple fires incorrectly merged)
        large_fires = self.fire_stats[self.fire_stats['spatial_extent_km'] > 1000].sort_values('spatial_extent_km', ascending=False)
        print(f"Events with >1000km extent: {len(large_fires)}")

        if len(large_fires) > 0:
            print("Largest fire events:")
            print(large_fires[['fire_event_id', 'num_points', 'duration_days', 'spatial_extent_km', 'total_area_burned']].head())

        # very long duration fires (might be separate fire seasons)
        long_fires = self.fire_stats[self.fire_stats['duration_days'] > 60].sort_values('duration_days', ascending=False)
        print(f"\nEvents lasting >60 days: {len(long_fires)}")

        if len(long_fires) > 0:
            print("Longest duration events:")
            print(long_fires[['fire_event_id', 'num_points', 'duration_days', 'spatial_extent_km']].head())

        # events with very few points but large extent (might be noise)
        sparse_fires = self.fire_stats[(self.fire_stats['num_points'] < 10) &
                                      (self.fire_stats['spatial_extent_km'] > 100)]
        print(f"\nSparse events (<10 points, >100km): {len(sparse_fires)}")

        return large_fires, long_fires, sparse_fires

    def test_5_visualize_sample_events(self, event_ids=[0, 1, 7]):
        """Test 5: Create maps of specific fire events for visual validation"""
        print(f"\n=== TEST 5: Events {event_ids} ===")

        for event_id in event_ids:
            event_data = self.fire_events[self.fire_events['fire_event_id'] == event_id]

            if len(event_data) == 0:
                print(f"No data for event {event_id}")
                continue

            # stats
            stats = self.fire_stats[self.fire_stats['fire_event_id'] == event_id].iloc[0]
            print(f"\nEvent {event_id}:")
            print(f"  Points: {stats['num_points']}, Duration: {stats['duration_days']} days")
            print(f"  Extent: {stats['spatial_extent_km']:.1f} km")
            print(f"  Date range: {event_data['fire_date'].min()} to {event_data['fire_date'].max()}")

            # folium - come back to this**** not working (maybe env)
            center_lat = event_data['latitude'].mean()
            center_lon = event_data['longitude'].mean()

            m = folium.Map(location=[center_lat, center_lon], zoom_start=8)

            # add fire points colored by date
            dates = pd.to_datetime(event_data['fire_date'])
            date_range = (dates.max() - dates.min()).days + 1

            for idx, row in event_data.iterrows():
                # color by relative date (blue = early, red = late)
                days_from_start = (pd.to_datetime(row['fire_date']) - dates.min()).days
                color_intensity = days_from_start / max(date_range - 1, 1)
                color = plt.cm.coolwarm(color_intensity)
                color_hex = f"#{int(color[0]*255):02x}{int(color[1]*255):02x}{int(color[2]*255):02x}"

                folium.CircleMarker(
                    location=[row['latitude'], row['longitude']],
                    radius=3,
                    popup=f"Date: {row['fire_date']}<br>CO2: {row['ECO2']:.1f}",
                    color=color_hex,
                    fillColor=color_hex,
                    fillOpacity=0.7
                ).add_to(m)

            # save map
            map_filename = f"fire_event_{event_id}_map.html"
            m.save(map_filename)
            print(f"  Map saved as: {map_filename}")

    def run_all_tests(self):
        """Load the data, run tests 1-5 in order, and print an overall verdict.

        The verdict lists an issue for each failed boolean test (1-3), and
        when test 4 reports more than five very large fires or more than
        three very long fires. Nothing is returned; all results are printed.
        """
        separator = "=" * 50
        print("FIRE CLUSTERING VALIDATION REPORT")
        print(separator)

        # load
        self.load_data()

        # tests
        sanity_ok = self.test_1_basic_sanity_checks()
        temporal_ok = self.test_2_temporal_consistency()
        spatial_ok = self.test_3_spatial_consistency()
        large_fires, long_fires, _sparse_fires = self.test_4_examine_suspicious_events()
        self.test_5_visualize_sample_events()

        # assessment
        print("\n" + separator)
        print("OVERALL ASSESSMENT:")

        n_large = len(large_fires)
        n_long = len(long_fires)
        # (condition that signals a problem, message to report)
        findings = [
            (not sanity_ok, "Data inconsistencies found"),
            (not temporal_ok, "Temporal clustering issues"),
            (not spatial_ok, "Spatial calculation issues"),
            (n_large > 5, f"{n_large} very large fires (may be over-clustered)"),
            (n_long > 3, f"{n_long} very long fires (may span seasons)"),
        ]
        issues = [message for triggered, message in findings if triggered]

        if issues:
            print("POTENTIAL ISSUES DETECTED:")
            for issue in issues:
                print(f"   - {issue}")
            print("   - Consider adjusting clustering parameters")
            print("   - Or investigate suspicious events manually")
            return

        print("CLUSTERING APPEARS SUCCESSFUL!")
        print("   - Data is consistent")
        print("   - Spatial and temporal patterns look reasonable")
        print("   - Ready to proceed with analysis")


# Build a validator with the default project id and run the full report.
# NOTE: this runs as soon as the cell/module executes; the constructor creates
# a BigQuery client and run_all_tests() queries it through load_data().
validator = FireClusteringValidator()
validator.run_all_tests()

FIRE CLUSTERING VALIDATION REPORT
Loaded 239612 fire event records
Loaded 187 fire event summaries
=== TEST 1: Basic Sanity Checks ===
Total points in fire events: 239,612
Expected from original: 239,612
Difference: 0
No duplicate original IDs: True
Date range: 2003-12-31 to 2004-12-31
Longitude range: -124.29 to -67.23
Latitude range: 25.22 to 49.00

=== TEST 2: Temporal Consistency ===
Event 0: Duration 31 days, Max gap 2.0 days, 487 points
Event 1: Duration 4 days, Max gap 1.0 days, 9 points
Event 2: Duration 7 days, Max gap 1.0 days, 25 points
Event 3: Duration 22 days, Max gap 1.0 days, 181 points
Event 4: Duration 6 days, Max gap 1.0 days, 19 points
Event 5: Duration 3 days, Max gap 1.0 days, 3 points
Event 6: Duration 7 days, Max gap 1.0 days, 68 points
Event 7: Duration 30 days, Max gap 1.0 days, 8194 points
Event 8: Duration 1 days, Max gap 0.0 days, 4 points
Event 9: Duration 11 days, Max gap 2.0 days, 100 points
No major temporal inconsistencies found

=== TEST 3: Spatial Co