In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from google.cloud import bigquery
import warnings
warnings.filterwarnings('ignore')

class GeographicFireEventClustering:
    """Group fire emission records into fire events, one region at a time.

    Records are read from the BigQuery table
    ``{project_id}.{dataset_id}.emission_{year}``, clustered spatially with
    DBSCAN inside each rectangular region, and every spatial cluster is then
    split into separate events wherever consecutive fire dates are more than
    the region's ``temporal_days`` apart.
    """

    # Sphere radius (km) used to convert haversine distances, which are
    # expressed in radians, into kilometers.
    EARTH_RADIUS_KM = 6371

    def __init__(self, project_id, dataset_id):
        """Create a BigQuery client for ``project_id`` and remember the dataset."""
        self.client = bigquery.Client(project=project_id)
        self.dataset_id = dataset_id
        self.project_id = project_id

    def haversine_distance(self, lat1, lon1, lat2, lon2):
        """Calculate haversine distance between two points in kilometers.

        Inputs are in degrees and may be scalars or equally shaped
        array-likes (e.g. pandas Series); the result has the same shape.
        """
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
        # Rounding can push `a` marginally outside [0, 1], which would make
        # arcsin return NaN.
        a = np.clip(a, 0.0, 1.0)
        return self.EARTH_RADIUS_KM * 2 * np.arcsin(np.sqrt(a))

    def define_geographic_regions(self):
        """Define non-overlapping geographic regions to process separately.

        Returns a dict mapping region name to half-open bounds
        (``min <= value < max``) in degrees.  The boxes must not overlap:
        a record that falls in two boxes is queried and clustered twice.
        'Southeast' therefore starts at latitude 32 and 'Florida' extends
        east to longitude -75, which covers the same total area as before
        without the shared rectangle.
        """
        regions = {
            'Pacific_West': {'lat_min': 32, 'lat_max': 49, 'lon_min': -125, 'lon_max': -115},
            'Mountain_West': {'lat_min': 32, 'lat_max': 49, 'lon_min': -115, 'lon_max': -105},
            'Great_Plains': {'lat_min': 32, 'lat_max': 49, 'lon_min': -105, 'lon_max': -95},
            'Midwest': {'lat_min': 37, 'lat_max': 49, 'lon_min': -95, 'lon_max': -85},
            'Southeast': {'lat_min': 32, 'lat_max': 37, 'lon_min': -95, 'lon_max': -75},
            'Northeast': {'lat_min': 37, 'lat_max': 49, 'lon_min': -85, 'lon_max': -67},
            'South_Texas': {'lat_min': 25, 'lat_max': 32, 'lon_min': -105, 'lon_max': -95},
            'Florida': {'lat_min': 25, 'lat_max': 32, 'lon_min': -95, 'lon_max': -75},
        }
        return regions

    def get_region_parameters(self, region_name):
        """Get the clustering parameters for a region.

        ``spatial_eps`` is handed unchanged to DBSCAN with the haversine
        metric, so it is a distance in radians on the unit sphere
        (multiply by ``EARTH_RADIUS_KM`` for kilometers).  ``temporal_days``
        is the largest gap, in days, allowed inside one event.  Unknown
        region names get the default parameter set.
        """
        region_params = {
            'Pacific_West': {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5},
            'Mountain_West': {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5},
            'Great_Plains': {'spatial_eps': 0.003, 'temporal_days': 2, 'min_samples': 5},  # tighter
            'Midwest': {'spatial_eps': 0.003, 'temporal_days': 2, 'min_samples': 5},      # tighter
            'Southeast': {'spatial_eps': 0.002, 'temporal_days': 1, 'min_samples': 5},    # tighter
            'Northeast': {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5},
            'South_Texas': {'spatial_eps': 0.003, 'temporal_days': 2, 'min_samples': 5},  # tighter
            'Florida': {'spatial_eps': 0.002, 'temporal_days': 1, 'min_samples': 5},      # tighter
        }
        default_params = {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5}
        return region_params.get(region_name, default_params)

    def process_geographic_region(self, year, region_name, region_bounds):
        """Query one region's records for ``year`` and cluster them into events.

        Returns the clustered records with ``fire_event_id`` (0-based within
        the region) and ``region`` columns, or an empty DataFrame when the
        region has no records or no clusters.
        """
        params = self.get_region_parameters(region_name)
        spatial_eps = params['spatial_eps']
        temporal_days = params['temporal_days']
        min_samples = params['min_samples']

        table_name = f"emission_{year}"

        # eps is interpreted by DBSCAN in radians, so report it that way.
        eps_km = spatial_eps * self.EARTH_RADIUS_KM
        print(f"\nProcessing region: {region_name}")
        print(f"   Bounds: {region_bounds}")
        print(f"   Parameters: eps={spatial_eps} rad (~{eps_km:.1f}km), temporal={temporal_days}d, min_samples={min_samples}")

        query = f"""
        SELECT
            id, year, doy, longitude, latitude, fire_date,
            grid10k, covertype, fuelcode, area_burned,
            consumed_fuel, ECO2, burn_source, burnday_source
        FROM `{self.project_id}.{self.dataset_id}.{table_name}`
        WHERE longitude IS NOT NULL
        AND latitude IS NOT NULL
        AND fire_date IS NOT NULL
        AND latitude >= {region_bounds['lat_min']}
        AND latitude < {region_bounds['lat_max']}
        AND longitude >= {region_bounds['lon_min']}
        AND longitude < {region_bounds['lon_max']}
        ORDER BY fire_date, longitude, latitude
        """

        df = self.client.query(query).to_dataframe()

        if len(df) == 0:
            print(f"   No data found for {region_name}")
            return pd.DataFrame()

        print(f"   Found {len(df):,} records")

        df['fire_date'] = pd.to_datetime(df['fire_date'])

        clustered_df = self._cluster_region_data(df, spatial_eps, temporal_days, min_samples)

        if len(clustered_df) == 0:
            print(f"   No fire events found in {region_name}")
            return pd.DataFrame()

        clustered_df['region'] = region_name
        unique_events = clustered_df['fire_event_id'].nunique()
        print(f"   Found {unique_events} fire events in {region_name}")
        return clustered_df

    @staticmethod
    def _temporal_group_ids(fire_dates, temporal_days):
        """Return a 0-based group number for each date of a sorted Series.

        A new group starts whenever a date is more than ``temporal_days``
        whole days after the previous one.
        """
        gap_days = fire_dates.diff().dt.days  # NaN for the first row
        return (gap_days > temporal_days).cumsum()

    def _cluster_region_data(self, df, spatial_eps, temporal_days, min_samples):
        """Apply clustering to data from a single geographic region.

        Runs DBSCAN on the coordinates, discards noise points (label -1),
        and splits every spatial cluster into events at temporal gaps.
        Returns the clustered rows with a ``fire_event_id`` column, or an
        empty DataFrame when nothing clusters.  ``df`` is not modified.
        """
        if len(df) == 0:
            return pd.DataFrame()

        # scikit-learn's haversine metric expects [latitude, longitude] in radians.
        coords_rad = np.radians(df[['latitude', 'longitude']].to_numpy(dtype=float))

        dbscan = DBSCAN(eps=spatial_eps, min_samples=min_samples, metric='haversine')
        spatial_labels = dbscan.fit_predict(coords_rad)

        fire_events = []
        event_id = 0

        for label in pd.unique(spatial_labels):
            if label == -1:
                continue

            cluster_data = df[spatial_labels == label].sort_values('fire_date')
            group_ids = self._temporal_group_ids(cluster_data['fire_date'], temporal_days)

            fire_events.append(cluster_data.assign(fire_event_id=event_id + group_ids))
            event_id += int(group_ids.iloc[-1]) + 1

        if not fire_events:
            return pd.DataFrame()
        return pd.concat(fire_events, ignore_index=True)

    def _count_source_rows(self, year):
        """Count the rows of the year's table that have coordinates and a date."""
        total_query = f"""
        SELECT COUNT(*) as total_count
        FROM `{self.project_id}.{self.dataset_id}.emission_{year}`
        WHERE longitude IS NOT NULL
        AND latitude IS NOT NULL
        AND fire_date IS NOT NULL
        """
        total_result = self.client.query(total_query).to_dataframe()
        return total_result['total_count'].iloc[0]

    def process_year_geographic(self, year):
        """Process entire year using geographic chunking with region-specific parameters.

        Clusters each region separately, renumbers the per-region event ids
        so they are unique across regions, and returns the concatenated
        records (empty DataFrame when no region produced events).
        """
        print(f"GEOGRAPHIC CLUSTERING for {year}")
        print("Using region-specific optimized parameters:")

        regions = self.define_geographic_regions()

        for region_name in regions:
            params = self.get_region_parameters(region_name)
            eps_km = params['spatial_eps'] * self.EARTH_RADIUS_KM
            print(f"  {region_name}: eps={params['spatial_eps']} rad (~{eps_km:.1f}km), temporal={params['temporal_days']}d")

        print("=" * 80)

        all_regional_events = []
        global_event_id = 0

        for region_name, region_bounds in regions.items():
            regional_events = self.process_geographic_region(
                year, region_name, region_bounds
            )

            if len(regional_events) == 0:
                continue

            # Read the region-local maximum before shifting the ids.
            max_regional_id = regional_events['fire_event_id'].max()
            regional_events['fire_event_id'] += global_event_id
            global_event_id += max_regional_id + 1

            all_regional_events.append(regional_events)

        if not all_regional_events:
            return pd.DataFrame()

        combined_events = pd.concat(all_regional_events, ignore_index=True)

        # Coverage is measured against the actual table size for this year
        # rather than a constant that is only valid for one table.
        original_count = self._count_source_rows(year)

        print("\nFINAL RESULTS:")
        print(f"Total fire events: {combined_events['fire_event_id'].nunique()}")
        print(f"Total data points: {len(combined_events):,}")
        print(f"Coverage: {len(combined_events)/original_count*100:.1f}% of original data")

        return combined_events

    def analyze_fire_events(self, fire_events_df):
        """Analyze the identified fire events.

        Returns one row per ``fire_event_id`` with point count, date range,
        bounding box, summed area/fuel/ECO2, region, inclusive duration in
        days, and the bounding-box diagonal in kilometers.  An empty input
        yields an empty DataFrame.
        """
        if len(fire_events_df) == 0:
            return pd.DataFrame()

        event_stats = fire_events_df.groupby('fire_event_id').agg(
            num_points=('id', 'count'),
            start_date=('fire_date', 'min'),
            end_date=('fire_date', 'max'),
            min_lon=('longitude', 'min'),
            max_lon=('longitude', 'max'),
            min_lat=('latitude', 'min'),
            max_lat=('latitude', 'max'),
            total_area_burned=('area_burned', 'sum'),
            total_consumed_fuel=('consumed_fuel', 'sum'),
            total_ECO2=('ECO2', 'sum'),
            region=('region', 'first'),
        ).reset_index()

        event_stats['duration_days'] = (event_stats['end_date'] - event_stats['start_date']).dt.days + 1
        # haversine_distance is vectorised, so no per-row apply is needed.
        event_stats['spatial_extent_km'] = self.haversine_distance(
            event_stats['min_lat'], event_stats['min_lon'],
            event_stats['max_lat'], event_stats['max_lon'],
        )

        return event_stats

    def save_results_to_csv(self, fire_events_df, filename):
        """Save results to CSV file (without the index) and report the path."""
        fire_events_df.to_csv(filename, index=False)
        print(f"Results saved to {filename}")

def main():
    """Cluster the 2004 emission table into fire events and report the results.

    Prints overall and per-region statistics, a quality check on events
    whose bounding-box diagonal exceeds 500 km / 1000 km, and writes the
    clustered records and the per-event statistics to CSV files in the
    current directory.
    """
    # The original banner line was missing its closing quote (SyntaxError).
    print("GEOGRAPHIC FIRE CLUSTERING")
    print("This approach processes data by geographic regions to prevent")
    print("continent-spanning 'mega-fires' caused by temporal chunking.")
    print()

    clustering = GeographicFireEventClustering(
        project_id="code-for-planet",
        dataset_id="emission_db"
    )

    year = 2004
    fire_events = clustering.process_year_geographic(year=year)

    if len(fire_events) == 0:
        print("No fire events found")
        return

    stats = clustering.analyze_fire_events(fire_events)
    print("\nGEOGRAPHIC CLUSTERING RESULTS:")
    print(f"Fire events found: {len(stats)}")
    print(f"Average duration: {stats['duration_days'].mean():.1f} days")
    print(f"Average spatial extent: {stats['spatial_extent_km'].mean():.1f} km")
    print(f"Max spatial extent: {stats['spatial_extent_km'].max():.1f} km")

    large_events = stats[stats['spatial_extent_km'] > 500]
    very_large_events = stats[stats['spatial_extent_km'] > 1000]

    print("\nQUALITY CHECK:")
    print(f"Events >500km: {len(large_events)} (target: <5)")
    print(f"Events >1000km: {len(very_large_events)} (target: 0)")

    if len(large_events) > 0:
        print("\nRemaining large events by region:")
        large_by_region = large_events.groupby('region')['fire_event_id'].count()
        print(large_by_region)

    events_file = f"fire_events_{year}_geographic_optimized.csv"
    stats_file = f"fire_event_stats_{year}_geographic_optimized.csv"
    clustering.save_results_to_csv(fire_events, events_file)
    clustering.save_results_to_csv(stats, stats_file)

    print("\nResults saved:")
    print(f"- {events_file}")
    print(f"- {stats_file}")

    print("\nResults by region:")
    region_summary = stats.groupby('region').agg({
        'fire_event_id': 'count',
        'spatial_extent_km': 'mean',
        'duration_days': 'mean'
    }).round(1)
    print(region_summary)

# Run the clustering pipeline only when this code is executed as a script.
if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from google.cloud import bigquery
import warnings
warnings.filterwarnings('ignore')

class GeographicFireEventClustering:
    """Cluster fire emission records into fire events region by region.

    Each rectangular region of ``define_geographic_regions`` is queried from
    the BigQuery table ``{project_id}.{dataset_id}.emission_{year}``,
    clustered spatially with DBSCAN, and each spatial cluster is cut into
    separate events where consecutive fire dates are more than the region's
    ``temporal_days`` apart.
    """

    # Sphere radius (km); converts haversine distances in radians to km.
    EARTH_RADIUS_KM = 6371

    def __init__(self, project_id, dataset_id):
        """Create a BigQuery client for ``project_id`` and remember the dataset."""
        self.client = bigquery.Client(project=project_id)
        self.dataset_id = dataset_id
        self.project_id = project_id

    def haversine_distance(self, lat1, lon1, lat2, lon2):
        """Calculate haversine distance between two points in kilometers.

        Coordinates are in degrees; scalars and equally shaped array-likes
        (such as pandas Series) are both accepted.
        """
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        half_dlat = (lat2 - lat1) / 2
        half_dlon = (lon2 - lon1) / 2
        a = np.sin(half_dlat) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon) ** 2
        # Keep `a` inside arcsin's domain despite floating-point rounding.
        a = np.clip(a, 0.0, 1.0)
        return self.EARTH_RADIUS_KM * 2 * np.arcsin(np.sqrt(a))

    def define_geographic_regions(self):
        """Define non-overlapping geographic regions to process separately.

        Returns a dict of region name -> half-open bounds in degrees
        (``min <= value < max``).  'Southeast' starts at latitude 32 and
        'Florida' reaches east to longitude -75 so that the two boxes no
        longer share a rectangle; the combined area is unchanged.  Shared
        area would make every record in it be queried and clustered twice.
        """
        regions = {
            'Pacific_West': {'lat_min': 32, 'lat_max': 49, 'lon_min': -125, 'lon_max': -115},
            'Mountain_West': {'lat_min': 32, 'lat_max': 49, 'lon_min': -115, 'lon_max': -105},
            'Great_Plains': {'lat_min': 32, 'lat_max': 49, 'lon_min': -105, 'lon_max': -95},
            'South_Texas': {'lat_min': 25, 'lat_max': 32, 'lon_min': -105, 'lon_max': -95},
            'South_Central': {'lat_min': 25, 'lat_max': 37, 'lon_min': -95, 'lon_max': -85},
            'Midwest': {'lat_min': 37, 'lat_max': 49, 'lon_min': -95, 'lon_max': -85},
            'Southeast': {'lat_min': 32, 'lat_max': 37, 'lon_min': -85, 'lon_max': -75},
            'Northeast': {'lat_min': 37, 'lat_max': 49, 'lon_min': -85, 'lon_max': -67},
            'Florida': {'lat_min': 25, 'lat_max': 32, 'lon_min': -85, 'lon_max': -75},
        }
        return regions

    def get_region_parameters(self, region_name):
        """Get the clustering parameters for a region.

        ``spatial_eps`` goes unchanged into DBSCAN with the haversine
        metric and is therefore a distance in radians (times
        ``EARTH_RADIUS_KM`` gives kilometers); ``temporal_days`` is the
        largest day gap tolerated inside one event.  Unknown names fall
        back to the default set.
        """
        region_params = {
            'Pacific_West': {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5},
            'Mountain_West': {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5},
            'Great_Plains': {'spatial_eps': 0.003, 'temporal_days': 2, 'min_samples': 5},
            'South_Texas': {'spatial_eps': 0.003, 'temporal_days': 2, 'min_samples': 5},
            'South_Central': {'spatial_eps': 0.003, 'temporal_days': 2, 'min_samples': 5},
            'Midwest': {'spatial_eps': 0.003, 'temporal_days': 2, 'min_samples': 5},
            'Southeast': {'spatial_eps': 0.002, 'temporal_days': 1, 'min_samples': 5},
            'Northeast': {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5},
            'Florida': {'spatial_eps': 0.002, 'temporal_days': 1, 'min_samples': 5},
        }
        default_params = {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5}
        return region_params.get(region_name, default_params)

    def process_geographic_region(self, year, region_name, region_bounds):
        """Query one region's records for ``year`` and cluster them into events.

        Returns the clustered records with ``fire_event_id`` (0-based within
        the region) and ``region`` columns, or an empty DataFrame when the
        region has no records or no clusters.
        """
        params = self.get_region_parameters(region_name)
        spatial_eps = params['spatial_eps']
        temporal_days = params['temporal_days']
        min_samples = params['min_samples']

        table_name = f"emission_{year}"

        # DBSCAN reads eps in radians, so the summary reports radians and km.
        eps_km = spatial_eps * self.EARTH_RADIUS_KM
        print(f"\nProcessing region: {region_name}")
        print(f"   Bounds: {region_bounds}")
        print(f"   Parameters: eps={spatial_eps} rad (~{eps_km:.1f}km), temporal={temporal_days}d, min_samples={min_samples}")

        query = f"""
        SELECT
            id, year, doy, longitude, latitude, fire_date,
            grid10k, covertype, fuelcode, area_burned,
            consumed_fuel, ECO2, burn_source, burnday_source
        FROM `{self.project_id}.{self.dataset_id}.{table_name}`
        WHERE longitude IS NOT NULL
        AND latitude IS NOT NULL
        AND fire_date IS NOT NULL
        AND latitude >= {region_bounds['lat_min']}
        AND latitude < {region_bounds['lat_max']}
        AND longitude >= {region_bounds['lon_min']}
        AND longitude < {region_bounds['lon_max']}
        ORDER BY fire_date, longitude, latitude
        """

        df = self.client.query(query).to_dataframe()

        if len(df) == 0:
            print(f"   No data found for {region_name}")
            return pd.DataFrame()

        print(f"   Found {len(df):,} records")

        df['fire_date'] = pd.to_datetime(df['fire_date'])

        clustered_df = self._cluster_region_data(df, spatial_eps, temporal_days, min_samples)

        if len(clustered_df) == 0:
            print(f"   No fire events found in {region_name}")
            return pd.DataFrame()

        clustered_df['region'] = region_name
        unique_events = clustered_df['fire_event_id'].nunique()
        print(f"   Found {unique_events} fire events in {region_name}")
        return clustered_df

    @staticmethod
    def _temporal_group_ids(fire_dates, temporal_days):
        """Number the runs of a sorted date Series, starting at 0.

        A new run begins at every date that is more than ``temporal_days``
        whole days after its predecessor.
        """
        gap_days = fire_dates.diff().dt.days  # NaN for the first row
        return (gap_days > temporal_days).cumsum()

    def _cluster_region_data(self, df, spatial_eps, temporal_days, min_samples):
        """Apply DBSCAN clustering to data from a single geographic region.

        Noise points (label -1) are dropped; every remaining spatial cluster
        is split into events at temporal gaps.  Returns the clustered rows
        with a ``fire_event_id`` column, or an empty DataFrame when nothing
        clusters.  ``df`` itself is left unmodified.
        """
        if len(df) == 0:
            return pd.DataFrame()

        # scikit-learn's haversine metric expects [latitude, longitude] in radians.
        coords_rad = np.radians(df[['latitude', 'longitude']].to_numpy(dtype=float))

        dbscan = DBSCAN(eps=spatial_eps, min_samples=min_samples, metric='haversine')
        spatial_labels = dbscan.fit_predict(coords_rad)

        fire_events = []
        event_id = 0

        for label in pd.unique(spatial_labels):
            if label == -1:
                continue

            cluster_data = df[spatial_labels == label].sort_values('fire_date')
            group_ids = self._temporal_group_ids(cluster_data['fire_date'], temporal_days)

            fire_events.append(cluster_data.assign(fire_event_id=event_id + group_ids))
            event_id += int(group_ids.iloc[-1]) + 1

        if not fire_events:
            return pd.DataFrame()
        return pd.concat(fire_events, ignore_index=True)

    def process_year_geographic(self, year):
        """Process entire year using geographic chunking with region-specific parameters.

        Counts the usable rows of the year's table, clusters each region,
        renumbers the per-region event ids so they are unique overall, and
        returns the concatenated records (empty DataFrame when no region
        produced events).
        """
        print(f"GEOGRAPHIC FIRE CLUSTERING for {year}")
        print("Using region-specific optimized parameters:")

        regions = self.define_geographic_regions()

        for region_name in regions:
            params = self.get_region_parameters(region_name)
            eps_km = params['spatial_eps'] * self.EARTH_RADIUS_KM
            print(f"  {region_name}: eps={params['spatial_eps']} rad (~{eps_km:.1f}km), temporal={params['temporal_days']}d")

        print("=" * 80)
        total_query = f"""
        SELECT COUNT(*) as total_count
        FROM `{self.project_id}.{self.dataset_id}.emission_{year}`
        WHERE longitude IS NOT NULL
        AND latitude IS NOT NULL
        AND fire_date IS NOT NULL
        """

        total_result = self.client.query(total_query).to_dataframe()
        original_count = total_result['total_count'].iloc[0]

        all_regional_events = []
        global_event_id = 0

        for region_name, region_bounds in regions.items():
            regional_events = self.process_geographic_region(
                year, region_name, region_bounds
            )

            if len(regional_events) == 0:
                continue

            # Read the region-local maximum before shifting the ids.
            max_regional_id = regional_events['fire_event_id'].max()
            regional_events['fire_event_id'] += global_event_id
            global_event_id += max_regional_id + 1

            all_regional_events.append(regional_events)

        if not all_regional_events:
            return pd.DataFrame()

        combined_events = pd.concat(all_regional_events, ignore_index=True)

        print("\nFINAL RESULTS:")
        print(f"Total fire events: {combined_events['fire_event_id'].nunique()}")
        print(f"Total data points: {len(combined_events):,}")
        print(f"Original data points: {original_count:,}")
        print(f"Coverage: {len(combined_events)/original_count*100:.1f}% of original data")

        return combined_events

    def analyze_fire_events(self, fire_events_df):
        """Analyze the identified fire events.

        Returns one row per ``fire_event_id`` with point count, date range,
        bounding box, summed area/fuel/ECO2, region, inclusive duration in
        days, and the bounding-box diagonal in kilometers.  An empty input
        yields an empty DataFrame.
        """
        if len(fire_events_df) == 0:
            return pd.DataFrame()

        event_stats = fire_events_df.groupby('fire_event_id').agg(
            num_points=('id', 'count'),
            start_date=('fire_date', 'min'),
            end_date=('fire_date', 'max'),
            min_lon=('longitude', 'min'),
            max_lon=('longitude', 'max'),
            min_lat=('latitude', 'min'),
            max_lat=('latitude', 'max'),
            total_area_burned=('area_burned', 'sum'),
            total_consumed_fuel=('consumed_fuel', 'sum'),
            total_ECO2=('ECO2', 'sum'),
            region=('region', 'first'),
        ).reset_index()

        event_stats['duration_days'] = (event_stats['end_date'] - event_stats['start_date']).dt.days + 1
        # haversine_distance is vectorised, so no per-row apply is needed.
        event_stats['spatial_extent_km'] = self.haversine_distance(
            event_stats['min_lat'], event_stats['min_lon'],
            event_stats['max_lat'], event_stats['max_lon'],
        )

        return event_stats

    def save_results_to_csv(self, fire_events_df, filename):
        """Save results to CSV file (without the index) and report the path."""
        fire_events_df.to_csv(filename, index=False)
        print(f"Results saved to {filename}")

def main():
    """Cluster the 2004 emission table into fire events and report the results.

    Prints overall and per-region statistics, a quality check on events
    whose bounding-box diagonal exceeds 500 km / 1000 km, and writes the
    clustered records and the per-event statistics to ``*_final.csv`` files
    in the current directory.
    """
    # The original banner line was missing its closing quote (SyntaxError).
    print("GEOGRAPHIC FIRE CLUSTERING")
    print("This approach processes data by geographic regions to prevent")
    print("continent-spanning 'mega-fires' caused by temporal chunking.")
    print()

    clustering = GeographicFireEventClustering(
        project_id="code-for-planet",
        dataset_id="emission_db"
    )

    year = 2004
    fire_events = clustering.process_year_geographic(year=year)

    if len(fire_events) == 0:
        print("No fire events found")
        return

    stats = clustering.analyze_fire_events(fire_events)

    print("\nGEOGRAPHIC CLUSTERING RESULTS:")
    print(f"Fire events found: {len(stats)}")
    print(f"Average duration: {stats['duration_days'].mean():.1f} days")
    print(f"Average spatial extent: {stats['spatial_extent_km'].mean():.1f} km")
    print(f"Max spatial extent: {stats['spatial_extent_km'].max():.1f} km")

    large_events = stats[stats['spatial_extent_km'] > 500]
    very_large_events = stats[stats['spatial_extent_km'] > 1000]

    print("\nQUALITY CHECK:")
    print(f"Events >500km: {len(large_events)} (target: <5)")
    print(f"Events >1000km: {len(very_large_events)} (target: 0)")

    if len(large_events) > 0:
        print("\nRemaining large events by region:")
        large_by_region = large_events.groupby('region')['fire_event_id'].count()
        print(large_by_region)

    events_file = f"fire_events_{year}_final.csv"
    stats_file = f"fire_event_stats_{year}_final.csv"
    clustering.save_results_to_csv(fire_events, events_file)
    clustering.save_results_to_csv(stats, stats_file)

    print("\nResults saved:")
    print(f"- {events_file}")
    print(f"- {stats_file}")

    print("\nResults by region:")
    region_summary = stats.groupby('region').agg({
        'fire_event_id': 'count',
        'spatial_extent_km': 'mean',
        'duration_days': 'mean'
    }).round(1)
    print(region_summary)

# Run the clustering pipeline only when this code is executed as a script.
if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from google.cloud import bigquery
import warnings
warnings.filterwarnings('ignore')

class GeographicFireEventClustering:
    """Turn fire emission records into fire events, processing one region at a time.

    For every rectangular region the matching rows of the BigQuery table
    ``{project_id}.{dataset_id}.emission_{year}`` are clustered spatially
    with DBSCAN; each spatial cluster is then divided into events wherever
    consecutive fire dates lie more than the region's ``temporal_days``
    apart.
    """

    # Sphere radius (km); converts haversine distances in radians to km.
    EARTH_RADIUS_KM = 6371

    def __init__(self, project_id, dataset_id):
        """Create a BigQuery client for ``project_id`` and remember the dataset."""
        self.client = bigquery.Client(project=project_id)
        self.dataset_id = dataset_id
        self.project_id = project_id

    def haversine_distance(self, lat1, lon1, lat2, lon2):
        """Calculate haversine distance between two points in kilometers.

        Coordinates are in degrees; scalars and equally shaped array-likes
        (such as pandas Series) are both accepted.
        """
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        sin_half_dlat = np.sin((lat2 - lat1) / 2)
        sin_half_dlon = np.sin((lon2 - lon1) / 2)
        a = sin_half_dlat ** 2 + np.cos(lat1) * np.cos(lat2) * sin_half_dlon ** 2
        # Keep `a` inside arcsin's domain despite floating-point rounding.
        a = np.clip(a, 0.0, 1.0)
        return self.EARTH_RADIUS_KM * 2 * np.arcsin(np.sqrt(a))

    def define_geographic_regions(self):
        """Define strictly non-overlapping geographic regions with clear boundaries.

        Returns a dict of region name -> half-open bounds in degrees
        (``min <= value < max``).  'Southeast' starts at latitude 32.0 and
        'Florida' reaches east to longitude -75.0, so the two boxes no
        longer share a rectangle while covering the same combined area.
        Shared area would make every record in it be queried and clustered
        twice.
        """
        regions = {
            'Pacific_West': {'lat_min': 32.0, 'lat_max': 49.0, 'lon_min': -125.0, 'lon_max': -115.0},
            'Mountain_West': {'lat_min': 32.0, 'lat_max': 49.0, 'lon_min': -115.0, 'lon_max': -105.0},
            'Great_Plains': {'lat_min': 32.0, 'lat_max': 49.0, 'lon_min': -105.0, 'lon_max': -95.0},
            'South_Texas': {'lat_min': 25.0, 'lat_max': 32.0, 'lon_min': -105.0, 'lon_max': -95.0},
            'South_Central': {'lat_min': 25.0, 'lat_max': 37.0, 'lon_min': -95.0, 'lon_max': -85.0},
            'Midwest': {'lat_min': 37.0, 'lat_max': 49.0, 'lon_min': -95.0, 'lon_max': -85.0},
            'Southeast': {'lat_min': 32.0, 'lat_max': 37.0, 'lon_min': -85.0, 'lon_max': -75.0},
            'Northeast': {'lat_min': 37.0, 'lat_max': 49.0, 'lon_min': -85.0, 'lon_max': -67.0},
            'Florida': {'lat_min': 25.0, 'lat_max': 32.0, 'lon_min': -85.0, 'lon_max': -75.0},
        }
        return regions

    def get_region_parameters(self, region_name):
        """Get the clustering parameters for a region.

        ``spatial_eps`` goes unchanged into DBSCAN with the haversine
        metric and is therefore a distance in radians (times
        ``EARTH_RADIUS_KM`` gives kilometers); ``temporal_days`` is the
        largest day gap tolerated inside one event.  Unknown names fall
        back to the default set.
        """
        region_params = {
            'Pacific_West': {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5},
            'Mountain_West': {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5},
            'Great_Plains': {'spatial_eps': 0.003, 'temporal_days': 2, 'min_samples': 5},
            'South_Texas': {'spatial_eps': 0.003, 'temporal_days': 2, 'min_samples': 5},
            'South_Central': {'spatial_eps': 0.002, 'temporal_days': 1, 'min_samples': 5},  # tighter
            'Midwest': {'spatial_eps': 0.003, 'temporal_days': 2, 'min_samples': 5},
            'Southeast': {'spatial_eps': 0.002, 'temporal_days': 1, 'min_samples': 5},
            'Northeast': {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5},
            'Florida': {'spatial_eps': 0.002, 'temporal_days': 1, 'min_samples': 5},
        }
        default_params = {'spatial_eps': 0.005, 'temporal_days': 3, 'min_samples': 5}
        return region_params.get(region_name, default_params)

    def process_geographic_region(self, year, region_name, region_bounds):
        """Query one region's records for ``year`` and cluster them into events.

        Returns the clustered records with ``fire_event_id`` (0-based within
        the region) and ``region`` columns, or an empty DataFrame when the
        region has no records or no clusters.
        """
        params = self.get_region_parameters(region_name)
        spatial_eps = params['spatial_eps']
        temporal_days = params['temporal_days']
        min_samples = params['min_samples']

        table_name = f"emission_{year}"

        # DBSCAN reads eps in radians, so the summary reports radians and km.
        eps_km = spatial_eps * self.EARTH_RADIUS_KM
        print(f"\nProcessing region: {region_name}")
        print(f"   Bounds: {region_bounds}")
        print(f"   Parameters: eps={spatial_eps} rad (~{eps_km:.1f}km), temporal={temporal_days}d, min_samples={min_samples}")

        query = f"""
        SELECT
            id, year, doy, longitude, latitude, fire_date,
            grid10k, covertype, fuelcode, area_burned,
            consumed_fuel, ECO2, burn_source, burnday_source
        FROM `{self.project_id}.{self.dataset_id}.{table_name}`
        WHERE longitude IS NOT NULL
        AND latitude IS NOT NULL
        AND fire_date IS NOT NULL
        AND latitude >= {region_bounds['lat_min']}
        AND latitude < {region_bounds['lat_max']}
        AND longitude >= {region_bounds['lon_min']}
        AND longitude < {region_bounds['lon_max']}
        ORDER BY fire_date, longitude, latitude
        """

        df = self.client.query(query).to_dataframe()

        if len(df) == 0:
            print(f"   No data found for {region_name}")
            return pd.DataFrame()

        print(f"   Found {len(df):,} records")

        df['fire_date'] = pd.to_datetime(df['fire_date'])

        clustered_df = self._cluster_region_data(df, spatial_eps, temporal_days, min_samples)

        if len(clustered_df) == 0:
            print(f"   No fire events found in {region_name}")
            return pd.DataFrame()

        clustered_df['region'] = region_name
        unique_events = clustered_df['fire_event_id'].nunique()
        print(f"   Found {unique_events} fire events in {region_name}")
        return clustered_df

    @staticmethod
    def _temporal_group_ids(fire_dates, temporal_days):
        """Number the runs of a sorted date Series, starting at 0.

        A new run begins at every date that is more than ``temporal_days``
        whole days after its predecessor.
        """
        gap_days = fire_dates.diff().dt.days  # NaN for the first row
        return (gap_days > temporal_days).cumsum()

    def _cluster_region_data(self, df, spatial_eps, temporal_days, min_samples):
        """Apply DBSCAN clustering to data from a single geographic region.

        Noise points (label -1) are dropped; every remaining spatial cluster
        is split into events at temporal gaps.  Returns the clustered rows
        with a ``fire_event_id`` column, or an empty DataFrame when nothing
        clusters.  ``df`` itself is left unmodified.
        """
        if len(df) == 0:
            return pd.DataFrame()

        # scikit-learn's haversine metric expects [latitude, longitude] in radians.
        coords_rad = np.radians(df[['latitude', 'longitude']].to_numpy(dtype=float))

        dbscan = DBSCAN(eps=spatial_eps, min_samples=min_samples, metric='haversine')
        spatial_labels = dbscan.fit_predict(coords_rad)

        fire_events = []
        event_id = 0

        for label in pd.unique(spatial_labels):
            if label == -1:
                continue

            cluster_data = df[spatial_labels == label].sort_values('fire_date')
            group_ids = self._temporal_group_ids(cluster_data['fire_date'], temporal_days)

            fire_events.append(cluster_data.assign(fire_event_id=event_id + group_ids))
            event_id += int(group_ids.iloc[-1]) + 1

        if not fire_events:
            return pd.DataFrame()
        return pd.concat(fire_events, ignore_index=True)

    def process_year_geographic(self, year):
        """Process entire year using geographic chunking with region-specific parameters.

        Counts the usable rows of the year's table, clusters each region,
        renumbers the per-region event ids so they are unique overall,
        removes any record that still appears more than once, and returns
        the concatenated records (empty DataFrame when no region produced
        events).
        """
        print(f"GEOGRAPHIC FIRE CLUSTERING for {year}")
        print("Using region-specific optimized parameters:")

        regions = self.define_geographic_regions()

        for region_name in regions:
            params = self.get_region_parameters(region_name)
            eps_km = params['spatial_eps'] * self.EARTH_RADIUS_KM
            print(f"  {region_name}: eps={params['spatial_eps']} rad (~{eps_km:.1f}km), temporal={params['temporal_days']}d")

        print("=" * 80)

        total_query = f"""
        SELECT COUNT(*) as total_count
        FROM `{self.project_id}.{self.dataset_id}.emission_{year}`
        WHERE longitude IS NOT NULL
        AND latitude IS NOT NULL
        AND fire_date IS NOT NULL
        """

        total_result = self.client.query(total_query).to_dataframe()
        original_count = total_result['total_count'].iloc[0]

        all_regional_events = []
        global_event_id = 0

        for region_name, region_bounds in regions.items():
            regional_events = self.process_geographic_region(
                year, region_name, region_bounds
            )

            if len(regional_events) == 0:
                continue

            # Read the region-local maximum before shifting the ids.
            max_regional_id = regional_events['fire_event_id'].max()
            regional_events['fire_event_id'] += global_event_id
            global_event_id += max_regional_id + 1

            all_regional_events.append(regional_events)

        if not all_regional_events:
            return pd.DataFrame()

        combined_events = pd.concat(all_regional_events, ignore_index=True)

        # Safety net: the region boxes are disjoint, so this should remove
        # nothing unless the source table itself repeats an id.
        initial_count = len(combined_events)
        combined_events = combined_events.drop_duplicates(subset=['id'], keep='first')
        final_count = len(combined_events)

        if initial_count != final_count:
            print(f"WARNING: Removed {initial_count - final_count} duplicate records")

        print("\nFINAL RESULTS:")
        print(f"Total fire events: {combined_events['fire_event_id'].nunique()}")
        print(f"Total data points: {len(combined_events):,}")
        print(f"Original data points: {original_count:,}")
        print(f"Coverage: {len(combined_events)/original_count*100:.1f}% of original data")

        if len(combined_events) > original_count:
            print("WARNING: More data points than original - investigating...")

            print("Regional data point totals:")
            for events in all_regional_events:
                region = events['region'].iloc[0]
                print(f"  {region}: {len(events):,} points")

        return combined_events

    def analyze_fire_events(self, fire_events_df):
        """Analyze the identified fire events.

        Returns one row per ``fire_event_id`` with point count, date range,
        bounding box, summed area/fuel/ECO2, region, inclusive duration in
        days, and the bounding-box diagonal in kilometers.  An empty input
        yields an empty DataFrame.
        """
        if len(fire_events_df) == 0:
            return pd.DataFrame()

        event_stats = fire_events_df.groupby('fire_event_id').agg(
            num_points=('id', 'count'),
            start_date=('fire_date', 'min'),
            end_date=('fire_date', 'max'),
            min_lon=('longitude', 'min'),
            max_lon=('longitude', 'max'),
            min_lat=('latitude', 'min'),
            max_lat=('latitude', 'max'),
            total_area_burned=('area_burned', 'sum'),
            total_consumed_fuel=('consumed_fuel', 'sum'),
            total_ECO2=('ECO2', 'sum'),
            region=('region', 'first'),
        ).reset_index()

        event_stats['duration_days'] = (event_stats['end_date'] - event_stats['start_date']).dt.days + 1
        # haversine_distance is vectorised, so no per-row apply is needed.
        event_stats['spatial_extent_km'] = self.haversine_distance(
            event_stats['min_lat'], event_stats['min_lon'],
            event_stats['max_lat'], event_stats['max_lon'],
        )

        return event_stats

    def save_results_to_csv(self, fire_events_df, filename):
        """Save results to CSV file (without the index) and report the path."""
        fire_events_df.to_csv(filename, index=False)
        print(f"Results saved to {filename}")


def main():
    """Run the geographic fire clustering for one year and report the results.

    Creates a ``GeographicFireEventClustering`` for the hard-coded project and
    dataset, calls ``process_year_geographic`` for ``year``, prints summary
    and quality-check statistics for the resulting events, and saves both the
    per-point events and the per-event statistics as CSV files. Prints a
    short message and returns when no events come back.
    """
    print("GEOGRAPHIC FIRE CLUSTERING")
    print("This approach processes data by geographic regions to prevent")
    print("continent-spanning 'mega-fires' caused by temporal chunking.")
    print()

    clustering = GeographicFireEventClustering(
        project_id="code-for-planet",
        dataset_id="emission_db"
    )

    year = 2004
    fire_events = clustering.process_year_geographic(year=year)

    if len(fire_events) == 0:
        print("No fire events found")
        return

    stats = clustering.analyze_fire_events(fire_events)

    print("\nGEOGRAPHIC CLUSTERING RESULTS:")
    print(f"Fire events found: {len(stats)}")
    print(f"Average duration: {stats['duration_days'].mean():.1f} days")
    print(f"Average spatial extent: {stats['spatial_extent_km'].mean():.1f} km")
    print(f"Max spatial extent: {stats['spatial_extent_km'].max():.1f} km")

    # Events spanning hundreds of kilometres are treated as suspect clusters;
    # the printed targets below state how many are considered acceptable.
    large_events = stats[stats['spatial_extent_km'] > 500]
    very_large_events = stats[stats['spatial_extent_km'] > 1000]

    print("\nQUALITY CHECK:")
    print(f"Events >500km: {len(large_events)} (target: <5)")
    print(f"Events >1000km: {len(very_large_events)} (target: 0)")

    if len(large_events) > 0:
        print("\nRemaining large events by region:")
        large_by_region = large_events.groupby('region')['fire_event_id'].count()
        print(large_by_region)

    # Build each output name once so the files written and the names
    # reported to the user cannot drift apart.
    events_csv = f"fire_events_{year}_final.csv"
    stats_csv = f"fire_event_stats_{year}_final.csv"
    clustering.save_results_to_csv(fire_events, events_csv)
    clustering.save_results_to_csv(stats, stats_csv)

    print("\nResults saved:")
    print(f"- {events_csv}")
    print(f"- {stats_csv}")

    print("\nResults by region:")
    region_summary = stats.groupby('region').agg({
        'fire_event_id': 'count',
        'spatial_extent_km': 'mean',
        'duration_days': 'mean'
    }).round(1)
    print(region_summary)

# Run the clustering pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()