Initial Fire Identification (Attempt 1) - Gathering info and attempting identification

In [None]:
from google.cloud import bigquery

def check_table_schema(project="code-for-planet", dataset="emission_db", table="emission_2004"):
    """Print the schema, a 3-row sample and a required-column check for one BigQuery table.

    Parameters:
    - project: Project that owns the table and is used to create the client
    - dataset: Dataset containing the table
    - table: Table name

    The defaults reproduce the original hard-coded target
    (`code-for-planet.emission_db.emission_2004`). Nothing is returned; all
    results are printed.
    """
    client = bigquery.Client(project=project)

    # Build the fully-qualified id once so the schema lookup and the sample
    # query can never point at different tables.
    table_path = f"{project}.{dataset}.{table}"

    # table schema (a string id replaces the deprecated client.dataset(...).table(...))
    table_obj = client.get_table(table_path)
    available_cols = [field.name for field in table_obj.schema]

    print("=== Table Schema ===")
    print(f"Table: {table_obj.full_table_id}")
    print(f"Number of rows: {table_obj.num_rows:,}")
    print(f"Number of columns: {len(table_obj.schema)}")
    print()

    print("=== Column Names and Types ===")
    for field in table_obj.schema:
        print(f"{field.name}: {field.field_type}")
    print()

    # sample data
    query = f"""
    SELECT *
    FROM `{table_path}`
    LIMIT 3
    """

    print("=== Sample Data (first 3 rows) ===")
    df = client.query(query).to_dataframe()
    print(df.head())
    print()

    # check key columns we need
    required_cols = ['longitude', 'latitude', 'fire_date', 'year', 'doy']

    print("=== Required Columns Status ===")
    for col in required_cols:
        if col in available_cols:
            print(f"{col}: Available")
            continue

        print(f"✗ {col}: Missing")
        # Suggest columns whose name contains, or is contained in, the missing
        # name (case-insensitive).
        wanted = col.lower()
        similar = [c for c in available_cols if wanted in c.lower() or c.lower() in wanted]
        if similar:
            print(f"  Similar columns: {similar}")

if __name__ == "__main__":
    check_table_schema()

=== Table Schema ===
Table: code-for-planet:emission_db.emission_2004
Number of rows: 239,663
Number of columns: 37

=== Column Names and Types ===
id: INTEGER
year: INTEGER
doy: INTEGER
longitude: FLOAT
latitude: FLOAT
grid10k: INTEGER
covertype: INTEGER
fuelcode: INTEGER
area_burned: FLOAT
prefire_fuel: FLOAT
consumed_fuel: FLOAT
ECO2: FLOAT
ECO: FLOAT
ECH4: FLOAT
EPM2_5: FLOAT
cwd_frac: FLOAT
duff_frac: FLOAT
fuel_moisture_class: INTEGER
burn_source: INTEGER
burnday_source: INTEGER
BSEV: INTEGER
BSEV_flag: INTEGER
fire_date: DATE
bi_value: FLOAT
fm100_value: FLOAT
pet_value: FLOAT
fm1000_value: FLOAT
pr_value: FLOAT
rmax_value: FLOAT
rmin_value: FLOAT
sph_value: FLOAT
srad_value: FLOAT
tmmn_value: FLOAT
th_value: FLOAT
tmmx_value: FLOAT
vpd_value: FLOAT
vs_value: FLOAT

=== Sample Data (first 3 rows) ===
      id  year  doy  longitude  latitude  grid10k  covertype  fuelcode  \
0  91593  2004    0  -113.3554   36.5108    62778          1         1   
1  91594  2004    0  -113.3559   

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

class FireEventClustering:
    """Group per-record fire emission rows from BigQuery into discrete fire events.

    Records are first clustered spatially with DBSCAN (haversine metric), then
    each spatial cluster is split wherever consecutive records are more than
    `temporal_days` apart. Every resulting (spatial cluster, temporal group)
    pair becomes one fire event.

    Limitation: `process_year_chunked` clusters each month independently, so a
    fire that spans a month boundary is reported as separate events.
    """

    def __init__(self, project_id, dataset_id):
        self.client = bigquery.Client(project=project_id)
        self.dataset_id = dataset_id
        self.project_id = project_id

    def haversine_distance(self, lat1, lon1, lat2, lon2):
        """Calculate haversine distance between two points in kilometers.

        Inputs are in decimal degrees and may be scalars or array-likes
        (NumPy arrays / pandas Series); the result has the matching shape.
        """
        R = 6371  # earths radius in kms

        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1

        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        # Rounding can push `a` marginally outside [0, 1] for (near-)antipodal
        # points, which would make arcsin return NaN.
        a = np.clip(a, 0.0, 1.0)
        c = 2 * np.arcsin(np.sqrt(a))

        return R * c

    @staticmethod
    def _month_periods(min_date, max_date):
        """Yield (start, end_exclusive) windows of at most one month covering [min_date, max_date]."""
        end_exclusive = max_date + pd.DateOffset(days=1)
        period_start = min_date
        while period_start <= max_date:
            period_end = min(period_start + pd.DateOffset(months=1), end_exclusive)
            yield period_start, period_end
            period_start = period_end

    def process_year_chunked(self, year, spatial_eps=0.02, temporal_days=7, min_samples=3, chunk_size=10000):
        """
        Process fire events for a single year in monthly chunks to manage memory

        Parameters:
        - year: Year to process (selects table `emission_<year>`)
        - spatial_eps: Spatial epsilon in degrees of arc (~2.2km for 0.02)
        - temporal_days: Temporal window for grouping (days)
        - min_samples: Minimum samples for DBSCAN cluster
        - chunk_size: Accepted for backward compatibility; currently unused
          because chunking is done by calendar month, not by row count

        Returns a DataFrame of the clustered records with a globally unique
        `fire_event_id` column, or an empty DataFrame when nothing was found.
        A chunk whose clustering raises is reported and skipped.
        """
        table_ref = f"{self.project_id}.{self.dataset_id}.emission_{year}"

        # total count & date range
        count_query = f"""
        SELECT
            COUNT(*) as total_rows,
            MIN(fire_date) as min_date,
            MAX(fire_date) as max_date
        FROM `{table_ref}`
        WHERE longitude IS NOT NULL
        AND latitude IS NOT NULL
        AND fire_date IS NOT NULL
        """

        print(f"Getting data overview for {year}...")
        overview = self.client.query(count_query).to_dataframe()
        total_rows = overview['total_rows'].iloc[0]
        print(f"Total valid rows: {total_rows:,}")

        # Bail out before touching the dates: with no rows MIN/MAX are NULL and
        # cannot be formatted as calendar dates.
        if total_rows == 0:
            return pd.DataFrame()

        min_date = pd.Timestamp(overview['min_date'].iloc[0])
        max_date = pd.Timestamp(overview['max_date'].iloc[0])
        print(f"Date range: {min_date.date()} to {max_date.date()}")

        all_fire_events = []
        event_id_offset = 0

        # Iterating over precomputed windows guarantees progress even when a
        # chunk is skipped (an in-loop date bump after `continue` would not).
        for period_start, period_end in self._month_periods(min_date, max_date):
            print(f"\nProcessing period: {period_start.date()} to {(period_end - pd.DateOffset(days=1)).date()}")

            # query for this time period
            chunk_query = f"""
            SELECT
                id,
                year,
                doy,
                longitude,
                latitude,
                fire_date,
                grid10k,
                covertype,
                fuelcode,
                area_burned,
                consumed_fuel,
                ECO2,
                burn_source,
                burnday_source
            FROM `{table_ref}`
            WHERE longitude IS NOT NULL
            AND latitude IS NOT NULL
            AND fire_date IS NOT NULL
            AND fire_date >= '{period_start.date()}'
            AND fire_date < '{period_end.date()}'
            ORDER BY fire_date, longitude, latitude
            """

            chunk_df = self.client.query(chunk_query).to_dataframe()
            if len(chunk_df) == 0:
                continue

            print(f"  Found {len(chunk_df)} records")

            # Convert fire_date to datetime
            chunk_df['fire_date'] = pd.to_datetime(chunk_df['fire_date'])
            chunk_df['day_of_year'] = chunk_df['fire_date'].dt.dayofyear

            try:
                spatial_clusters = self._fixed_spatial_clustering(chunk_df, spatial_eps, min_samples)
                chunk_fire_events = self._temporal_refinement(spatial_clusters, temporal_days)
            except Exception as e:
                # Best effort: report the failure and move on to the next month.
                print(f"  Error processing chunk: {e}")
                continue

            if len(chunk_fire_events) == 0:
                continue

            # Shift the chunk-local ids so they stay unique across chunks.
            max_event_id = chunk_fire_events['fire_event_id'].max()
            chunk_fire_events['fire_event_id'] += event_id_offset
            event_id_offset += max_event_id + 1

            all_fire_events.append(chunk_fire_events)
            print(f"  Found {chunk_fire_events['fire_event_id'].nunique()} fire events in this chunk")

        # combine all chunks
        if not all_fire_events:
            return pd.DataFrame()

        combined_events = pd.concat(all_fire_events, ignore_index=True)
        print(f"\nTotal fire events found: {combined_events['fire_event_id'].nunique()}")
        return combined_events

    def _fixed_spatial_clustering(self, df, eps, min_samples):
        """Apply DBSCAN clustering with a fixed distance threshold.

        `eps` is given in degrees of arc. The haversine metric works on
        [latitude, longitude] pairs in radians and returns angular distance in
        radians, so both the coordinates and `eps` are converted here.

        Adds a `spatial_cluster` column to `df` (in place; -1 marks noise) and
        returns `df`.
        """
        coords_rad = np.radians(df[['latitude', 'longitude']].to_numpy())

        dbscan = DBSCAN(eps=np.radians(eps), min_samples=min_samples, metric='haversine')
        spatial_labels = dbscan.fit_predict(coords_rad)

        df['spatial_cluster'] = spatial_labels

        print(f"    Fixed spatial clustering found {len(set(spatial_labels)) - (1 if -1 in spatial_labels else 0)} clusters")
        print(f"    Noise points: {(spatial_labels == -1).sum()}")

        return df

    def adaptive_spatial_clustering(self, df, min_samples=3):
        """
        Adaptive clustering that handles fires of different sizes

        Runs DBSCAN at several distance thresholds and keeps the labelling
        with the highest silhouette score over the non-noise points. Falls
        back to a single 0.02-degree DBSCAN when no threshold can be scored.

        Adds a `spatial_cluster` column to `df` (in place) and returns `df`.
        """
        from sklearn.metrics import silhouette_score

        # [latitude, longitude] in radians, as the haversine metric requires
        coords_rad = np.radians(df[['latitude', 'longitude']].to_numpy())

        # Thresholds in degrees of arc (~1km, 2km, 5km, 10km)
        distance_thresholds = [0.01, 0.02, 0.05, 0.1]

        best_clustering = None
        best_score = -1

        for eps in distance_thresholds:
            dbscan = DBSCAN(eps=np.radians(eps), min_samples=min_samples, metric='haversine')
            labels = dbscan.fit_predict(coords_rad)

            # Score non-noise points only.
            mask = labels != -1
            n_clustered = int(mask.sum())
            n_clusters = int(np.unique(labels[mask]).size)

            # The silhouette score needs at least 2 clusters and fewer
            # clusters than points; skip thresholds that cannot be scored.
            if n_clusters < 2 or n_clusters >= n_clustered or n_clustered <= min_samples:
                continue

            score = silhouette_score(coords_rad[mask], labels[mask], metric='haversine')
            print(f"Distance threshold {eps*111:.1f}km: {n_clusters} clusters, score: {score:.3f}")

            if score > best_score:
                best_score = score
                best_clustering = labels

        if best_clustering is not None:
            df['spatial_cluster'] = best_clustering
            print(f"Best clustering selected with score: {best_score:.3f}")
        else:
            # fallback to default DBSCAN
            dbscan = DBSCAN(eps=np.radians(0.02), min_samples=min_samples, metric='haversine')
            df['spatial_cluster'] = dbscan.fit_predict(coords_rad)
            print("Using fallback clustering")

        return df

    def _temporal_refinement(self, df, temporal_days):
        """Refine spatial clusters by temporal proximity.

        Within each non-noise spatial cluster, records are ordered by
        `fire_date` and a new event starts whenever the gap to the previous
        record exceeds `temporal_days` whole days.

        Returns the non-noise records with a `fire_event_id` column (ids start
        at 0) and without `spatial_cluster`; an empty DataFrame if every
        record is noise.
        """
        fire_events = []
        next_event_id = 0

        for spatial_cluster in df['spatial_cluster'].unique():
            if spatial_cluster == -1:  # Skip noise points
                continue

            cluster_data = (
                df[df['spatial_cluster'] == spatial_cluster]
                .sort_values('fire_date', kind='stable')
                .copy()
            )

            # Gap in whole days to the previous record; the first record has
            # no predecessor (NaN), which compares False and stays in group 0.
            gap_days = pd.to_datetime(cluster_data['fire_date']).diff().dt.days
            temp_group = (gap_days > temporal_days).cumsum()

            cluster_data['fire_event_id'] = next_event_id + temp_group
            next_event_id += int(temp_group.iloc[-1]) + 1
            fire_events.append(cluster_data)

        if not fire_events:
            return pd.DataFrame()

        result_df = pd.concat(fire_events, ignore_index=True)
        return result_df.drop(columns=['spatial_cluster'])

    def analyze_fire_events(self, fire_events_df):
        """Summarise each fire event: size, date span, bounding box and totals.

        Returns one row per `fire_event_id` with point count, start/end date,
        lon/lat bounding box, summed area/fuel/CO2, `duration_days`
        (inclusive) and `spatial_extent_km` (haversine length of the bounding
        box diagonal). For an empty input an empty dict is returned, as
        before, so existing `len(...) == 0` checks keep working.
        """
        if len(fire_events_df) == 0:
            return {}

        event_stats = fire_events_df.groupby('fire_event_id').agg(
            num_points=('id', 'count'),
            start_date=('fire_date', 'min'),
            end_date=('fire_date', 'max'),
            min_lon=('longitude', 'min'),
            max_lon=('longitude', 'max'),
            min_lat=('latitude', 'min'),
            max_lat=('latitude', 'max'),
            total_area_burned=('area_burned', 'sum'),
            total_consumed_fuel=('consumed_fuel', 'sum'),
            total_ECO2=('ECO2', 'sum'),
        ).reset_index()

        # duration and spatial extent
        event_stats['duration_days'] = (event_stats['end_date'] - event_stats['start_date']).dt.days + 1
        # haversine_distance is vectorised, so whole columns can be passed.
        event_stats['spatial_extent_km'] = self.haversine_distance(
            event_stats['min_lat'], event_stats['min_lon'],
            event_stats['max_lat'], event_stats['max_lon'],
        )

        return event_stats

    def save_results_to_csv(self, fire_events_df, filename):
        """Save results to CSV file instead of BigQuery"""
        fire_events_df.to_csv(filename, index=False)
        print(f"Results saved to {filename}")

    def save_results(self, fire_events_df, output_table_name):
        """Save results back to BigQuery - COMMENTED OUT FOR SAFETY

        Currently only prints what it would do; no data is written.
        """
        print("WARNING: This method would write to BigQuery!")
        print("Uncomment the code below only when you're ready to save results")
        print(f"Would save to table: {self.dataset_id}.{output_table_name}")
        # DO NOT UNCOMMENT THIS
        # job_config = bigquery.LoadJobConfig(
        #     write_disposition="WRITE_TRUNCATE",
        #     schema_update_options=[bigquery.SchemaUpdateOption.ALLOW_FIELD_ADDITION]
        # )
        #
        # job = self.client.load_table_from_dataframe(
        #     fire_events_df,
        #     f"{self.dataset_id}.{output_table_name}",
        #     job_config=job_config
        # )
        #
        # job.result()  # Wait for the job to complete
        # print(f"Results saved to {self.dataset_id}.{output_table_name}")

#  2004
def main():
    """Cluster the 2004 emission records into fire events, print a summary and write two CSVs."""
    year = 2004

    clustering = FireEventClustering(project_id="code-for-planet", dataset_id="emission_db")

    fire_events = clustering.process_year_chunked(
        year=year,
        spatial_eps=0.02,  # ~2.2km at equator
        temporal_days=7,   # 7-day temporal window
        min_samples=3,     # Minimum 3 points per cluster
        chunk_size=10000   # Process 10k rows at a time
    )

    # Nothing clustered: report and stop.
    if len(fire_events) == 0:
        print(f"No fire events found for year {year}")
        return

    stats = clustering.analyze_fire_events(fire_events)
    mean_duration = stats['duration_days'].mean()
    mean_extent = stats['spatial_extent_km'].mean()

    print(f"\nYear {year} - Found {len(stats)} fire events")
    print(f"Average duration: {mean_duration:.1f} days")
    print(f"Average spatial extent: {mean_extent:.1f} km")
    print(f"Total data points in events: {len(fire_events)}")

    # Persist the per-record table first, then the per-event summary.
    clustering.save_results_to_csv(fire_events, f"fire_events_{year}.csv")
    clustering.save_results_to_csv(stats, f"fire_event_stats_{year}.csv")

    # Preview the first ten events.
    preview_cols = ['fire_event_id', 'num_points', 'duration_days', 'spatial_extent_km',
                    'total_area_burned', 'total_ECO2']
    print("\nSample fire events:")
    print(stats[preview_cols].head(10))

if __name__ == "__main__":
    main()

Getting data overview for 2004...
Total valid rows: 239,663
Date range: 2003-12-31 to 2004-12-31

Processing period: 2003-12-31 to 2004-01-30
  Found 9778 records
    Fixed spatial clustering found 11 clusters
    Noise points: 4
  Found 13 fire events in this chunk

Processing period: 2004-01-31 to 2004-02-28
  Found 11391 records
    Fixed spatial clustering found 14 clusters
    Noise points: 6
  Found 15 fire events in this chunk

Processing period: 2004-02-29 to 2004-03-28
  Found 46091 records
    Fixed spatial clustering found 14 clusters
    Noise points: 4
  Found 15 fire events in this chunk

Processing period: 2004-03-29 to 2004-04-28
  Found 30759 records
    Fixed spatial clustering found 15 clusters
    Noise points: 11
  Found 19 fire events in this chunk

Processing period: 2004-04-29 to 2004-05-28
  Found 14452 records
    Fixed spatial clustering found 19 clusters
    Noise points: 1
  Found 22 fire events in this chunk

Processing period: 2004-05-29 to 2004-06-28
  F

In [None]:
import os

# Where are we running?
print("Current working directory:")
print(os.getcwd())

# Everything in the working directory
print("\nAll files in current directory:")
for entry in os.listdir('.'):
    print(f"  {entry}")

# CSV outputs and their sizes
csv_files = [name for name in os.listdir('.') if name.endswith('.csv')]
print(f"\nCSV files found ({len(csv_files)}):")
for csv_name in csv_files:
    print(f"  {csv_name}")
    print(f"    Size: {os.path.getsize(csv_name):,} bytes")

# Peek at the first CSV, if there is one
if csv_files:
    first_csv = csv_files[0]
    print(f"\nPreview of {first_csv}:")
    df = pd.read_csv(first_csv)
    print(f"  Shape: {df.shape}")
    print(df.head(3))

Current working directory:
/content

All files in current directory:
  fire_event_stats_2004.csv
  fire_events_2004.csv

CSV files found (2):
  fire_event_stats_2004.csv
    Size: 22,406 bytes
  fire_events_2004.csv
    Size: 21,909,220 bytes

Preview of fire_event_stats_2004.csv:
  Shape: (187, 13)
   fire_event_id  num_points  start_date    end_date   min_lon   max_lon  \
0              0         487  2003-12-31  2004-01-30 -114.0156 -111.1802   
1              1           9  2004-01-01  2004-01-04 -123.6575 -123.6529   
2              2          25  2004-01-01  2004-01-07 -121.8666 -121.8453   

   min_lat  max_lat  total_area_burned  total_consumed_fuel    total_ECO2  \
0  36.5108  40.7771         23750000.0        708644.127100  1.122436e+06   
1  42.1500  42.1642           250000.0          9738.199809  1.633096e+04   
2  42.4236  42.4424          1187500.0         71075.082111  1.104507e+05   

   duration_days  spatial_extent_km  
0             31         534.414109  
1        

In [None]:
from google.cloud import bigquery
import pandas as pd

# init client
client = bigquery.Client(project="code-for-planet")

# Load jobs append by default, so re-running this cell would duplicate every
# row. Replace the table contents instead so the cell is idempotent.
job_config = bigquery.LoadJobConfig(
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
)

# load csvs
# NOTE(review): date columns read back from CSV are plain strings here and are
# presumably uploaded as STRING rather than DATE - confirm the desired schema.
fire_events = pd.read_csv('fire_events_2004.csv')
fire_stats = pd.read_csv('fire_event_stats_2004.csv')

# upload fire events table
table_id = "code-for-planet.emission_db.fire_events_2004"
job = client.load_table_from_dataframe(fire_events, table_id, job_config=job_config)
job.result()  # wait for the load job to finish
print(f"Loaded {len(fire_events)} rows into {table_id}")

# upload fire stats table
table_id = "code-for-planet.emission_db.fire_event_stats_2004"
job = client.load_table_from_dataframe(fire_stats, table_id, job_config=job_config)
job.result()  # wait for the load job to finish
print(f"Loaded {len(fire_stats)} rows into {table_id}")

print("Files successfully uploaded to BigQuery!")

Loaded 239612 rows into code-for-planet.emission_db.fire_events_2004
Loaded 187 rows into code-for-planet.emission_db.fire_event_stats_2004
Files successfully uploaded to BigQuery!
