In [None]:
# Install libraries for GCS access, NetCDF files, and data handling
!pip install gcsfs netcdf4 pandas xarray scipy

Collecting netcdf4
  Downloading netCDF4-1.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (1.8 kB)
Collecting cftime (from netcdf4)
  Downloading cftime-1.6.4.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.7 kB)
Downloading netCDF4-1.7.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (9.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.3/9.3 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cftime-1.6.4.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.4/1.4 MB[0m [31m74.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: cftime, netcdf4
Successfully installed cftime-1.6.4.post1 netcdf4-1.7.2


In [None]:
# Cell 1: Install all necessary libraries
!pip install google-cloud-storage gcsfs netcdf4 pandas xarray scipy



In [None]:
import pandas as pd
import numpy as np
import xarray as xr
import os
from google.cloud import storage
from scipy.spatial import cKDTree

# --- GCS bucket name and the gs:// prefix built from it ---
BUCKET_NAME = 'data_housee'
BASE_PATH = 'gs://' + BUCKET_NAME + '/'

# ==============================================================================
# 1. LOAD ALL THREE DATA SOURCES FROM GOOGLE CLOUD STORAGE
# ==============================================================================
print("Step 1: Loading all three data sources from GCS...")

# --- Source 1: BASE fire data ---
# One emissions CSV per year, for range(2003, 2016), stacked into one frame.
fire_files = [
    f"{BASE_PATH}emission_data/emissions_{year}.csv"
    for year in range(2003, 2016)
]
fire_df = pd.concat([pd.read_csv(path) for path in fire_files], ignore_index=True)

# Drop rows whose day-of-year is below 1 before building the date column
# (presumably placeholder rows that '%j' could not parse — TODO confirm).
fire_df = fire_df.loc[fire_df['doy'].ge(1)].copy()

# Build a "<year>-<doy>" string per row and parse it as year + day-of-year.
year_and_doy = fire_df['year'].astype(str) + '-' + fire_df['doy'].astype(str)
fire_df['date'] = pd.to_datetime(year_and_doy, format='%Y-%j')
print(f"Loaded {len(fire_df)} fire records.")

# --- Source 2: WIND DIRECTION data (our target) ---
# One wind-direction CSV per year, for range(2003, 2016), stacked into one frame.
wind_dir_files = [
    f"{BASE_PATH}emission_data/Wind Direction/wind_direction_{year}.csv"
    for year in range(2003, 2016)
]
wind_dir_df = pd.concat([pd.read_csv(path) for path in wind_dir_files], ignore_index=True)

# The 'date' column is parsed with the compact YYYYMMDD format.
parsed_wind_dates = pd.to_datetime(wind_dir_df['date'], format='%Y%m%d')
wind_dir_df['date'] = parsed_wind_dates
print(f"Loaded {len(wind_dir_df)} wind direction records.")

# --- Source 3: WIND SPEED data (our key predictor) ---
# The yearly NetCDF files are first downloaded from the bucket to local disk
# and then opened from the local copies with the netcdf4 engine.
print("\nDownloading Wind Speed NetCDF files from GCS...")
storage_client = storage.Client()
bucket = storage_client.bucket(BUCKET_NAME)
local_wind_speed_files = []

for year in range(2003, 2016):
    # Define remote path in GCS and local path in the notebook environment
    gcs_path = f"Weather_data/vs_{year}.nc"
    local_path = f"/tmp/vs_{year}.nc"

    # Download the file
    blob = bucket.blob(gcs_path)
    blob.download_to_filename(local_path)
    local_wind_speed_files.append(local_path)
    print(f"Downloaded {gcs_path} to {local_path}")

# Now, open the LOCAL files with xarray
print("\nLoading downloaded NetCDF files into xarray...")
wind_speed_ds = xr.open_mfdataset(local_wind_speed_files, combine='by_coords', engine='netcdf4')

# Make 'time' the real temporal axis of the dataset.
# Assigning `wind_speed_ds['time'] = ...` would only add a separate 'time'
# coordinate on a brand-new dimension while the data variables stay indexed
# by 'day'; a later `.sel(time=...)` would then not subset them along the
# temporal axis. Renaming the 'day' dimension/coordinate avoids that.
wind_speed_ds = wind_speed_ds.rename({'day': 'time'})
# Ensure the renamed coordinate holds pandas-compatible datetimes.
wind_speed_ds = wind_speed_ds.assign_coords(
    time=pd.to_datetime(wind_speed_ds['time'].values)
)
print("Loaded and combined all wind speed NetCDF files.")


# ==============================================================================
# 2. MERGE THE DATASETS (Optimized for Speed)
# ==============================================================================
print("\nStep 2: Merging data...")

# --- First Merge: Fire Data + Wind Direction ---
# For every fire date, each fire record is paired with the wind-direction row
# of the same date whose (latitude, longitude) is nearest in plain Euclidean
# distance on the raw coordinate values.
print("Performing first merge: Fire Data + Wind Direction...")

# Split the wind-direction table by date ONCE. Filtering the full table with a
# boolean mask inside the loop would rescan every row for every fire date.
# Rows keep their original relative order inside each group, so the matches
# are the same as with per-date filtering.
wind_dir_by_date = {day: rows for day, rows in wind_dir_df.groupby('date')}

merged_data_1 = []
for date, group in fire_df.groupby('date'):
    daily_wind_dir_df = wind_dir_by_date.get(date)
    if daily_wind_dir_df is None:
        # No wind-direction observations for this date: skip its fire records.
        continue

    # Nearest-neighbour lookup restricted to this date's wind-direction rows.
    daily_tree = cKDTree(daily_wind_dir_df[['latitude', 'longitude']].to_numpy())
    distances, indices = daily_tree.query(group[['latitude', 'longitude']].to_numpy(), k=1)

    # Align both sides positionally (fresh 0..n-1 index) before the
    # column-wise concat; wind-direction columns get a '_dir' suffix so they
    # do not collide with the fire columns of the same name.
    matched_wind_dir = daily_wind_dir_df.iloc[indices].reset_index(drop=True)
    group = group.reset_index(drop=True)

    merged_chunk = pd.concat([group, matched_wind_dir.add_suffix('_dir')], axis=1)
    merged_data_1.append(merged_chunk)

# The per-date split is only needed inside the loop; release its memory.
del wind_dir_by_date

if not merged_data_1:
    raise ValueError("First merge (fire + wind direction) resulted in an empty DataFrame.")

df_with_direction = pd.concat(merged_data_1, ignore_index=True)
print(f"Merge 1 complete. Result has {len(df_with_direction)} records.")

# --- Second Merge: Add Wind Speed (Vectorized for High Performance) ---
print("\nPerforming second merge: Adding Wind Speed (optimized method)...")

# Wrap each lookup key in a DataArray sharing the single "event" dimension so
# that the selection below is done point-by-point (one value per merged row).
event_dim = "event"
locations_lat = xr.DataArray(df_with_direction['latitude'].values, dims=event_dim)
locations_lon = xr.DataArray(df_with_direction['longitude'].values, dims=event_dim)
times = xr.DataArray(df_with_direction['date'].values, dims=event_dim)

# Nearest-neighbour selection on lat, lon and time in one call.
point_indexers = {'lat': locations_lat, 'lon': locations_lon, 'time': times}
speed_points = wind_speed_ds.sel(point_indexers, method='nearest')

# Attach the selected 'wind_speed' values as a new column.
df_with_direction['wind_speed'] = speed_points['wind_speed'].values

# --- Final Cleanup ---
# Keep only rows that have both a wind direction and a wind speed value.
required_columns = ['WD10M_dir', 'wind_speed']
final_df = df_with_direction.dropna(subset=required_columns).copy()
print(f"Merge 2 complete. Final dataset has {len(final_df)} records.")


# ==============================================================================
# 3. PREVIEW THE FINAL, COMPLETE DATASET
# ==============================================================================
print("\nPreview of the final, complete merged data:")
# Columns shown in the preview; head() limits the printout to the first rows.
preview_columns = [
    'date',
    'latitude',
    'longitude',
    'covertype',
    'fuel_moisture_class',
    'WD10M_dir',
    'wind_speed',
]
print(final_df[preview_columns].head())

Step 1: Loading all three data sources from GCS...
Loaded 7254509 fire records.
Loaded 7283432 wind direction records.

Downloading Wind Speed NetCDF files from GCS...
Downloaded Weather_data/vs_2003.nc to /tmp/vs_2003.nc
Downloaded Weather_data/vs_2004.nc to /tmp/vs_2004.nc
Downloaded Weather_data/vs_2005.nc to /tmp/vs_2005.nc
Downloaded Weather_data/vs_2006.nc to /tmp/vs_2006.nc
Downloaded Weather_data/vs_2007.nc to /tmp/vs_2007.nc
Downloaded Weather_data/vs_2008.nc to /tmp/vs_2008.nc
Downloaded Weather_data/vs_2009.nc to /tmp/vs_2009.nc
Downloaded Weather_data/vs_2010.nc to /tmp/vs_2010.nc
Downloaded Weather_data/vs_2011.nc to /tmp/vs_2011.nc
Downloaded Weather_data/vs_2012.nc to /tmp/vs_2012.nc
Downloaded Weather_data/vs_2013.nc to /tmp/vs_2013.nc
Downloaded Weather_data/vs_2014.nc to /tmp/vs_2014.nc
Downloaded Weather_data/vs_2015.nc to /tmp/vs_2015.nc

Loading downloaded NetCDF files into xarray...
Loaded and combined all wind speed NetCDF files.

Step 2: Merging data...
Performi