In [1]:
import xarray as xr
import pandas as pd
import numpy as np
import glob
import os

In [2]:
# --- Configuration ---------------------------------------------------------
# Directory that is globbed for the input "surf_data_*.nc" files.
DATA_DIR = "./"

# CSV file the merged point time series is written to.
# NOTE: "bouy" is the spelling used in the file name; changing it here would
# change the output path.
OUTPUT_CSV = "arugambay_virtual_bouy_data.csv"

# Virtual buoy location (deep water point): latitude, longitude handed to the
# nearest-grid-point lookup.
TARGET_LAT, TARGET_LON = 7.0, 82.0

In [3]:
def main():
    """Build the virtual-buoy time series and write it to ``OUTPUT_CSV``.

    Every ``surf_data_*.nc`` file found in ``DATA_DIR`` is opened, reduced to
    the grid point nearest to (``TARGET_LAT``, ``TARGET_LON``) and converted to
    a DataFrame. The per-file frames are concatenated, ordered by time,
    de-duplicated on the time stamp, gap-filled by linear interpolation and
    saved as CSV. Progress and a short summary are printed.

    A file that cannot be read or processed is reported and skipped; it does
    not abort the run.

    Returns:
        None.
    """
    print(f" Historical Data FOR Virtual Buoy Point ({TARGET_LAT}, {TARGET_LON})...")

    files = sorted(glob.glob(os.path.join(DATA_DIR, "surf_data_*.nc")))
    if not files:
        print(" No .nc files found!")
        return

    all_dfs = []
    for f in files:
        print(f"   Processing {os.path.basename(f)}...")
        try:
            all_dfs.append(_load_point_frame(f))
        except Exception as e:
            # Deliberate best effort: one bad file must not lose the others.
            print(f" Error reading {f}: {e}")

    if not all_dfs:
        print("Failed to build history.")
        return

    # Stable sort so that, for repeated time stamps, the row from the
    # earliest-listed file is deterministically the one that survives.
    final_df = (
        pd.concat(all_dfs, ignore_index=True)
        .sort_values('time', kind='stable')
        .drop_duplicates(subset=['time'])
        .reset_index(drop=True)
    )

    # Fill remaining NaNs in the numeric columns only (the time column is left
    # alone). No `limit` is set, so gaps of any length, including leading and
    # trailing ones, are filled.
    # NOTE(review): if 'mdts' is a direction in degrees, linear interpolation
    # across the 0/360 wrap is not meaningful -- confirm before relying on it.
    numeric_cols = final_df.select_dtypes(include='number').columns
    final_df[numeric_cols] = final_df[numeric_cols].interpolate(
        method='linear', limit_direction='both'
    )

    final_df.to_csv(OUTPUT_CSV, index=False)
    print(f" Saved {len(final_df)} rows to {OUTPUT_CSV}")
    print(f" Variables: {list(final_df.columns)}")
    print(f" Range: {final_df['time'].min()} -> {final_df['time'].max()}")


def _load_point_frame(path):
    """Return the nearest-grid-point time series of one NetCDF file.

    Args:
        path: Path of the NetCDF file to open.

    Returns:
        DataFrame with a ``time`` column followed by whichever of
        ``u10, v10, msl, shts, mpts, mdts`` the file contains.

    Raises:
        ValueError: If the file exposes neither ``valid_time`` nor ``time``.
    """
    value_cols = ['u10', 'v10', 'msl', 'shts', 'mpts', 'mdts']

    # The context manager closes the file even when a later step raises.
    with xr.open_dataset(path) as ds:
        # Select the point first so the expver merge touches one grid cell
        # instead of the whole field; the two operations commute.
        point_ds = ds.sel(latitude=TARGET_LAT, longitude=TARGET_LON, method='nearest')
        point_ds = _collapse_expver(point_ds)
        df = point_ds.to_dataframe().reset_index()

    # Accept either name for the time axis so the later sort on 'time' cannot
    # fail with a KeyError.
    time_col = next((c for c in ('valid_time', 'time') if c in df.columns), None)
    if time_col is None:
        raise ValueError("no 'valid_time' or 'time' column found")

    keep = [time_col] + [c for c in value_cols if c in df.columns]
    return df[keep].rename(columns={time_col: 'time'})


def _collapse_expver(ds):
    """Merge the entries of an ``expver`` dimension into a single timeline.

    Only acts when ``expver`` is a real dimension; when it is absent or just an
    auxiliary coordinate there is nothing to merge and ``ds`` is returned
    unchanged.
    """
    if 'expver' not in ds.dims:
        return ds
    try:
        # Combine consolidated (1) and preliminary (5), preferring 1.
        return ds.sel(expver=1).combine_first(ds.sel(expver=5))
    except (KeyError, ValueError):
        # Labels are not the integers 1 / 5: fold the entries in stored order,
        # earlier entries taking priority where both hold a value.
        merged = ds.isel(expver=0)
        for i in range(1, ds.sizes['expver']):
            merged = merged.combine_first(ds.isel(expver=i))
        return merged

# Run the extraction when this cell/script is executed directly; importing the
# code as a module does not trigger it.
if __name__ == "__main__":
    main()

 Historical Data FOR Virtual Buoy Point (7.0, 82.0)...
   Processing surf_data_2020.nc...
   Processing surf_data_2021.nc...
   Processing surf_data_2022.nc...
   Processing surf_data_2023.nc...
   Processing surf_data_2024.nc...
   Processing surf_data_2025.nc...
 Saved 8591 rows to arugambay_virtual_bouy_data.csv
 Variables: ['time', 'u10', 'v10', 'msl', 'shts', 'mpts', 'mdts']
 Range: 2020-01-01 00:00:00 -> 2025-11-17 12:00:00
