In [1]:
%load_ext autoreload
%autoreload 2

# Standard imports
from pathlib import Path
import sys

# Library imports
import numpy as np
import pandas as pd
import xarray as xr

# Util imports
sys.path.append("../../")

# Overlay station data to grids

### Input parameters

In [2]:
# --- Run parameters -------------------------------------------------------
# Station/city to process, the variables to carry over, and the width (in
# degrees) of the box cut out of the gridded data around the station.
CITY_NAME = "Dagupan"
VARS = ["precip", "tmax", "tmin"]
STATION_RESOLUTION_DEGREES = 0.25

# --- Directory layout -----------------------------------------------------
RAW_PATH = Path("../../data/01-raw")
PROCESSED_PATH = Path("../../data/02-processed")
CORRECTED_PATH = PROCESSED_PATH.joinpath("bias-correction")
# Outputs of this notebook are written here, so create the folder up front.
CORRECTED_PATH.mkdir(parents=True, exist_ok=True)

# --- Input files ----------------------------------------------------------
DOMAINS_GEOJSON = RAW_PATH.joinpath("domains/downscaling_domains_fixed.geojson")
STATION_LOCATION_CSV = RAW_PATH.joinpath(
    "station_data/PAGASA_station_locations.csv"
)
STATION_DATA_CSV = PROCESSED_PATH.joinpath("station_data.csv")
GRIDDED_NC = PROCESSED_PATH.joinpath(
    f"input/chirts_chirps_regridded_interpolated_{CITY_NAME.lower()}.nc"
)

# --- Output files (one pair per city) ---------------------------------------
STATION_NC = CORRECTED_PATH.joinpath(f"station_{CITY_NAME.lower()}.nc")
GRIDDED_SUBSET_NC = CORRECTED_PATH.joinpath(f"gridded_{CITY_NAME.lower()}.nc")

## Station data

### Load station location

In [3]:
# Read the station location table and select the row(s) for the target city.
station_locations_df = pd.read_csv(STATION_LOCATION_CSV)
is_target_station = station_locations_df["station_name"] == CITY_NAME

# These stay as Series because the Dataset cell further down passes them
# directly as the lat/lon coordinate values.
station_lats = station_locations_df.loc[is_target_station, "lat"]
station_lons = station_locations_df.loc[is_target_station, "lon"]

# `.item()` requires exactly one element; check explicitly so a missing or
# duplicated station name produces a message that names the station and file
# (same exception type, ValueError, as `.item()` would raise).
n_matches = int(is_target_station.sum())
if n_matches != 1:
    raise ValueError(
        f"Expected exactly one row with station_name == {CITY_NAME!r} in "
        f"{STATION_LOCATION_CSV}, found {n_matches}."
    )

# Scalar coordinates of the station, used to cut the box out of the grid.
station_lat = station_lats.item()
station_lon = station_lons.item()

### Load station data

In [4]:
# Load the records of all stations, then keep only the target city's rows.
stations_df = pd.read_csv(STATION_DATA_CSV)
is_city_row = stations_df["station"] == CITY_NAME

# Clean-up steps, applied in sequence:
#   1. drop fully identical rows
#   2. turn every -999 into NaN (presumably a missing-value sentinel; confirm)
#   3. rename "rainfall" to "precip" so the column matches the names in VARS
#   4. order by date and renumber the index from zero
station_df = stations_df.loc[is_city_row].drop_duplicates()
station_df = station_df.replace(-999, np.nan)
station_df = station_df.rename(columns={"rainfall": "precip"})
station_df = station_df.sort_values("date").reset_index(drop=True)
station_df.head()

Unnamed: 0,station,date,precip,tmax,tmin,tmean,rh,wind_speed,wind_direction
0,Dagupan,2007-01-01,0.0,30.0,20.0,25.0,84.0,2.0,180.0
1,Dagupan,2007-01-02,0.0,31.2,21.5,26.4,82.0,2.0,140.0
2,Dagupan,2007-01-03,0.0,31.8,23.0,27.4,83.0,2.0,180.0
3,Dagupan,2007-01-04,0.2,31.0,21.6,26.3,88.0,2.0,340.0
4,Dagupan,2007-01-05,0.0,31.0,22.5,26.8,86.0,3.0,340.0


### Arrange as a Dataset

In [5]:
# Every variable in VARS is reshaped into a (time, lat, lon) array whose lat
# and lon axes both have length one, i.e. a single-cell "grid" at the station.
cube_dims = ["time", "lat", "lon"]
cube_shape = (len(station_df["date"]), 1, 1)

station_ds = xr.Dataset(
    data_vars={
        name: (cube_dims, station_df[name].to_numpy().reshape(cube_shape))
        for name in VARS
    },
    coords={
        # The date column is converted to a DatetimeIndex for the time axis.
        "time": ("time", pd.DatetimeIndex(station_df["date"])),
        "lon": ("lon", station_lons),
        "lat": ("lat", station_lats),
    },
    attrs={"description": "Station data"},
)
station_ds

In [6]:
# Persist the station dataset to STATION_NC using the scipy backend.
station_ds.to_netcdf(path=STATION_NC, engine="scipy")

## Gridded data

### Load gridded data

In [7]:
# Open the gridded file with the scipy backend, then keep only band 1.
gridded_all_bands_ds = xr.open_dataset(GRIDDED_NC, engine="scipy")
gridded_ds = gridded_all_bands_ds.sel({"band": 1})
gridded_ds

In [8]:
# Half the station resolution on each side gives a box of one full
# STATION_RESOLUTION_DEGREES centred on the station.
station_buffer = STATION_RESOLUTION_DEGREES / 2
lat_lower = station_lat - station_buffer
lat_upper = station_lat + station_buffer
lon_lower = station_lon - station_buffer
lon_upper = station_lon + station_buffer

# Inclusive bounds on both axes; cells outside the box are dropped.
within_lat = (gridded_ds.lat >= lat_lower) & (gridded_ds.lat <= lat_upper)
within_lon = (gridded_ds.lon >= lon_lower) & (gridded_ds.lon <= lon_upper)
gridded_subset_ds = gridded_ds.where(within_lat & within_lon, drop=True)
gridded_subset_ds

In [9]:
# Persist the station-sized subset of the grid to GRIDDED_SUBSET_NC.
gridded_subset_ds.to_netcdf(path=GRIDDED_SUBSET_NC, engine="scipy")