# Notebook for Precipitation Data Preprocessing

By: Ty Janoski

Updated 12/11/2025

## Setup

### Imports

In [1]:
# Import Statements
import cartopy.crs as ccrs
import cmweather  # noqa: F401
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scienceplots  # noqa: F401
import xarray as xr
from dask.diagnostics.progress import ProgressBar  # noqa: F401

# Plot styling: apply the three style sheets in the listed order (presumably
# registered by the scienceplots import above), then route all figure text
# through LaTeX, which requires a working LaTeX installation.
plt.style.use(["science", "nature", "grid"])
plt.rcParams.update({"text.usetex": True})


### Data Read-In

#### CSV

In [2]:
# Load the flash flood event records (storm data search results) into a DataFrame
storm_events_csv = "data/storm_data_search_results.csv"
df = pd.read_csv(storm_events_csv)


#### ERA5 Output

In [14]:
# Open every hourly, sliced ERA5 total-precipitation (tp) file as a single
# dataset, stitched together by coordinate values.
# 8760 = 24 * 365, i.e. roughly one year of hourly steps per dask chunk.
era5_tp_glob = "/mnt/drive2/ERA5/NC_files/hourly_sliced/era5_tp_*.nc"
tp = xr.open_mfdataset(
    era5_tp_glob,
    chunks={"valid_time": 8760},
    combine="by_coords",
    decode_times=True,
)

# Pull the whole dataset into memory (nothing is written to disk here);
# the progress bar reports on the dask computation while it runs.
with ProgressBar():
    tp = tp.load()

[########################################] | 100% Completed | 40.85 s


## Data Processing

### Flash Flood Times

In [15]:
# Keep only genuine event records: rows whose EVENT_ID consists solely of digits.
# The explicit .copy() yields an independent frame, so the column assignments
# below never write into a filtered slice of the raw table (this avoids
# pandas' SettingWithCopyWarning / chained-assignment ambiguity).
df = df.loc[df["EVENT_ID"].astype(str).str.isdigit()].copy()

# Normalize BEGIN_TIME and END_TIME to zero-padded 4-character "HHMM" strings.
# Missing times are filled with 0, so they end up as "0000" (midnight).
for time_col in ("BEGIN_TIME", "END_TIME"):
    df[time_col] = df[time_col].fillna(0).astype(int).astype(str).str.zfill(4)

# Combine DATE and TIME into a single datetime string
begin_str = df["BEGIN_DATE"] + " " + df["BEGIN_TIME"]  # type: ignore
end_str = df["END_DATE"] + " " + df["END_TIME"]  # type: ignore

# Parse the strings into pandas datetimes; anything that does not match the
# format is coerced to NaT instead of raising.
datetime_format = "%m/%d/%Y %H%M"
df["BEGIN_DATETIME"] = pd.to_datetime(
    begin_str, format=datetime_format, errors="coerce"
)
df["END_DATETIME"] = pd.to_datetime(
    end_str, format=datetime_format, errors="coerce"
)

# Take only the first row for each EPISODE_ID
df_unique = df.drop_duplicates(subset=["EPISODE_ID"], keep="first").copy()

# List of event start times, each rounded up to the next whole hour
# (times already on the hour are unchanged; NaT entries stay NaT)
event_hours = df_unique["BEGIN_DATETIME"].dt.ceil("h").tolist()  # type: ignore

### Intersecting Times

In [16]:
# Select the ERA5 precipitation fields at the flash-flood event hours.
# The event hours are treated as NYC local wall-clock times and converted to
# timezone-naive UTC so they can be compared with tp's `valid_time` index.
# NOTE(review): this assumes the CSV times follow US/Eastern *including* DST.
# If the source records times in standard time year-round, a fixed UTC-5
# offset is needed instead -- confirm against the data documentation.
event_index = pd.to_datetime(event_hours)

# Wall-clock hours touched by a DST switch cannot be localized with the
# default settings (tz_localize raises). Handle them explicitly:
#   * an hour skipped by spring-forward is moved to the next valid instant;
#   * an hour repeated by fall-back is ambiguous and becomes NaT.
times_local = event_index.tz_localize(
    "US/Eastern", ambiguous="NaT", nonexistent="shift_forward"
)

# Report hours lost to the ambiguity so they are never dropped silently
n_ambiguous = int(times_local.isna().sum()) - int(event_index.isna().sum())
if n_ambiguous > 0:
    print(
        f"Warning: {n_ambiguous} event hour(s) fall in the repeated DST "
        "fall-back hour and were set to NaT"
    )

# Convert to UTC, then strip the timezone to get naive UTC timestamps
times_utc = times_local.tz_convert("UTC").tz_localize(None)

# Keep only the event hours that are also present on the ERA5 time axis
intersect = times_utc.intersection(tp.indexes["valid_time"])
tp_ffe = tp.tp.sel(valid_time=intersect)

## Save Out Data

In [17]:
# Write the event-hour precipitation subset to a NetCDF file
ffe_output_path = "/mnt/drive2/SOM_intermediate_files/era5_tp_ffe.nc"
tp_ffe.to_netcdf(ffe_output_path)
