# Notebook for SOM Preprocessing

By: Ty Janoski

Updated 12/9/2025

## Setup

### Imports

In [1]:
# Import Statements
import cartopy.crs as ccrs
import cmweather  # noqa: F401
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scienceplots  # noqa: F401
import xarray as xr

# Apply the "science", "nature" and "grid" style sheets (presumably registered by
# the scienceplots import above), then enable LaTeX text rendering.
plt.style.use(["science", "nature", "grid"])
plt.rcParams.update({"text.usetex": True})


### Data Read-In

#### CSV

In [6]:
# Load the flash flood event records from the storm-data search export
df = pd.read_csv(filepath_or_buffer="../data/storm_data_search_results.csv")


#### ERA5 Output

In [None]:
# Only run the following code if you need to reprocess the ERA5 data
# --- IGNORE ---

# This cell is intentionally disabled. When uncommented it combines the per-file
# hourly Z500 slices into one DataArray and writes the combined NetCDF file under
# /mnt/drive2/ERA5/NC_files/combined/.
# NOTE(review): `ProgressBar` is not imported in this notebook's import cell;
# presumably it needs `from dask.diagnostics import ProgressBar` -- confirm
# before re-enabling.

# # Read in hourly, sliced Z500 heights
# Z500 = xr.open_mfdataset(
#     "/mnt/drive2/ERA5/NC_files/hourly_sliced/era5_z500_*.nc",
#     combine="nested",
#     concat_dim="time",
#     decode_times=True,
#     chunks={"time": 8760},
# ).rename({"var129": "Z500"})

# # Sort by time
# Z500 = Z500.sortby("time").Z500.squeeze()

# # Save out
# NOTE(review): `.load()` below is called on the object returned by
# `rename("Z500")` and its result is not assigned, so `Z500` itself may still be
# lazy when `to_netcdf` runs -- confirm this is intended before re-running.
# with ProgressBar():
#     Z500.rename("Z500").load()

# Z500.to_netcdf("/mnt/drive2/ERA5/NC_files/combined/era5_Z500_hourly_warm_season_US.nc")

In [7]:
# Load the combined hourly ERA5 Z500 field from the pre-built NetCDF file
Z500 = xr.load_dataarray(
    filename_or_obj="/mnt/drive2/ERA5/NC_files/combined/era5_Z500_hourly_warm_season_US.nc",
    decode_timedelta=True,
)


## Data Processing

### Flash Flood Times

In [8]:
# Keep only rows whose EVENT_ID is purely numeric.
# The explicit .copy() makes `df` an independent frame, so the column assignments
# below do not write into a filtered slice of the original frame (the pattern
# that triggers pandas' SettingWithCopyWarning).
df = df[df["EVENT_ID"].astype(str).str.isdigit()].copy()

# Turn BEGIN_TIME and END_TIME into zero-padded 4-character strings.
# Missing times are filled with 0 and therefore become "0000".
df["BEGIN_TIME"] = df["BEGIN_TIME"].fillna(0).astype(int).astype(str).str.zfill(4)
df["END_TIME"] = df["END_TIME"].fillna(0).astype(int).astype(str).str.zfill(4)

# Combine each date with its time into a single "<date> <time>" string
begin_str = df["BEGIN_DATE"] + " " + df["BEGIN_TIME"]  # type: ignore
end_str = df["END_DATE"] + " " + df["END_TIME"]  # type: ignore

# Parse the combined strings; anything that does not match the format becomes
# NaT (errors="coerce") instead of raising.
df["BEGIN_DATETIME"] = pd.to_datetime(
    begin_str, format="%m/%d/%Y %H%M", errors="coerce"
)
df["END_DATETIME"] = pd.to_datetime(end_str, format="%m/%d/%Y %H%M", errors="coerce")

# Take only the first row (in file order) for each EPISODE_ID.
# NOTE(review): "first" is the first listed row, not necessarily the row with the
# earliest BEGIN_DATETIME -- confirm that is the intended episode start.
df_unique = df.drop_duplicates(subset=["EPISODE_ID"], keep="first").copy()

# Floor each episode's begin time to the start of its hour.
# Rows whose begin time failed to parse remain NaT in this list.
event_hours = df_unique["BEGIN_DATETIME"].dt.floor("h").tolist()  # type: ignore

### ERA5 Data

In [9]:
# Day-of-year climatology: build the day-of-year grouping of Z500 once and reuse
# it for the mean, the standard deviation, and the anomaly subtraction.
doy_groups = Z500.groupby("time.dayofyear")
mean_doy = doy_groups.mean(dim="time")
std_doy = doy_groups.std(dim="time")

# Smooth the day-of-year standard deviation with a centered 14-day running mean;
# a window needs at least 7 valid days to produce a value.
std_doy_sorted = std_doy.sortby("dayofyear")
std_doy_smooth = std_doy_sorted.rolling(
    dayofyear=14, center=True, min_periods=7
).mean()

# Standardized anomalies: subtract the day-of-year mean, then divide by the
# smoothed day-of-year standard deviation.
Z500_anoms = doy_groups - mean_doy
Z500_norm = Z500_anoms.groupby("time.dayofyear") / std_doy_smooth

# Latitude weighting: scale every grid point by sqrt(cos(latitude))
lat_radians = np.deg2rad(Z500_norm.lat)
weights = np.sqrt(np.cos(lat_radians))
Z500_norm_weighted = Z500_norm * weights

In [43]:
# Example figure: the first time step of the raw field, its standardized
# anomalies, and the latitude-weighted standardized anomalies, side by side.
fig, axs = plt.subplots(
    1, 3, figsize=(6.5, 3.5), subplot_kw={"projection": ccrs.PlateCarree()}, dpi=600
)

# One entry per panel: the field to draw, its coordinates, title, colormap, and
# contour levels.
configs = [
    {
        # Divide by 9.81 and by 10 so the panel is labelled in "dam" below.
        # NOTE(review): assumes Z500 holds geopotential in m^2 s^-2 -- confirm.
        "data": Z500.isel(time=0) / 10 / 9.81,
        "lon": Z500.lon,
        "lat": Z500.lat,
        "title": "Z$_{500}$",
        "cmap": "viridis",
        "levels": np.arange(522, 595, 6),
    },
    {
        "data": Z500_norm.isel(time=0),
        "lon": Z500_norm.lon,
        "lat": Z500_norm.lat,
        "title": "Z$_{500}$ Standardized Anomalies",
        # "balance" is not a built-in matplotlib colormap; presumably it is
        # registered by the cmweather import -- confirm.
        "cmap": "balance",
        "levels": np.arange(-3, 3.1, 0.5),
    },
    {
        "data": Z500_norm_weighted.isel(time=0),
        "lon": Z500_norm_weighted.lon,
        "lat": Z500_norm_weighted.lat,
        "title": "Z$_{500}$ Standardized Anomalies Weighted",
        "cmap": "balance",
        "levels": np.arange(-3, 3.1, 0.5),
    },
]

# Create plots and colorbars
for i, config in enumerate(configs):
    c = axs[i].contourf(
        config["lon"],
        config["lat"],
        config["data"],
        cmap=config["cmap"],
        levels=config["levels"],
        # Without `extend`, values outside the level range are left unfilled and
        # show up as blank holes; "both" shades them with the end colors instead.
        extend="both",
    )
    axs[i].set_title(config["title"], fontsize=8)
    axs[i].coastlines(resolution="50m", linewidth=0.5)

    cb = fig.colorbar(c, ax=axs[i], orientation="horizontal", pad=0.03)
    if i == 0:
        # Only the raw-field panel carries a unit label
        cb.set_label("dam", fontsize=6)
    cb.set_ticks(c.levels)
    cb.ax.tick_params(labelsize=6, rotation=45)

fig.tight_layout()
# The output directory must already exist; savefig does not create it.
fig.savefig(
    "figs/Z500-SOM/Z500_standardized_anomalies_example.png",
    dpi=600,
    bbox_inches="tight",
)
# Close the figure so it is not kept open (or rendered inline) after saving
plt.close(fig)


### Intersecting Times

In [41]:
# Extract data for event hours.
# The event times are treated as NYC local wall-clock time ("US/Eastern", which
# observes daylight saving) and converted to timezone-naive UTC so they can be
# matched against the ERA5 time index (presumably naive UTC -- confirm).
# NOTE(review): if the source records times in local *standard* time all year,
# a fixed UTC-5 offset is needed instead of "US/Eastern" -- confirm against the
# data source's documentation.
# Wall-clock times inside a DST transition (the skipped spring hour or the
# repeated fall hour) cannot be mapped to a single UTC instant. They are turned
# into NaT instead of raising; NaT never matches an ERA5 timestamp, so those
# events are dropped by the intersection below.
times_local = pd.to_datetime(event_hours).tz_localize(
    "US/Eastern", ambiguous="NaT", nonexistent="NaT"
)
# tz_convert(None) converts to UTC and strips the timezone in a single step
times_utc = times_local.tz_convert(None)

# Keep only the event hours that are present on the ERA5 time axis
intersect = pd.Index(times_utc).intersection(Z500_norm_weighted.indexes["time"])
Z500_norm_weighted_ffe = Z500_norm_weighted.sel(time=intersect)
Z500_norm_ffe = Z500_norm.sel(time=intersect)
Z500_ffe = Z500.sel(time=intersect)

## Save out data

In [10]:
# Write the processed fields to disk.
# The event-hour subsets are currently disabled (commented out); only the full
# weighted, standardized anomaly field is written.
# Z500_norm_weighted_ffe.to_netcdf(
#     "/mnt/drive2/SOM_intermediate_files/era5_Z500_norm_weighted_ffe.nc"
# )
# Z500_norm_ffe.to_netcdf("/mnt/drive2/SOM_intermediate_files/era5_Z500_norm_ffe.nc")
# Z500_ffe.to_netcdf("/mnt/drive2/SOM_intermediate_files/era5_Z500_ffe.nc")

Z500_norm_weighted.to_netcdf(
    "/mnt/drive2/SOM_intermediate_files/era5_Z500_norm_weighted.nc"
)
