# Notebook for SOM Training

By: Ty Janoski

Updated 1/11/2026

## Setup

### Imports

In [32]:
# Import Statements
import glob
import os

import cartopy.crs as ccrs
import cartopy.feature as cfeature
import cmweather  # noqa: F401
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scienceplots  # noqa: F401
import xarray as xr
from minisom import MiniSom
from sklearn.manifold import MDS
from sklearn.metrics import pairwise_distances

# Stack the "science", "nature" and "grid" style sheets
# (presumably registered by the scienceplots import above — TODO confirm).
plt.style.use(["science", "nature", "grid"])
# Render figure text through LaTeX; titles later in the notebook escape
# characters such as & and % for this reason.
plt.rcParams["text.usetex"] = True


### Data Loading

In [3]:
# Directory holding the intermediate ERA5 daily files used for the SOM
path = "/mnt/drive2/SOM_intermediate_files/"

# Z500: three variants of the daily field (file names suggest raw, normalized,
# and normalized + weighted — TODO confirm against the preprocessing notebook)
Z500_daily, Z500_norm_daily, Z500_norm_weighted_daily = (
    xr.load_dataarray(fname)
    for fname in (
        f"{path}era5_Z500_daily.nc",
        f"{path}era5_Z500_norm_daily.nc",
        f"{path}era5_Z500_norm_weighted_daily.nc",
    )
)

# IVT: same three variants, stored as datasets from which the "ivt" variable is taken
IVT_daily, IVT_norm_daily, IVT_norm_weighted_daily = (
    xr.load_dataset(fname)["ivt"]
    for fname in (
        f"{path}era5_ivt_daily.nc",
        f"{path}era5_ivt_norm_daily.nc",
        f"{path}era5_ivt_norm_weighted_daily.nc",
    )
)


### Reshape Data

In [4]:
# Collapse each field's two horizontal dimensions into a single "features" axis.
# Note the coordinate names differ between the two products (lat/lon vs.
# latitude/longitude).
Z500_flat = Z500_norm_weighted_daily.stack(features=["lat", "lon"]).values
IVT_flat = IVT_norm_weighted_daily.stack(features=["latitude", "longitude"]).values

# Training matrix: Z500 features followed by IVT features for every time step,
# i.e. shape (time, space*2)
X = np.hstack((Z500_flat, IVT_flat))


In [None]:
# Read flash flood events and keep one row per episode.
# Rows whose EVENT_ID is not purely numeric are discarded before de-duplicating on
# EPISODE_ID (presumably non-data lines in the export — TODO confirm).
df = pd.read_csv("data/storm_data_search_results.csv")
df = df[df["EVENT_ID"].astype(str).str.isdigit()].drop_duplicates(
    subset=["EPISODE_ID"], keep="first"
)

# Parse begin datetime: combine date and zero-padded HHMM time, interpret it as
# US/Eastern local time, then convert to UTC. Unparseable, ambiguous (fall-back) and
# nonexistent (spring-forward) local times all become NaT rather than raising.
df["BEGIN_DATETIME"] = (
    pd.to_datetime(
        df["BEGIN_DATE"]
        + " "
        + df["BEGIN_TIME"].fillna(0).astype(int).astype(str).str.zfill(4),
        format="%m/%d/%Y %H%M",
        errors="coerce",
    )
    .dt.tz_localize("US/Eastern", ambiguous="NaT", nonexistent="NaT")
    .dt.tz_convert("UTC")
)

# Extract unique event days (timezone-naive for xarray compatibility).
# NaT values produced above are dropped first: NaT compares False against every
# timestamp, so leaving it in would make the sort order unreliable and would put a
# non-date entry in the list.
event_days = sorted(
    df["BEGIN_DATETIME"].dropna().dt.floor("D").dt.tz_localize(None).unique()
)


## SOM Training

We are going to train our SOM with random initialization and online training. We will also use two phases: a "coarse" phase with a larger sigma and learning rate, then a "fine" phase with a smaller learning rate and sigma.

### Set SOM parameters

In [15]:
# Lattice size: number of nodes along x and along y
xdim = 5
ydim = 4

# Training iterations for phase 1 (coarse) and phase 2 (fine)
n1 = 20000
n2 = 30000

# Starting neighbourhood widths: phase 1 scales with the lattice diagonal,
# phase 2 starts from a fixed value
sig1 = 0.6 * np.sqrt(xdim**2 + ydim**2)
sig2 = 2.0

# Starting learning rates for the two phases
lr1 = 0.3
lr2 = 0.1

# Seed passed to the SOM so runs are reproducible
random_seed = 42


### Train SOM

In [16]:
# Create the SOM instance configured for the coarse phase: it starts from sigma
# sig1 and learning rate lr1, with the decay schedules named in the options below.
som = MiniSom(
    xdim,
    ydim,
    input_len=X.shape[1],
    sigma=sig1,
    learning_rate=lr1,
    decay_function="linear_decay_to_zero",
    sigma_decay_function="linear_decay_to_one",
    neighborhood_function="gaussian",
    random_seed=random_seed,
)

# Initialize random weights
som.random_weights_init(X)

# Phase 1 (coarse): n1 training iterations, then print the topographic error
som.train_random(X, n1, verbose=True)
print(som.topographic_error(X))

# Phase 2 (fine): continue training the same map for n2 iterations, starting from
# the smaller sigma and learning rate.
# NOTE(review): this overwrites MiniSom's private attributes (_sigma,
# _learning_rate); confirm it still takes effect when upgrading the library.
som._sigma = sig2  # type: ignore
som._learning_rate = lr2
som.train_random(X, n2, verbose=True)
print(som.topographic_error(X))


 [ 20000 / 20000 ] 100% - 0:00:00 left 
 quantization error: 115.35613821769022
0.0001874062968515742
 [ 30000 / 30000 ] 100% - 0:00:00 left 
 quantization error: 114.37786864359009
0.0020614692653673165


### Grab important fields

In [17]:
# Number of nodes on the lattice
n_nodes = xdim * ydim

# Node weight vectors, one row per node; row index is i * ydim + j
weights = som.get_weights().reshape(n_nodes, -1)

# U-matrix, transposed so that rows follow the y-index and columns the x-index
u_matrix = som.distance_map().T

# Best-matching unit (i, j) of every training sample
bmus = np.array(list(map(som.winner, X)))

# Hit map: number of samples won by each node, transposed like the U-matrix
hit_map = np.zeros((xdim, ydim))
np.add.at(hit_map, (bmus[:, 0], bmus[:, 1]), 1)
hit_map = hit_map.T

# 2-D coordinates for the Sammon-style map: MDS on the pairwise distances
# between node weight vectors
D = pairwise_distances(weights)
coords = MDS(
    n_components=2,
    dissimilarity="precomputed",
    random_state=42,
    n_init=4,
).fit_transform(D)

# Grid coordinates and sizes, taken from the Z500 field
lat, lon = Z500_norm_weighted_daily.lat, Z500_norm_weighted_daily.lon
n_lat, n_lon = lat.size, lon.size
n_features = n_lat * n_lon

# The first n_features columns are the Z500 part of each weight vector, the rest IVT
z500_weights, ivt_weights = weights[:, :n_features], weights[:, n_features:]

# Back to (x-index, y-index, lat, lon) for mapping
z500_nodes = np.reshape(z500_weights, (xdim, ydim, n_lat, n_lon))
ivt_nodes = np.reshape(ivt_weights, (xdim, ydim, n_lat, n_lon))




## Plots

### U-matrix, Hit Map, and Sammon Map

In [18]:
# Side-by-side summary of the trained map: U-matrix (left) and hit map (right).
fig, axes = plt.subplots(1, 2, layout="constrained", figsize=(6, 3), dpi=600)

# u-matrix
im0 = axes[0].imshow(u_matrix, cmap="viridis", origin="lower")
axes[0].set_title("U-Matrix (Mean Inter-Node Distance)", fontsize=7)
fig.colorbar(im0, ax=axes[0], fraction=0.046, pad=0.04, shrink=0.7)

# hit map
im1 = axes[1].imshow(hit_map, cmap="plasma", origin="lower")
axes[1].set_title("Hit Map (Samples per Node)", fontsize=7)
fig.colorbar(im1, ax=axes[1], fraction=0.046, pad=0.04, shrink=0.7)

# axis styling: one tick per node index along each lattice direction
for ax in axes:
    ax.set_xticks(np.arange(xdim))
    ax.set_yticks(np.arange(ydim))
    ax.set_xlabel("X-index", fontsize=6)
    ax.set_ylabel("Y-index", fontsize=6)

# Create the output directory if needed so the cell also runs on a fresh checkout,
# then save and close this specific figure.
os.makedirs("figs/Z500-IVT-big-SOM", exist_ok=True)
fig.savefig("figs/Z500-IVT-big-SOM/Z500_som_u_matrix_hit_map.png")
plt.close(fig)

In [19]:
# U-matrix values and hit counts as flat per-node vectors (node = i * ydim + j);
# the transposes undo the ones applied when the maps were built
U_flat = u_matrix.T.ravel()
hits_flat = hit_map.T.ravel()

# Marker area grows with the node's share of the busiest node's hits
hits_scaled = hits_flat / hits_flat.max() * 250 + 30

plt.figure(figsize=(7, 7))

# Nodes: colour encodes the U-matrix value, size encodes the hit count
sc = plt.scatter(
    coords[:, 0],
    coords[:, 1],
    c=U_flat,
    s=hits_scaled,
    cmap="balance",
    edgecolor="k",
    linewidth=0.5,
    zorder=3,
)

# Lattice edges: link every node to its j+1 and i+1 neighbours, so each edge of the
# grid is drawn exactly once
for node in range(xdim * ydim):
    i, j = divmod(node, ydim)
    neighbours = []
    if j + 1 < ydim:
        neighbours.append(node + 1)
    if i + 1 < xdim:
        neighbours.append(node + ydim)
    for nbr in neighbours:
        plt.plot(
            coords[[node, nbr], 0],
            coords[[node, nbr], 1],
            "k-",
            lw=0.6,
            alpha=0.4,
        )

# Write the lattice index (i,j) on top of every node
for (ix, iy), (x, y) in zip(np.ndindex(xdim, ydim), coords):
    plt.text(x, y, f"({ix},{iy})", fontsize=8, ha="center", va="center", zorder=5)

plt.title("Sammon / MDS Distortion Grid\nU-Matrix (Color) \\& Node Frequency (Size)")
plt.axis("off")
plt.colorbar(sc, label="U-Matrix (Avg. Neighbor Distance)")
plt.savefig("figs/Z500-IVT-big-SOM/Z500_som_sammon_mds.png", bbox_inches="tight")
plt.close()


### Node Weights Map

In [34]:
# One map panel per SOM node; node (i, j) is drawn in column i, row j.
fig, axes = plt.subplots(
    ydim,
    xdim,
    figsize=(6, 3.7),
    subplot_kw={"projection": ccrs.PlateCarree()},
    constrained_layout=True,
    dpi=600,
)

# Shading levels for the Z500 part of the node weights
levels_Z = np.arange(-1.4, 1.41, 0.2)

# Contour levels for the IVT part of the node weights
levels_ivt = np.arange(-1.8, 1.81, 0.2)

for i in range(xdim):
    for j in range(ydim):
        ax = axes[j, i]

        # Fields for this node
        Z_field = z500_nodes[i, j, :, :]
        ivt_field = ivt_nodes[i, j, :, :]

        # --- Z500 shaded ---
        # extend="both" fills values beyond the outermost levels instead of leaving
        # them blank, matching the composite-anomaly figure.
        im = ax.contourf(
            lon,
            lat,
            Z_field,
            cmap="balance",
            levels=levels_Z,
            transform=ccrs.PlateCarree(),
            extend="both",
        )

        # --- IVT contours (black lines on top of the shading) ---
        ax.contour(
            lon,
            lat,
            ivt_field,
            colors="black",
            linewidths=0.5,
            levels=levels_ivt,
            transform=ccrs.PlateCarree(),
        )

        ax.add_feature(cfeature.COASTLINE, linewidth=0.5)
        ax.add_feature(cfeature.STATES.with_scale("50m"), linewidth=0.4)
        ax.set_title(f"Node ({i},{j})", fontsize=6)
        ax.set_xticks([])
        ax.set_yticks([])

# One shared colorbar, built from the last panel's shading (all panels share levels)
cbar = fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.6, pad=0.02)
cbar.set_ticks(levels_Z)
cbar.set_label("Standardized 500-hPa Anomaly", fontsize=6)

plt.suptitle("Node Weight Patterns", fontsize=8)
plt.savefig("figs/Z500-IVT-big-SOM/node_weights.png", bbox_inches="tight")
plt.close()

### Anomaly Composite Map

In [35]:
# Calendar days of the SOM training samples and the positions of those that are
# flash-flood event days
som_days = pd.to_datetime(Z500_norm_daily.time.values).tz_localize(None)
event_mask = np.isin(som_days.normalize(), pd.to_datetime(event_days))
event_indices = np.flatnonzero(event_mask)

# Per-node composites of the normalized fields; nodes with no days stay NaN
z500_patterns = np.full((xdim, ydim, n_lat, n_lon), np.nan)
ivt_patterns = np.full_like(z500_patterns, np.nan)

# Per-node number of event days and number of all days
counts = np.zeros((xdim, ydim), dtype=int)
totals = np.zeros_like(counts)

for i, j in np.ndindex(xdim, ydim):
    # Samples whose best-matching unit is node (i, j)
    idx_node = np.flatnonzero((bmus[:, 0] == i) & (bmus[:, 1] == j))
    totals[i, j] = idx_node.size

    # Subset of those samples that are flash-flood days
    idx_event = np.intersect1d(idx_node, event_indices)
    counts[i, j] = idx_event.size

    # Nothing to average for an empty node
    if idx_node.size == 0:
        continue

    # Composite over *all* days in the node (not only event days); the same
    # positional indices are applied to both fields
    z500_patterns[i, j] = Z500_norm_daily.isel(time=idx_node).mean("time").values
    ivt_patterns[i, j] = (
        IVT_norm_daily.isel(valid_time=idx_node).mean("valid_time").values
    )

# Fraction of each node's days that are event days; 0 where the node has no days
risk = np.divide(counts, totals, out=np.zeros((xdim, ydim)), where=totals > 0)


In [36]:
# Panel grid: node (i, j) is drawn in column i, row j
fig, axes = plt.subplots(
    nrows=ydim,
    ncols=xdim,
    figsize=(6, 3.7),
    subplot_kw=dict(projection=ccrs.PlateCarree()),
    constrained_layout=True,
    dpi=600,
)

# Contour levels for the Z500 anomaly overlay
levels_Z = np.arange(-2.0, 2.1, 0.25)

# Shading levels for the IVT anomaly (coarser spacing to avoid clutter)
levels_ivt = np.arange(-2.5, 2.6, 0.5)

for i, j in np.ndindex(xdim, ydim):
    ax = axes[j, i]

    # Composite anomaly fields of this node
    Z_field, ivt_field = z500_patterns[i, j, :, :], ivt_patterns[i, j, :, :]

    # IVT anomaly as filled contours
    im = ax.contourf(
        lon,
        lat,
        ivt_field,
        levels=levels_ivt,
        cmap="balance",
        extend="both",
        transform=ccrs.PlateCarree(),
    )

    # Z500 anomaly as black contour lines on top
    ax.contour(
        lon,
        lat,
        Z_field,
        levels=levels_Z,
        colors="black",
        linewidths=0.5,
        transform=ccrs.PlateCarree(),
    )

    ax.add_feature(cfeature.COASTLINE, linewidth=0.6)
    ax.add_feature(cfeature.STATES.with_scale("50m"), linewidth=0.4)

    # Title: node index, event days / all days, and the event-day percentage
    ax.set_title(
        f"({i},{j})  FFE={counts[i, j]}/{totals[i, j]}  ({100 * risk[i, j]:.1f}\\%)",
        fontsize=5,
    )

    ax.set_xticks([])
    ax.set_yticks([])

# Single colorbar for the IVT shading, shared by all panels
cbar = fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.6, pad=0.02)
cbar.set_label("Standardized Anomaly", fontsize=6)

plt.suptitle(
    "SOM Composite Anomalies: Z500 (contoured) + IVT (shaded)", fontsize=8, y=1.04
)
plt.savefig("figs/Z500-IVT-big-SOM/composite_anomalies.png", bbox_inches="tight")
plt.close()


### Composite Mean Map

In [37]:
# Per-node composites of the un-normalized fields; nodes with no days stay NaN
z500_patterns_raw = np.full((xdim, ydim, n_lat, n_lon), np.nan)
ivt_patterns_raw = np.full_like(z500_patterns_raw, np.nan)

# Per-node number of event days and number of all days
counts = np.zeros((xdim, ydim), dtype=int)
totals = np.zeros_like(counts)

for i, j in np.ndindex(xdim, ydim):
    # Samples whose best-matching unit is node (i, j)
    idx_node = np.flatnonzero((bmus[:, 0] == i) & (bmus[:, 1] == j))
    totals[i, j] = idx_node.size

    # Subset of those samples that are flash-flood days
    idx_event = np.intersect1d(idx_node, event_indices)
    counts[i, j] = idx_event.size

    # Nothing to average for an empty node
    if idx_node.size == 0:
        continue

    # Composite over *all* days in the node (not only event days)
    z500_patterns_raw[i, j] = Z500_daily.isel(time=idx_node).mean("time").values
    ivt_patterns_raw[i, j] = (
        IVT_daily.isel(valid_time=idx_node).mean("valid_time").values
    )

# Fraction of each node's days that are event days; 0 where the node has no days
risk = np.divide(counts, totals, out=np.zeros((xdim, ydim)), where=totals > 0)


In [38]:
# Panel grid: node (i, j) is drawn in column i, row j
fig, axes = plt.subplots(
    nrows=ydim,
    ncols=xdim,
    figsize=(6, 3.7),
    subplot_kw=dict(projection=ccrs.PlateCarree()),
    constrained_layout=True,
    dpi=600,
)

# Contour levels for Z500 (applied to the field after division by 98.1 below)
levels_Z = range(552, 595, 3)

# Shading levels for IVT
levels_ivt = np.arange(0, 701, 100)

for i, j in np.ndindex(xdim, ydim):
    ax = axes[j, i]

    # Composite mean fields of this node
    Z_field, ivt_field = z500_patterns_raw[i, j, :, :], ivt_patterns_raw[i, j, :, :]

    # IVT as filled contours
    im = ax.contourf(
        lon,
        lat,
        ivt_field,
        levels=levels_ivt,
        cmap="BuPu",
        extend="max",
        transform=ccrs.PlateCarree(),
    )

    # Z500 as black contour lines on top
    # NOTE(review): dividing by 98.1 presumably converts geopotential to
    # decameters — confirm the units of the input file.
    cn = ax.contour(
        lon,
        lat,
        Z_field / 98.1,
        levels=levels_Z,
        colors="black",
        linewidths=0.5,
        transform=ccrs.PlateCarree(),
    )

    ax.add_feature(cfeature.COASTLINE, linewidth=0.6)
    ax.add_feature(cfeature.STATES.with_scale("50m"), linewidth=0.4)

    # Title: node index, event days / all days, and the event-day percentage
    ax.set_title(
        f"({i},{j})  FFE={counts[i, j]}/{totals[i, j]}  ({100 * risk[i, j]:.1f}\\%)",
        fontsize=5,
    )

    ax.set_xticks([])
    ax.set_yticks([])

    # Inline labels on every Z500 contour
    ax.clabel(cn, cn.levels, fontsize=5)

# Single colorbar for the IVT shading, shared by all panels
cbar = fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.6, pad=0.02)
cbar.set_label("IVT (kg m$^{-1}$ s$^{-1}$)", fontsize=6)

plt.suptitle("SOM Composite: IVT (shaded) + Z500 (contoured)", fontsize=8, y=1.04)
plt.savefig("figs/Z500-IVT-big-SOM/composite_mean_IVT_shaded.png", bbox_inches="tight")
plt.close()


### Maps of Individual Nodes

In [39]:
# For every SOM node, draw one figure with a panel per flash-flood day assigned to
# that node (Z500 contours over IVT shading) and save it as node_<i>_<j>_FFE.png.

# Number of panel columns per figure and the projection shared by all panels
cols = 4
proj = ccrs.PlateCarree()

# Levels are defined here rather than inherited from an earlier plotting cell;
# several cells assign `levels_ivt` different values, so relying on the leftover
# global would make this figure depend on which cell ran last.
levels_Z_daily = range(546, 595, 3)
levels_ivt = np.arange(0, 701, 100)

# Output directory; created if missing so the cell runs on a fresh checkout
out_dir = "figs/Z500-IVT-big-SOM/indiv-nodes"
os.makedirs(out_dir, exist_ok=True)

# Clear out existing _FFE.png files so figures from nodes that no longer have
# flash-flood days do not linger from a previous run
ffe_files = glob.glob(f"{out_dir}/*_FFE.png")
for file in ffe_files:
    os.remove(file)
print(f"Removed {len(ffe_files)} existing _FFE.png files")

# Iterate through each node
for i in range(xdim):
    for j in range(ydim):
        # All days assigned to this node
        idx_node = np.where((bmus[:, 0] == i) & (bmus[:, 1] == j))[0]

        # Restrict to flash-flood days only
        idx = np.intersect1d(idx_node, event_indices)
        n = len(idx)

        # Skip nodes with no flash-flood days
        if n == 0:
            continue

        # Enough rows to hold n panels at `cols` panels per row
        rows = int(np.ceil(n / cols))

        fig, axes = plt.subplots(
            rows,
            cols,
            figsize=(3 * cols, 2.5 * rows),
            subplot_kw={"projection": proj},
            layout="constrained",
        )

        # Ensure axes is always a flat, iterable array
        axes = np.atleast_1d(axes).flatten()

        for k, ax in enumerate(axes):
            # Panels beyond the number of event days are left blank
            if k >= n:
                ax.axis("off")
                continue

            # The same positional index selects the day in both fields
            t = idx[k]
            z500_data = Z500_daily.isel(time=t)
            ivt_data = IVT_daily.isel(valid_time=t)

            # Z500 as black contour lines
            # NOTE(review): dividing by 98.1 presumably converts geopotential to
            # decameters — confirm the units of the input file.
            ax.contour(
                lon,
                lat,
                z500_data / 98.1,
                colors="black",
                linewidths=0.5,
                levels=levels_Z_daily,
                transform=proj,
            )

            # IVT as filled contours
            ax.contourf(
                lon,
                lat,
                ivt_data,
                cmap="BuPu",
                levels=levels_ivt,
                transform=proj,
                extend="max",
            )

            ax.add_feature(cfeature.COASTLINE, linewidth=0.5)
            ax.add_feature(cfeature.BORDERS, linewidth=0.3)
            ax.add_feature(cfeature.STATES, linewidth=0.2)

            # Panel title: calendar date of the plotted day
            ax.set_title(
                pd.to_datetime(z500_data.time.values).strftime("%Y-%m-%d"),
                fontsize=7,
            )

        fig.suptitle(
            f"Node ({i},{j})  Flash-Flood Days = {n}",
            fontsize=8,
            y=1.02,
        )

        # Save through the figure object and close it right away, since many
        # figures are created in this loop
        fig.savefig(
            f"{out_dir}/node_{i}_{j}_FFE.png",
            dpi=300,
            bbox_inches="tight",
        )
        plt.close(fig)


Removed 20 existing _FFE.png files
