In [None]:
import pandas as pd
import numpy as np
import hvplot.pandas

# Comparing Time Series with Different Time Steps

In [None]:
# Prefix prepended to every data path built in load_ts; the empty string keeps
# the paths relative to the current working directory.
ROOT = ""
# Station and sensor identifiers used to build the parquet file names.
# The trailing comments hold the alternative pair used later in this notebook.
station = "abur" #"abed"
sensor =  "rad" #"bub"

def load_ts(station, sensor):
    """Load the simulated and observed time series of one station.

    Reads ``data/obs/{station}_{sensor}.parquet`` (observations) and
    ``data/models/seareport-v2.2/{station}.parquet`` (model) below ``ROOT``
    and keeps the first column of each file.

    Parameters
    ----------
    station : str
        Station identifier used in both file names.
    sensor : str
        Sensor identifier used in the observation file name.

    Returns
    -------
    tuple of pandas.Series
        ``(sim, obs)``, named ``"sim"`` and ``"obs"``, both sorted by index.
    """
    obs_frame = pd.read_parquet(ROOT + f"data/obs/{station}_{sensor}.parquet")
    sim_frame = pd.read_parquet(ROOT + f"data/models/seareport-v2.2/{station}.parquet")
    # Both series are sorted (the original only sorted `sim`): label slicing
    # with .loc and pd.merge_asof both require a monotonic index.
    obs = obs_frame.iloc[:, 0].rename("obs").sort_index()
    sim = sim_frame.iloc[:, 0].rename("sim").sort_index()
    return sim, obs
# Load the model and observation series for the station/sensor pair set above.
sim, obs = load_ts(station, sensor)

Let's test the different approaches on one week of data, which is more than enough.

In [None]:
# Restrict both series to the same window (13 to 20 September of YEAR) so the
# alignment methods can be compared on a small sample.
YEAR = 2022
sim_subset = sim.loc[f"{YEAR}-09-13":f"{YEAR}-09-20"]
obs_subset = obs.loc[f"{YEAR}-09-13":f"{YEAR}-09-20"]

In [None]:
# Distribution of the time steps of the model and of the observations.
# Only the last expression of a cell is rendered, so both tables are displayed
# explicitly (the first one was silently discarded before).
display(sim.sort_index().index.diff().dropna().value_counts())
display(obs.sort_index().index.diff().dropna().value_counts())

In [None]:
# Overlay the raw observation and model series without any alignment.
(obs_subset.hvplot() * sim_subset.hvplot()).opts(
    width=1300,
    height=800,
    title=f"simple comparison: model vs. observation, station: '{station}'",
)

## 1. Nearest-Neighbor Alignment

**Method**: For each model timestamp, find the closest observation timestamp within a defined tolerance window.

**Advantages**:
 * No interpolation (no fictional data)
 * Preserves actual observation values

**Cons**:
 * No control over the aligned signal, which can result in:
   * missing peaks
   * or even missing the trend (if the signal is noisy, the aligned series looks chaotic)

In [None]:
# For every model timestamp, attach the closest observation if one lies within
# the tolerance; model timestamps without a close enough observation get NaN.
aligned_data = pd.merge_asof(
    sim_subset,
    obs_subset,
    left_index=True,
    right_index=True,
    tolerance=pd.Timedelta("2min"),  # Set appropriate tolerance
    direction="nearest",
).rename(columns={"obs": "obs_aligned"})
# Time-step distribution of the aligned frame. It is displayed explicitly
# because it is not the last expression of the cell (it was discarded before).
display(aligned_data.sort_index().index.diff().dropna().value_counts())
(obs_subset.hvplot() * aligned_data.hvplot()).opts(
    width=1300,
    height=800,
    title=f"Method 1. Nearest-Neighbor Alignment, station: '{station}'",
)

Not interesting enough to be considered.

## 2. Window-Based Aggregation

**Method**: Use the model timestamps as reference points and aggregate observations within a window

**Advantages**:
 * we can choose between max or mean

**Cons**:
 * We drop some maxima (those outside of the averaging window) if the window is not adapted
 * We end up losing information, because data points are dropped

In [None]:
def aggregate(sim, obs, window_size='7min'):
    """Aggregate observations in a centred time window around each model timestamp.

    The observations are placed on the union of both time indexes, centred
    rolling statistics are computed on that combined index, and the statistics
    are then read back at the model timestamps.

    Parameters
    ----------
    sim : pandas.Series
        Model series with a DatetimeIndex.
    obs : pandas.Series
        Observed series with a DatetimeIndex. When a timestamp occurs more
        than once, only its first value is used.
    window_size : str or pandas.Timedelta, default '7min'
        Width of the centred window, anything accepted by ``pd.Timedelta``.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``sim``, with columns ``sim``, ``obs_mean``, ``obs_max``
        and ``obs_count``. Rows whose window contains no observation are dropped.
    """
    # One value per observation timestamp, otherwise reindex() cannot align.
    obs = obs[~obs.index.duplicated(keep="first")]

    # Vectorised union of both indexes (no Python-level set of Timestamps).
    # union() may skip sorting when both indexes are equal, and time-based
    # rolling needs a monotonic index, hence the explicit sort.
    all_times = sim.index.union(obs.index).unique().sort_values()

    # Observations on the combined index, NaN at timestamps that are model-only.
    full_obs = obs.reindex(all_times)

    roller = full_obs.rolling(window=pd.Timedelta(window_size), center=True)
    rolling_stats = pd.DataFrame({
        'obs_mean': roller.mean(),
        'obs_max': roller.max(),
        'obs_count': roller.count(),
    })

    # Left join keeps the model timestamps only.
    result = pd.DataFrame({'sim': sim}).join(rolling_stats)
    return result[result['obs_count'] > 0].copy()

# Window-based aggregation on the subset with the default 7-minute window.
df1 = aggregate(sim_subset, obs_subset)
df1

In [None]:
# Same aggregation with a narrower 2-minute window, for comparison with df1.
df2 = aggregate(sim_subset, obs_subset, window_size="2min")
df2

In [None]:
# Overlay the raw series with the windowed observation maxima of both window sizes.
(
    obs_subset.hvplot()
    * sim_subset.hvplot()
    * df1["obs_max"].hvplot(label="obs_max window: 7min")
    * df2["obs_max"].hvplot(label="obs_max window: 2min")
).opts(
    width=1300,
    height=800,
    title=f"Method 2. Window-Based Aggregation, station: '{station}'",
)

## 3. Interpolating the model on the observed TS index

Advantages: 
 * No observation data is dropped

Cons: 

 * We create fictional data (for the model TS)
 * May be computationally heavy if the observed signal has a high sampling rate

In [None]:
def sim_on_obs(sim, obs, verbose=True):
    """Interpolate the model series onto the observation timestamps.

    Both series are merged on the union of their timestamps, the model column
    is filled by cubic interpolation (this pandas method requires scipy), and
    only the rows that carry an observation are kept.

    Parameters
    ----------
    sim : pandas.Series
        Model series with a DatetimeIndex.
    obs : pandas.Series
        Observed series with a DatetimeIndex.
    verbose : bool, default True
        Print the first 30 rows of the merged frame before interpolating.

    Returns
    -------
    pandas.DataFrame
        Columns ``sim`` (interpolated model) and ``obs``, indexed by the
        timestamps at which an observation is available.
    """
    # Deduplicate on TIMESTAMPS, not on values: Series.drop_duplicates() would
    # throw away every reading whose value already occurred earlier in the
    # series, i.e. legitimate data points.
    sim = sim[~sim.index.duplicated(keep='first')].rename("sim")
    obs = obs[~obs.index.duplicated(keep='first')].rename("obs")

    df = pd.merge(sim, obs, left_index=True, right_index=True, how='outer').sort_index()
    if verbose:
        print("merged df:")
        print(df.iloc[:30])

    # Non-linear methods such as "cubic" use the index values as x-coordinates,
    # so the interpolation is done in time rather than by row position.
    df['sim'] = df['sim'].interpolate(method="cubic", limit_direction="both")
    df = df.dropna(subset=['obs'])

    return df

# Interpolate the model subset onto the observation timestamps and show the result.
df = sim_on_obs(sim_subset, obs_subset)
print("final df: ")
df

In [None]:
sim_, obs_ = df["sim"], df["obs"]
# Sanity check: the observations must come out of sim_on_obs untouched.
# Series.equals compares index and values and returns False on a mismatch,
# whereas `==` raises ValueError when the two series are not identically labelled.
obs_.equals(obs_subset)

In [None]:
# Overlay the observations, the interpolated model and the original model.
(
    obs_.hvplot()
    * sim_.hvplot(label='model: interpolated')
    * sim_subset.hvplot(label='model: original')
).opts(
    width=1300,
    height=800,
    title=f"Method 3. Interpolating model on observation TS, station: '{station}'",
)

Testing now with a station that has a lower sampling rate than the simulation:

In [None]:
# Switch to the second station/sensor pair and reload both series.
station = "abed"
sensor = "bub"

sim, obs = load_ts(station, sensor)
# Time-step distributions of model and observations. They are displayed
# explicitly because neither is the last expression of the cell.
display(sim.sort_index().index.diff().dropna().value_counts())
display(obs.sort_index().index.diff().dropna().value_counts())

# Comparison window for this station.
YEAR = 2024
sim_subset = sim.loc[f"{YEAR}-01-22":f"{YEAR}-02-01"]
obs_subset = obs.loc[f"{YEAR}-01-22":f"{YEAR}-02-01"]

In [None]:
# Window-based aggregation (default 7-minute window) for the second station.
df = aggregate(sim_subset, obs_subset)
# The table is displayed explicitly: a bare `df` in the middle of a cell is not rendered.
display(df)
(
    obs_subset.hvplot()
    * sim_subset.hvplot()
    * df.obs_max.hvplot(label="obs_max window: 7min")
).opts(
    width=1300,
    height=800,
    title=f"Method 2. Window-Based Aggregation, station: '{station}'",
)

In [None]:
# Interpolate the model onto the observation timestamps for the second station.
df = sim_on_obs(sim_subset, obs_subset)
sim_ = df["sim"]
obs_ = df["obs"]
(
    obs_.hvplot()
    * sim_.hvplot(label='model: interpolated')
    * sim_subset.hvplot(label='model: original')
).opts(
    width=1300,
    height=800,
    title=f"Method 3. Interpolating model on observation TS, station: '{station}'",
)

Compare stats for the whole series

In [None]:
# Back to the first station/sensor pair; the full series (no subset) is used below.
station = "abur"
sensor = "rad"

sim, obs = load_ts(station, sensor)

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
import seastats
# Method 3: model interpolated onto the observation timestamps.
df = sim_on_obs(sim, obs)
# NOTE(review): the meaning of `quantile` is defined by seastats and is not
# visible from this notebook -- confirm against the seastats documentation.
stats = seastats.get_stats(df["sim"], df["obs"], quantile = 0.)
ext3 = seastats.storms.match_extremes(df["sim"], df["obs"], quantile = 0.99)
m3 = pd.DataFrame(stats, index = ["method 3"])

# Method 2: observation maxima in a centred window around each model timestamp.
# `df` and `stats` are rebound here; the method 3 results live on in ext3 and m3.
df = aggregate(sim, obs)
stats = seastats.get_stats(df["sim"], df["obs_max"], quantile = 0.)
ext2 = seastats.storms.match_extremes(df["sim"], df["obs_max"], quantile = 0.99)
m2 = pd.DataFrame(stats, index = ["method 2"])
# One row of statistics per method.
pd.concat([m2,m3])

In [None]:
ext2.head(5)  # first rows of the matched extremes, method 2

In [None]:
ext3.head(5)  # first rows of the matched extremes, method 3

In [None]:
# Second station/sensor pair; the full series (no subset) is used below.
station = "abed"
sensor = "bub"

sim, obs = load_ts(station, sensor)

In [None]:
# NOTE(review): seastats is already imported by an earlier cell, and this cell
# repeats that cell's code for another station -- a shared helper function
# would remove the duplication.
import seastats
# Method 3: model interpolated onto the observation timestamps.
df = sim_on_obs(sim, obs)
# NOTE(review): the meaning of `quantile` is defined by seastats and is not
# visible from this notebook -- confirm against the seastats documentation.
stats = seastats.get_stats(df["sim"], df["obs"], quantile = 0.)
ext3 = seastats.storms.match_extremes(df["sim"], df["obs"], quantile = 0.99)
m3 = pd.DataFrame(stats, index = ["method 3"])

# Method 2: observation maxima in a centred window around each model timestamp.
# `df` and `stats` are rebound here; the method 3 results live on in ext3 and m3.
df = aggregate(sim, obs)
stats = seastats.get_stats(df["sim"], df["obs_max"], quantile = 0.)
ext2 = seastats.storms.match_extremes(df["sim"], df["obs_max"], quantile = 0.99)
m2 = pd.DataFrame(stats, index = ["method 2"])
# One row of statistics per method.
pd.concat([m2,m3])