In [None]:
import pandas as pd
import numpy as np
import hvplot.pandas

# Comparing Time Series with Different Time Steps

In [None]:
# Prefix prepended to every data path ("" keeps the paths relative to the
# current working directory).
ROOT = ""

obs_raw = pd.read_parquet(ROOT + "data/obs/abur_rad.parquet")
sim_raw = pd.read_parquet(ROOT + "data/models/seareport-v2.2/abur.parquet")

# Keep only the first column of each frame, as a named Series.
# Both series are sorted by timestamp (previously only `sim` was): the
# label-based slicing and `pd.merge_asof` used below rely on a monotonically
# increasing index.
obs = obs_raw.iloc[:, 0].rename("obs").sort_index()
sim = sim_raw.iloc[:, 0].rename("sim").sort_index()

Let's test the different approaches on one week of data, which is more than enough to illustrate the differences.

In [None]:
YEAR = 2022

# One label slice shared by both series, so the model and the observations
# are cut to exactly the same period.
test_week = slice(f"{YEAR}-09-13", f"{YEAR}-09-20")
sim_subset = sim.loc[test_week]
obs_subset = obs.loc[test_week]

In [None]:
# Distribution of the time steps (difference between consecutive timestamps)
# of both series. A notebook cell only displays its last expression, so the
# two counts are gathered in a single table instead of two bare expressions
# (the first of which would be silently discarded).
# A count of 0 means that time step never occurs in that series.
pd.DataFrame(
    {
        "sim": sim.sort_index().index.diff().dropna().value_counts(),
        "obs": obs.sort_index().index.diff().dropna().value_counts(),
    }
).fillna(0).astype(int)

In [None]:
# Overlay the observed and the modelled series on a single figure.
raw_overlay = obs_subset.hvplot() * sim_subset.hvplot()
raw_overlay.opts(width=1300, height=800)

## 1. Nearest-Neighbor Alignment

**Method**: For each model timestamp, find the closest observation timestamp within a defined tolerance window.

**Advantages**:
 * No interpolation (no fictional data)
 * Preserves actual observation values

**Cons**:
 * No control over which observation is selected, which can result in:
   * missed peaks
   * or even a missed trend (with a noisy signal, the aligned series looks chaotic)

In [None]:
# Maximum time distance allowed between a model timestamp and the observation
# matched to it; tune it to the sampling interval of the observations.
ALIGN_TOLERANCE = pd.Timedelta("2min")

# `merge_asof` keeps every model timestamp (left side) and attaches the
# nearest observation within the tolerance; model timestamps without any
# observation close enough get NaN.
aligned_data = pd.merge_asof(
    sim_subset,
    obs_subset,
    left_index=True,
    right_index=True,
    tolerance=ALIGN_TOLERANCE,
    direction="nearest",
).rename(columns={"obs": "obs_aligned"})

# Time-step distribution of the aligned data. Printed explicitly: only the
# last expression of a cell is displayed, so a bare expression here would be
# silently discarded.
print(aligned_data.sort_index().index.diff().dropna().value_counts())

(obs_subset.hvplot() * aligned_data.hvplot()).opts(width=1300, height=800)

## 2. Window-Based Aggregation

**Method**: Use the model timestamps as reference points and aggregate observations within a window

**Advantages**:
 * We can choose the aggregation statistic (e.g. max or mean)

**Cons**:
 * We drop some maxima (those falling outside of the averaging window)
 * We end up losing information, because data points are dropped

In [None]:
def aggregate(sim, obs, window_size='5min'):
    """Aggregate observations in a centred time window around each model timestamp.

    Parameters
    ----------
    sim : pd.Series
        Model time series; its index provides the reference timestamps.
    obs : pd.Series
        Observed time series.
    window_size : str or pd.Timedelta, default '5min'
        Total width of the rolling window centred on each timestamp.

    Returns
    -------
    pd.DataFrame
        One row per model timestamp that has at least one non-NaN observation
        in its window, with columns ``sim``, ``obs_mean``, ``obs_max`` and
        ``obs_count``.
    """
    # Unique, sorted union of both time axes: windows can then be centred on
    # model timestamps even when no observation falls exactly on them.
    all_times = sim.index.union(obs.index).unique().sort_values()

    # Observations laid out on the combined axis; timestamps that only exist
    # in the model are NaN and therefore ignored by the statistics below.
    full_obs = obs.reindex(all_times)

    # Build the rolling window once and reuse it for every statistic.
    windows = full_obs.rolling(window=pd.Timedelta(window_size), center=True)
    rolling_stats = pd.DataFrame({
        'obs_mean': windows.mean(),
        'obs_max': windows.max(),
        'obs_count': windows.count(),
    })

    # Left join: keep the model timestamps only, then drop those whose window
    # contains no observation at all.
    result = sim.to_frame('sim').join(rolling_stats)
    return result[result['obs_count'] > 0].copy()

# Windowed statistics of the observations around each model timestamp
# (default window size).
df_ = aggregate(sim=sim_subset, obs=obs_subset)

In [None]:
# Observations, model, and the windowed maximum of the observations together.
windowed_max = df_["obs_max"]
(obs_subset.hvplot() * sim_subset.hvplot() * windowed_max.hvplot()).opts(width=1300, height=800)

## 3. Interpolating the model on the observed TS index

**Advantages**:
 * No observation data is dropped

**Cons**:

 * We create fictional data (for the model TS)
 * May be computationally heavy if the observed signal has a high sampling rate

In [None]:
def sim_on_obs(sim, obs):
    """Interpolate the model series onto the timestamps of the observations.

    Parameters
    ----------
    sim : pd.Series
        Model time series.
    obs : pd.Series
        Observed time series.

    Returns
    -------
    tuple of pd.Series
        ``(sim, obs)`` sharing the same index: the timestamps at which ``obs``
        is not NaN. The returned series are named ``"sim"`` and ``"obs"``.

    Notes
    -----
    With a ``DatetimeIndex`` the interpolation is time-weighted
    (``method="time"``). ``method="linear"`` ignores the index and treats rows
    as equally spaced, which distorts the result on the irregular axis
    produced by merging two different time steps. It is only kept as a
    fallback for other index types.
    Observation timestamps outside the model's time range receive the nearest
    model value (``limit_direction="both"``).
    """
    # Name the inputs explicitly so the columns below exist whatever the
    # callers' Series are called (rename returns new objects; the inputs are
    # left untouched).
    merged = pd.merge(
        sim.rename("sim"),
        obs.rename("obs"),
        left_index=True,
        right_index=True,
        how="outer",
    )

    method = "time" if isinstance(merged.index, pd.DatetimeIndex) else "linear"
    merged["sim"] = merged["sim"].interpolate(method=method, limit_direction="both")

    # Keep only the rows that carry an observation.
    merged = merged.dropna(subset=["obs"])

    return merged["sim"], merged["obs"]

# Model interpolated onto the observation timestamps of the test week.
sim_, obs_ = sim_on_obs(sim=sim_subset, obs=obs_subset)

In [None]:
# Sanity check: the observations returned by `sim_on_obs` must equal the
# original ones (`==` requires both Series to carry identical labels).
obs_unchanged = obs_ == obs_subset
obs_unchanged.all()

In [None]:
# Observations together with the model before and after interpolation.
layers = [
    obs_.hvplot(),
    sim_.hvplot(label='model: interpolated'),
    sim_subset.hvplot(label='model: original'),
]
(layers[0] * layers[1] * layers[2]).opts(width=1300, height=800)