# Tidal Analysis Comparison: UTide vs PyTides

In this notebook, we'll look at ways of separating: 
 * the **storm surge** (the meteorologically-driven component) 
 * from the **astronomical tide** (the predictable gravitational component). 

## Study Location

We've chosen **Roscoff, France** as our test case: it has some of the largest tidal ranges in Europe (up to 9 meters).
The data come from the IOC database (station `rosc`) and are extracted using [`searvey`](https://github.com/seareport/seareport_models).

## The data and tools compared: 

We'll evaluate the following libraries for the (de)tide analysis:

1. **[`UTide`](https://github.com/wesleybowman/UTide)** - Python version of the MATLAB software
2. **[`pytides2`](https://github.com/sahitono/pytides)** - a fork of the official repository that works with recent versions of Python
3. **FES2022**. We'll also compare results against the **FES 2022** global tidal model to see how our calculated coefficients stack up against this state-of-the-art reference.

## Setting Up Our Analysis Toolkit

First, let's import the libraries we'll need. Each serves a specific purpose in our tidal detective work:

In [None]:
import searvey
import shapely
import utide
from pytides2.tide import Tide
import pandas as pd
import hvplot.pandas
import ioc_cleanup as C

We define the `FES_CONSTITUENTS` - these are the tidal components included in the FES 2022 model, representing the most important tidal harmonics globally.

We have simply reordered the constituents according to their frequency and importance.

In [None]:
# Keyword arguments forwarded to `utide.solve` by the helpers in this notebook.
UTIDE_OPTS = {
    "constit": "auto", 
    "method": "ols", 
    "order_constit": "frequency",
    "Rayleigh_min": 0.97,  # High threshold for constituent resolution
    "verbose": True
}

# Constituent names used as the common reference list when comparing methods
# (described in this notebook as the components of the FES 2022 model).
# NOTE: the identifier is spelled `FES_CONSITUENTS` everywhere it is used below.
FES_CONSITUENTS = [
    "M2", "S2", "N2", "K2", "2N2", "L2", "T2", "R2", "NU2", "MU2", "EPS2", "LAMBDA2", # Semi-diurnal (twice daily)
    "K1", "O1", "P1", "Q1", "J1", "S1", # Diurnal (once daily)
    "MF", "MM", "MSF", "SA", "SSA", "MSQM", "MTM", # Long period (fortnightly to annual)
    "M4", "MS4", "M6", "MN4", "N4", "S4", "M8", "M3", "MKS2" # Short period (higher harmonics)
]

functions

In [None]:
def data_availability(series: pd.Series, freq="60min") -> float:
    """Return the fraction of `freq`-wide bins that contain valid data.

    The series is averaged into bins of width `freq`; a bin whose mean is
    NaN counts as missing. The result is 1 minus the share of missing bins.
    """
    binned = series.resample(freq).mean()
    n_missing = binned.isna().sum()
    return float(1 - n_missing / len(binned))


def utide_get_coefs(ts: pd.Series, lat: float, resample: int = None)-> dict: 
    """Fit tidal constituents to `ts` with `utide.solve`.

    Parameters
    ----------
    ts : time-indexed series of water levels.
    lat : latitude passed to UTide as the `lat` option.
    resample : optional bin width in minutes. When given, the series is
        averaged into bins of that width and the bin labels are shifted by
        half a bin so that they sit at the bin centres.

    Returns
    -------
    The object returned by `utide.solve`.
    """
    if resample is not None:
        ts = ts.resample(f"{resample}min").mean()
        ts = ts.shift(freq=f"{resample / 2}min")  # Center the resampled points
    # Merge the latitude into a per-call copy: writing it into the shared
    # UTIDE_OPTS dict would leak one station's latitude into later calls.
    opts = {**UTIDE_OPTS, "lat": lat}
    return utide.solve(ts.index, ts, **opts)


def utide_surge(ts: pd.Series, lat: float, resample: int = None)-> pd.Series: 
    """Return `ts` minus the UTide tidal reconstruction at the original timestamps.

    The constituents are fitted by `utide_get_coefs` (optionally on the
    resampled series); the tide is then reconstructed on the index of the
    untouched input and subtracted from its values.
    """
    observed = ts.copy()
    timestamps = observed.index
    constituents = utide_get_coefs(ts, lat, resample)
    prediction = utide.reconstruct(timestamps, constituents, verbose=UTIDE_OPTS["verbose"])
    residual = observed.values - prediction.h
    return pd.Series(data=residual, index=timestamps)


def pytide_get_coefs(ts: pd.Series, resample: int = None) -> dict:
    """Fit tidal constituents to `ts` with `Tide.decompose`.

    When `resample` (minutes) is given, the series is first averaged into
    bins of that width and the labels are shifted by half a bin to centre
    them. NaN samples are dropped before the fit. The first element of the
    value returned by `Tide.decompose` is returned.
    """
    if resample is not None:
        bin_width = f"{resample}min"
        half_bin = f"{resample / 2}min"  # offset that centres the bin labels
        ts = ts.resample(bin_width).mean().shift(freq=half_bin)
    valid = ts.dropna()
    sample_times = valid.index.to_pydatetime()
    decomposition = Tide.decompose(valid, sample_times)
    return decomposition[0]


def pytides_surge(ts: pd.Series, resample: int = None)-> pd.Series:
    """Return `ts` minus the pytides tidal prediction at the original timestamps.

    The model is fitted by `pytide_get_coefs`; prediction times are built
    with `Tide._times` from the first timestamp and the elapsed hours of
    every sample, and the predicted tide is subtracted from the input values.
    """
    observed = ts.copy()
    fitted = pytide_get_coefs(ts, resample)
    stamps = ts.index
    origin = stamps.to_pydatetime()[0]
    elapsed_hours = (stamps - stamps[0]).total_seconds()/3600
    prediction_times = Tide._times(origin, elapsed_hours)
    residual = observed.values - fitted.at(prediction_times)
    return pd.Series(residual, index=observed.index)


def reduce_coef_to_fes(df: pd.DataFrame, constituents=None):
    """Project a coefficient table onto a fixed list of constituents.

    Parameters
    ----------
    df : table indexed by constituent name, with any set of columns.
    constituents : optional iterable of constituent names defining the rows
        (and their order) of the result. Defaults to `FES_CONSITUENTS`.

    Returns
    -------
    A DataFrame with one row per requested constituent and the columns of
    `df`. Rows for constituents present in `df` are copied from it; rows for
    constituents absent from `df` are filled with 0.0. Constituents of `df`
    that are not requested are dropped.
    """
    if constituents is None:
        constituents = FES_CONSITUENTS
    constituents = list(constituents)

    res = pd.DataFrame(0.0, index=constituents, columns=df.columns)
    common_constituents = df.index.intersection(constituents)
    res.loc[common_constituents] = df.loc[common_constituents]
    return res

study site

In [None]:
# Station code passed to the IOC helpers, and the sensor name used to pick
# the data column and the matching transformation file.
station = "rosc"
sensor = "rad"

get 25 years of data

In [None]:
# Fetch the station record; the two extra arguments are presumably the start
# and end of the requested period (2000-01-01 up to now) — confirm against searvey.
raw = searvey.fetch_ioc_station( 
    station, 
    "2000-01-01", 
    pd.Timestamp.now()
)
# Last expression of the cell: summary statistics of the raw record.
raw.describe()

Station Metadata

In [None]:
# Look up the metadata row of the selected station in the IOC station table
ioc_df = searvey.get_ioc_stations()
station_info = ioc_df[ioc_df.ioc_code == station]
_lat = station_info.lat.values[0]  # latitude reused later by the UTide helpers

summary_lines = [
    f"Station: {station_info['location'].values[0]}",
    f"Latitude: {_lat:.4f}°N",
    f"Longitude: {station_info['lon'].values[0]:.4f}°E",
    f"Country: {station_info['country'].values[0]}",
    "\nFull station details:",
]
for line in summary_lines:
    print(line)
station_info

let's clean the data, using [ioc_cleanup](https://github.com/seareport/ioc_cleanup/tree/ioc_2024)

In [None]:
import os
import requests

# Create the target folder from Python so the cell does not depend on a shell.
os.makedirs("transformations", exist_ok=True)

response = requests.get(
    f'https://raw.githubusercontent.com/seareport/ioc_cleanup/refs/heads/ioc_2024/transformations/{station}_{sensor}.json',
    timeout=30,  # do not hang forever on a stalled connection
)
# Fail loudly on 404/5xx instead of saving the error page as the JSON file.
response.raise_for_status()
with open(f'transformations/{station}_{sensor}.json', 'wb') as f:
    f.write(response.content)

Here's a snapshot of the cleaning transformation file:

In [None]:
# Shell escape: print the first 20 lines of the transformation file for this station/sensor.
! head -20 "transformations/{station}_{sensor}.json" 

Now let's apply these transformations to clean our data:

In [None]:
# Load and apply quality control transformations
trans = C.load_transformation(station, sensor)
cleaned_data = C.transform(raw, trans)
# Keep only the column named after the selected sensor.
ts = cleaned_data[sensor]

# Report how the cleaning changed the record: size, availability (share of
# 60-minute bins with data, see data_availability) and time span.
print(f"Data cleaning complete!")
print(f"Original data points: {len(raw)}")
print(f"Cleaned data points: {len(ts)}")
print(f"Data availability: {data_availability(ts):.1%}")
print(f"Time range raw: {raw.index.min()} to {raw.index.max()}")
print(f"Time range clean: {ts.index.min()} to {ts.index.max()}")

## Round 1: UTide Analysis

In [None]:
# Fit the constituents with UTide on the cleaned series averaged into 20-minute bins.
out = utide_get_coefs(ts, _lat, resample=20)
print(f"Found {len(out['name'])} tidal constituents")

Let's organize the UTide results into a clean DataFrame:

In [None]:
def utide_to_df(utide_coef: utide.utilities.Bunch) -> pd.DataFrame:
    """Tabulate a UTide solution with one row per constituent.

    The result is indexed by the `name` field and has the columns
    `amplitude` (field `A`), `phase` (field `g`), `amplitude_CI`
    (field `A_ci`) and `phase_CI` (field `g_ci`).
    """
    column_sources = {
        "amplitude": "A",
        "phase": "g",
        "amplitude_CI": "A_ci",
        "phase_CI": "g_ci",
    }
    columns = {label: utide_coef[field] for label, field in column_sources.items()}
    return pd.DataFrame(columns, index=utide_coef["name"])

# Rank the UTide constituents by amplitude, largest first, and show the first 20.
print("Top 20 tidal constituents by amplitude (UTide):")
print(utide_to_df(out).sort_values('amplitude', ascending=False).head(20))

## Round 2: PyTides Analysis

In [None]:
# Fit the constituents with pytides on the same series, also using 20-minute bins.
out_pytides = pytide_get_coefs(ts, 20)
print(f"Found {len(out_pytides.model['constituent'])} tidal constituents")

Let's organize the PyTides results:

In [None]:
def pytides_to_df(pytides_tide: Tide)-> pd.DataFrame:
    """Tabulate a pytides model with one row per constituent.

    Rows are labelled with the upper-cased `name` of each entry of the
    model's `constituent` field; that field itself is removed from the
    returned columns.
    """
    model = pytides_tide.model
    labels = [entry.name.upper() for entry in model['constituent']]
    table = pd.DataFrame(model, index=labels)
    return table.drop('constituent', axis=1)

# Rank the pytides constituents by amplitude, largest first, and show the first 20.
print("Top 20 tidal constituents by amplitude (PyTides):")
print(pytides_to_df(out_pytides).sort_values('amplitude', ascending=False).head(20))

## Comparison
To fairly compare UTide and pytides results, we'll standardize them against the FES 2022 constituent list. This will show us:

1. Which constituents each method found
2. Which constituents are missing from each analysis
3. How the amplitudes compare for common constituents

In [None]:
# Project each method's coefficient table onto the shared FES constituent list.
pytides_reduced_coef = reduce_coef_to_fes(pytides_to_df(out_pytides))
# NOTE(review): this expression is not the last one in the cell, so a notebook
# presumably will not render it — confirm whether a display() was intended.
pytides_reduced_coef.head(10)

utide_reduced_coef = reduce_coef_to_fes(utide_to_df(out))
utide_reduced_coef.head(10)

### visual comparison

**What to look for:**
- **Major constituents** (M2, S2, N2, K1, O1) should have similar amplitudes
- **Minor constituents** may show more variation between methods
- **Missing constituents** appear as zero amplitude in one method but not the other

In [None]:
def concat_utide_pytides(pytides_df, utide_df):
    """Stack both coefficient tables under a (constituent, method) index.

    The two frames are concatenated with the keys "pytides" and "utide",
    the index levels are swapped so the constituent comes first, and the
    rows are reordered to follow `FES_CONSITUENTS` in reverse order,
    restricted to the constituents actually present.
    """
    stacked = pd.concat({"pytides": pytides_df, "utide": utide_df})
    stacked.index.names = ['method', 'constituent']
    stacked = stacked.swaplevel().sort_index()

    present = stacked.index.get_level_values('constituent').unique()
    ordering = [name for name in reversed(FES_CONSITUENTS) if name in present]
    return stacked.reindex(ordering, level='constituent')

# Combine the two reduced tables; the last expression displays the result.
multi_df_ordered = concat_utide_pytides(pytides_reduced_coef, utide_reduced_coef)
multi_df_ordered

In [None]:
# Build the comparison plot

def plot_comparative_amplitudes(df):
    """Draw a horizontal bar chart of the `amplitude` column, grouped by method.

    `df` is expected to carry a `method` index level (used for the grouping).
    The plot title embeds the module-level `station` value at call time.
    """
    bar_options = dict(
        ylabel="Tidal Constituent",
        xlabel="Amplitude (meters)",
        by="method",
        grid=True,
        title=f"Tidal Amplitudes: UTide vs PyTide, station {station}",
        legend='top_right',
        rot=90,
    )
    layout_options = dict(
        height=1000,
        width=1000,
        fontsize={'title': 15, 'labels': 12, 'xticks': 8, 'yticks': 8},
    )
    return df.amplitude.hvplot.barh(**bar_options).opts(**layout_options)

plot_comparative_amplitudes(multi_df_ordered)

### Quantitative comparison

We'll assess the residual sum of squares (RSS) over all the constituents, taking pytides as the reference: 

RSS is given by: [1]

$$
\operatorname{RSS} = \sum_{i=1}^{n} \left(A_{pytides,i} - A_{utide,i}\right)^2
$$


In [None]:
def compute_rss(df): 
    """Return the sum of squared amplitude differences between pytides and utide.

    `df` must carry a `method` index level with the entries "pytides" and
    "utide" and an `amplitude` column. Only constituents present for both
    methods enter the sum.
    """
    reference = df.xs('pytides', level='method')['amplitude']
    candidate = df.xs('utide', level='method')['amplitude']
    # Keep only the constituents that both methods share before differencing
    reference, candidate = reference.align(candidate, join='inner')
    squared_diff = (reference - candidate) ** 2
    return squared_diff.sum()

# RSS between the pytides and utide amplitudes for the current station.
print(f"rss for {station} is {compute_rss(multi_df_ordered):.3f}")

We'll iterate through an existing folder containing clean data at tide gauge locations. 

In [None]:
DATA_FOLDER = "data"
UTIDE_OPTS["verbose"] = False  # keep the batch output readable
import glob
import os 


# Compute the pytides/utide amplitude RSS for every "<station>_<sensor>.parquet"
# file found in DATA_FOLDER. `res` maps station code -> metadata + RSS.
res = {}
for path in glob.glob(os.path.join(DATA_FOLDER, "*parquet")):
    file, ext = os.path.splitext(os.path.basename(path))
    # Everything that can fail for a single file lives inside the try so that
    # one malformed name, unknown station or failed fit does not abort the batch.
    try:
        station, sensor = file.split("_")
        ts = pd.read_parquet(path)
        ts = ts[ts.columns[0]]  # the first column holds the series to analyse
        station_row = ioc_df[ioc_df.ioc_code == station]
        _lon = station_row.lon.values[0]
        _lat = station_row.lat.values[0]
        ut = utide_get_coefs(ts, _lat, resample=20)
        utide_reduced_coef = reduce_coef_to_fes(utide_to_df(ut))
        pt = pytide_get_coefs(ts, 20)
        pytides_reduced_coef = reduce_coef_to_fes(pytides_to_df(pt))
        multi_df_ordered = concat_utide_pytides(pytides_reduced_coef, utide_reduced_coef)
        rss = compute_rss(multi_df_ordered)
        res[station] = {
            "ioc_code": station,
            "lat": _lat,
            "lon": _lon,
            "rss": rss
        }
        print(f"rss for {station} is {rss:.4f}")
    except Exception as e:
        # Best-effort batch: report which file failed and why, then move on.
        print(f"couldn't process {file}: {e!r}")

In [None]:
# One row per station. `from_dict(orient="index")` infers a dtype per column,
# whereas transposing a mixed-type frame turns every column (including
# lat/lon) into object dtype.
rss_df = pd.DataFrame.from_dict(res, orient="index")

rss_df.rss = rss_df.rss.astype(float)
# Map of the stations, coloured by the RSS between the two methods.
rss_df.hvplot.points(
    x="lon",
    y="lat",
    c="rss",
    hover_cols = ['ioc_code'],
    s=100,
    geo = True,
    tiles = True
).opts(width = 1000, height = 800, title = "RSS difference between UTide and pytides constituents")

plot tidal amplitude from station:

In [None]:
# Repeat the amplitude comparison for a single station read from the local data folder.
station= "delf"
sensor = "flt"
ioc_df = searvey.get_ioc_stations()

rsp = 20  # resampling bin width in minutes, shared by both methods

_lat = ioc_df[ioc_df.ioc_code == station].lat.values[0]
ts = pd.read_parquet(f"data/{station}_{sensor}.parquet")[sensor]
out_pytides = pytide_get_coefs(ts, rsp)
pytides_reduced_coef = reduce_coef_to_fes(pytides_to_df(out_pytides))
out_utides = utide_get_coefs(ts, _lat, resample=rsp)
utide_reduced_coef = reduce_coef_to_fes(utide_to_df(out_utides))
multi_df_ordered = concat_utide_pytides(pytides_reduced_coef, utide_reduced_coef)
# Report the RSS explicitly: only the last expression of a cell is rendered,
# so a bare compute_rss(...) call here would be computed and then discarded.
print(f"rss for {station} is {compute_rss(multi_df_ordered):.3f}")
plot_comparative_amplitudes(multi_df_ordered)

### comparison between tidal residuals

If both methods are working correctly, the tidal-residual time series (corresponding to the meteorological component) should be very similar. 

Significant differences would indicate problems with `utide`, `pytides` or both approaches.

In [None]:
print("Calculating storm surge using both methods...")
rsp = 30  # bin width in minutes used for the constituent fit of both methods
surge_pytides = pytides_surge(ts, resample=rsp)
surge_utide = utide_surge(ts, _lat, resample=rsp)

# Agreement between the two residual series: correlation and
# root-mean-square difference.
correlation = surge_pytides.corr(surge_utide)
rmse = ((surge_pytides - surge_utide)**2).mean()**0.5

print(f"--------\n📊 Storm Surge Comparison Results:")
print(f"Correlation coefficient: {correlation:.4f}")
print(f"RMSE between methods: {rmse:.3f} meters")

In [None]:
# Overlay the hourly means of both residual series on a single plot.
(surge_pytides.resample("1h").mean().hvplot(label="surge pytides", grid=True)
 *surge_utide.resample("1h").mean().hvplot(label="surge utide")
 ).opts(
    width=1200,
    height = 500
)

## Second part: chunked detiding (to be continued)

In [None]:
def surge_chunked(ts: pd.Series, lat: float, resample: int = None, max_days: int = 365) -> pd.Series:
    """Detide `ts` with UTide, fitting the constituents chunk by chunk.

    The series is walked from start to end in chunks of `max_days` days. A
    chunk is analysed only when it holds at least 365 * 0.9 days of data
    (chunk span times the share of 60-minute bins with data); otherwise it
    is extended in steps of 180 days until it does, or until the end of the
    series is reached.

    Parameters
    ----------
    ts : time-indexed series of water levels.
    lat : latitude passed to UTide as the `lat` option.
    resample : optional bin width in minutes used for the fit. The bin
        labels are shifted by half a bin to centre them.
    max_days : initial chunk length in days.

    Returns
    -------
    A series on the original index of `ts` holding the observed values minus
    the reconstructed tide. Samples outside any analysed chunk, or in a
    chunk where UTide raised, stay NaN.
    """
    ts0 = ts.copy()
    if resample is not None:
        ts = ts.resample(f"{resample}min").mean()
        ts = ts.shift(freq=f"{resample / 2}min")

    OPTS = {
        "constit": "auto",
        "method": "ols",
        "order_constit": "frequency",
        "Rayleigh_min": 0.97,
        "lat": lat,
        "verbose": True
    }

    # Minimum number of days that must actually hold data in a chunk.
    min_days_with_data = 365 * 0.9
    one_day = pd.Timedelta(days=1)

    detided = pd.Series(index=ts0.index, dtype='float64')

    t_start = ts.index.min()
    t_end = ts.index.max()
    chunk_start = t_start
    chunk_size = pd.Timedelta(days = max_days)

    while chunk_start < t_end:
        current_chunk_size = chunk_size

        while True:
            chunk_end = min(chunk_start + current_chunk_size, t_end)

            # NOTE: label slicing is inclusive on both ends, so a boundary
            # sample belongs to two consecutive chunks (the later one wins).
            chunk = ts[chunk_start:chunk_end]
            avail = data_availability(chunk, freq="60min")
            # Measure coverage on the real span: when the chunk is clipped at
            # t_end, the requested size would overstate the data it holds.
            span_days = (chunk_end - chunk_start) / one_day
            if span_days * avail >= min_days_with_data:
                print(f"Detiding chunk {chunk_start.date()} to {chunk_end.date()} ({avail*100:.1f}% available)")
                try:
                    coef = utide.solve(
                        chunk.index,
                        chunk,
                        **OPTS
                    )
                    recon_index = ts0.loc[chunk_start:chunk_end].index
                    tidal = utide.reconstruct(recon_index, coef, verbose=OPTS["verbose"])
                    detided.loc[chunk_start:chunk_end] = ts0.loc[chunk_start:chunk_end].values - tidal.h
                except Exception as e:
                    # Best effort: leave this chunk as NaN and carry on.
                    print(f"UTide failed on chunk {chunk_start} to {chunk_end}: {e}")
                break

            # The chunk already reaches the end of the record: nothing left to add.
            if chunk_end >= t_end:
                print("End of time series reached with insufficient data.")
                break

            print(f"Data availability {avail*100:.1f}% from {chunk_start.date()} to {chunk_end.date()} — expanding chunk.")
            current_chunk_size += pd.Timedelta(days=6*30)

        chunk_start = chunk_end

    return detided

In [None]:
# Chunked detiding of the current series, fitting on 20-minute bins.
chunked = surge_chunked(ts, _lat, 20)

In [None]:
# Overlay the hourly means of the three residual series on a single plot.
(surge_pytides.resample("1h").mean().hvplot(
    label="surge pytides", grid=True
 )*surge_utide.resample("1h").mean().hvplot(
    label="surge utide"
 )*chunked.resample("1h").mean().hvplot(
    label="chunked utide"
 )).opts(
    width=1200,
    height = 500
)