# sktime Time Series Clusterers Catalog


This notebook catalogs sktime clusterers for time series. It provides:
- a quick clustering notation refresher,
- a visual intuition for cluster archetypes,
- a dynamic registry query to list all clusterers in your local sktime install.


## Clustering setup and notation
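Given $n$ series $X^{(i)}$, a clusterer assigns each series a label $\hat{z}_i$ and often does so by minimizing the total distance between every series and the prototype (e.g. centroid) $\mu_{\hat{z}_i}$ of its assigned cluster: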

$$
\sum_{i=1}^n d\bigl(X^{(i)}, \mu_{\hat{z}_i}\bigr).
$$

Distances $d$ can be elastic (DTW-like), feature-based, or model-based depending on the algorithm.


In [1]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Fixed seed so the synthetic figure is identical on every run.
rng = np.random.default_rng(4)
t = np.linspace(0, 1, 70)

# One noise-free template per archetype; the dict order fixes the panel order.
prototypes = dict(
    oscillatory=np.sin(2 * np.pi * t * 2),
    trend=1.3 * t - 0.2,
    square=np.sign(np.sin(2 * np.pi * t * 3)),
)

fig = make_subplots(
    rows=1,
    cols=3,
    subplot_titles=list(prototypes),
    shared_yaxes=True,
)

# Draw six noisy copies of each template into that template's own panel.
for panel, template in enumerate(prototypes.values(), start=1):
    for _ in range(6):
        noisy = template + 0.15 * rng.normal(size=t.size)
        trace = go.Scatter(
            x=t,
            y=noisy,
            mode="lines",
            line={"width": 1},
            showlegend=False,
        )
        fig.add_trace(trace, row=1, col=panel)

fig.update_layout(
    title="Cluster archetypes (synthetic)",
    height=300,
    margin={"l": 20, "r": 20, "t": 50, "b": 20},
)
fig.update_xaxes(showticklabels=False)
fig.show()


## Dynamic registry query


In [2]:
import pandas as pd
import plotly.express as px


def _resolve_all_estimators():
    try:
        from sktime.registry import all_estimators
        return all_estimators
    except Exception:
        from sktime.utils.discovery import all_estimators
        return all_estimators


def _safe_tags(cls):
    if hasattr(cls, "get_class_tags"):
        try:
            return cls.get_class_tags()
        except Exception:
            return {}
    return {}


def _matches_scitype(value, target):
    if value is None:
        return False
    if isinstance(value, (list, tuple, set)):
        return target in value
    return value == target


def _top_tag_keys(tag_dicts, limit=8):
    counts = {}
    for tags in tag_dicts:
        for key, value in tags.items():
            if key in {"scitype", "estimator_type", "task", "learning_type"}:
                continue
            if isinstance(value, (bool, int, float, str)):
                counts[key] = counts.get(key, 0) + 1
    return [
        key
        for key, _ in sorted(counts.items(), key=lambda kv: (-kv[1], kv[0]))[:limit]
    ]


# Probe for sktime once; later cells branch on SKTIME_AVAILABLE instead of
# re-importing. Any failure while importing or resolving the lookup function
# is kept in `_err` so it can be reported below.
SKTIME_AVAILABLE = False
try:
    import sktime  # noqa: F401

    all_estimators = _resolve_all_estimators()
except Exception as probe_error:
    _err = probe_error
else:
    SKTIME_AVAILABLE = True

if SKTIME_AVAILABLE is False:
    print("sktime not installed. Install with `pip install sktime` to populate the catalog.")
    print("Error:", _err)


sktime not installed. Install with `pip install sktime` to populate the catalog.
Error: No module named 'sktime'


In [3]:
# Build the catalog table: one row per registered clusterer, with identifying
# columns followed by the most common scalar tags.
if SKTIME_AVAILABLE:
    target = "clusterer"
    estimators = all_estimators()
    records = []
    tag_dicts = []
    for name, cls in estimators:
        tags = _safe_tags(cls)
        # Current sktime releases declare the estimator type under the
        # "object_type" tag; the two other keys are kept as fallbacks.
        scitype = (
            tags.get("object_type")
            or tags.get("scitype")
            or tags.get("estimator_type")
        )
        if not _matches_scitype(scitype, target):
            continue
        records.append(
            {
                "name": name,
                "class": cls.__name__,
                "module": cls.__module__,
                # First three dotted components, e.g. "sktime.clustering.k_means".
                "module_family": ".".join(cls.__module__.split(".")[:3]),
            }
        )
        tag_dicts.append(tags)

    tag_keys = _top_tag_keys(tag_dicts, limit=8)
    rows = []
    for record, tags in zip(records, tag_dicts):
        row = record.copy()
        for key in tag_keys:
            row[key] = tags.get(key)
        rows.append(row)

    # Passing the columns explicitly keeps the frame well-formed (and sortable)
    # even when no estimator matched and `rows` is empty.
    base_columns = ["name", "class", "module", "module_family"]
    df = pd.DataFrame(rows, columns=base_columns + tag_keys).sort_values("name")

    # An expression nested inside an `if` body is not auto-rendered by the
    # notebook, so the preview has to be displayed explicitly.
    display(df.head(20))


In [4]:
# Visual summaries of the catalog: counts per module family, a treemap of
# family -> estimator, and the share of clusterers with each boolean tag set.
if SKTIME_AVAILABLE and not df.empty:
    # rename_axis + reset_index(name=...) yields the columns
    # ["module_family", "count"] on every pandas version; renaming the output
    # of a bare reset_index() breaks on pandas >= 2.0, where value_counts()
    # already names its result "count".
    counts = (
        df["module_family"]
        .value_counts()
        .rename_axis("module_family")
        .reset_index(name="count")
    )
    fig = px.bar(
        counts,
        x="module_family",
        y="count",
        title="Clusterers by module family",
    )
    fig.show()

    fig = px.treemap(
        df,
        path=["module_family", "name"],
        title="Clusterers catalog (module family -> estimator)",
    )
    fig.show()

    # Keep only tag columns whose non-missing values are all genuine booleans.
    # An isinstance check is used because `isin([True, False])` would also
    # accept the numbers 0 and 1.
    bool_tags = []
    for key in df.columns:
        if key in {"name", "class", "module", "module_family"}:
            continue
        series = df[key].dropna()
        if not series.empty and series.map(
            lambda value: isinstance(value, (bool, np.bool_))
        ).all():
            bool_tags.append(key)

    if bool_tags:
        summary = pd.DataFrame(
            {
                "tag": bool_tags,
                # Missing values are dropped and the rest cast to bool so the
                # mean is computed on a boolean column, not an object one.
                "share_true": [
                    df[tag].dropna().astype(bool).mean() for tag in bool_tags
                ],
            }
        )
        fig = px.bar(
            summary,
            x="tag",
            y="share_true",
            title="Share of clusterers with tag=True",
        )
        fig.show()


## How to use this catalog
- Use module families to find elastic-distance or feature-based clusterers.
- Filter by tag columns for multivariate or unequal-length support.
- Pair with transformers for smoothing or feature extraction before clustering.