# Matrix Profile on Simulated Time Series

This notebook demonstrates a compact workflow:
1. Generate synthetic data with motifs, regime shifts, and anomalies
2. Compute univariate (`stump`) and multidimensional (`mstump`) Matrix Profiles
3. Build features, cluster in PCA space, and visualize results


In [None]:
from __future__ import annotations

import numpy as np
import matplotlib.pyplot as plt

from src.clustering import cluster_features
from src.config import Config
from src.features import build_feature_frame
from src.matrix_profile import compute_multidimensional_mp, compute_univariate_mp
from src.simulate import simulate_univariate_series

# Apply matplotlib's built-in "default" style to every figure created below.
plt.style.use("default")


In [None]:
# End-to-end pipeline: simulate -> matrix profiles -> features -> clustering.
# Only n, m and the seed are set here; every other setting read from `cfg`
# below is whatever Config provides for it.
cfg = Config(n=3000, m=100, random_seed=42)

# Simulator arguments are gathered in one place so they are easy to inspect.
sim_kwargs = {
    "n": cfg.n,
    "seed": cfg.random_seed,
    "motif_len": cfg.motif_len,
    "n_motifs": cfg.n_motifs,
    "noise_std": cfg.noise_std,
    "regime_shift": cfg.regime_shift,
    "anomalies": cfg.anomalies,
}
ts, metadata = simulate_univariate_series(**sim_kwargs)

# Univariate profile on the raw series with window length cfg.m.
mp_uni = compute_univariate_mp(ts, cfg.m)

# Two-row input for the multidimensional profile: the series itself and its
# numerical derivative (np.gradient).
X_multi = np.vstack((ts, np.gradient(ts)))
mp_multi = compute_multidimensional_mp(X_multi, cfg.m)

# Feature table built from the series and its univariate profile, then clustered.
# NOTE(review): the returned models are presumably a fitted PCA and KMeans,
# judging by the parameter names -- confirm in src.clustering.
features = build_feature_frame(
    ts,
    mp_uni,
    rolling_window=cfg.feature_rolling_window,
)
clustered, pca_model, kmeans_model = cluster_features(
    features,
    n_components=cfg.pca_components,
    k=cfg.n_clusters,
    random_state=cfg.random_seed,
)

# Quick size check of the main outputs.
print(
    f"Series length: {len(ts)}",
    f"Univariate MP length: {len(mp_uni['mp'])}",
    f"Multidimensional MP shape: {mp_multi['mp'].shape}",
    sep="\n",
)


## Simulated signal structure

Motifs are repeated subsequences, anomalies are isolated abnormal points (or short level shifts), and regime boundaries separate segments with different distributional behavior.

In [None]:
# Overview figure: the raw series annotated with the ground truth the
# simulator reported in `metadata` (motif windows, anomaly points, regime
# boundaries).
fig, ax = plt.subplots(figsize=(14, 4))
ax.plot(ts, color="#2f5d80", linewidth=1.0, label="time series")

# Only the first artist of each kind is labelled so the legend shows a single
# entry per kind instead of one per occurrence.
motif_len = metadata["motif_len"]
for i, pos in enumerate(metadata["motif_positions"]):
    ax.axvline(pos, color="#2a9d8f", linestyle="--", alpha=0.75, label="motif start" if i == 0 else None)
    ax.axvspan(pos, pos + motif_len, color="#2a9d8f", alpha=0.08)

# Convert to an index array *before* testing for emptiness: `if <ndarray>`
# raises ValueError for arrays with more than one element, whereas `.size`
# works whether the simulator hands back a list, tuple, or ndarray.
# None is treated as "no anomalies", matching the previous truthiness check.
anomaly_positions = metadata["anomaly_positions"]
anom = np.asarray([] if anomaly_positions is None else anomaly_positions, dtype=int)
if anom.size:
    ax.scatter(anom, ts[anom], color="#d62828", s=24, label="anomaly")

for i, b in enumerate(metadata["regime_boundaries"]):
    ax.axvline(b, color="black", linestyle=":", alpha=0.9, label="regime boundary" if i == 0 else None)

ax.set_title("Synthetic time series with motifs, anomalies, and regime shifts")
ax.set_xlabel("time index")
ax.set_ylabel("value")
ax.legend(loc="upper right")
plt.show()


## Matrix Profile views

The Matrix Profile stores, for each subsequence, the distance to its nearest-neighbor subsequence elsewhere in the series. Low values usually indicate repeated structure (motifs), while high values can indicate rare events or transitions.

In [None]:
# Two stacked panels: univariate profile on top, multidimensional below.
# Each profile value is drawn at the midpoint of its window
# (subsequence start index + m // 2) rather than at the window start.
half_window = cfg.m // 2
fig, (ax_uni, ax_multi) = plt.subplots(2, 1, figsize=(14, 7), sharex=False)

profile_uni = mp_uni["mp"]
ax_uni.plot(
    np.arange(len(profile_uni)) + half_window,
    profile_uni,
    color="#264653",
    linewidth=1.2,
)
ax_uni.set(title="Univariate Matrix Profile (stump)", ylabel="distance")

# One line per row of the multidimensional profile array.
# NOTE(review): if the helper wraps stumpy.mstump unchanged, row k may be the
# (k+1)-dimensional profile rather than "dimension k+1" -- confirm the label.
profile_multi = mp_multi["mp"]
midpoints_multi = np.arange(profile_multi.shape[1]) + half_window
for dim_idx, row in enumerate(profile_multi):
    ax_multi.plot(midpoints_multi, row, linewidth=1.0, label=f"dim {dim_idx + 1}")
ax_multi.set(
    title="Multidimensional Matrix Profile (mstump)",
    xlabel="time index",
    ylabel="distance",
)
ax_multi.legend()

fig.tight_layout()
plt.show()


In [None]:
# Figure 1: cluster assignments over time. Top panel shows the series with
# each point coloured by cluster; bottom panel shows the label trace.
# Figure 2: the same points in the plane of the first two principal components.
#
# constrained_layout replaces plt.tight_layout() here because the colorbar is
# attached to both panels (see below), which tight_layout does not handle.
fig, axes = plt.subplots(
    2,
    1,
    figsize=(14, 7),
    gridspec_kw={"height_ratios": [3, 1]},
    sharex=True,
    constrained_layout=True,
)
x = clustered.index.to_numpy()
labels = clustered["cluster"].to_numpy()

axes[0].plot(x, clustered["ts"], color="lightgray", linewidth=1.0)
sc = axes[0].scatter(x, clustered["ts"], c=labels, cmap="tab10", s=10)
axes[0].set_title("Cluster assignments over time")
axes[0].set_ylabel("value")

axes[1].step(x, labels, where="mid", color="#1d3557")
axes[1].set_xlabel("time index")
axes[1].set_ylabel("cluster")

# Attach the colorbar to BOTH panels. Attaching it to axes[0] alone shrinks
# only the top panel, so the two sharex panels stop lining up on the time
# axis. Ticks are pinned to the labels that actually occur so the bar does not
# show fractional cluster ids.
fig.colorbar(sc, ax=axes, label="cluster", ticks=np.unique(labels))
plt.show()

fig, ax = plt.subplots(figsize=(7, 5.5))
ax.scatter(clustered["pc1"], clustered["pc2"], c=labels, cmap="tab10", s=14, alpha=0.85)
ax.set_title("PCA latent space")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
plt.show()


## Effect of window length `m`

A smaller `m` reacts to short, local structure; a larger `m` emphasizes longer subsequences and typically yields a smoother profile.

In [None]:
# Overlay the univariate matrix profile for several window lengths on a single
# axis. Each curve is positioned at its own window midpoint (start + m // 2).
m_values = [60, 100, 160]
fig, ax = plt.subplots(figsize=(14, 4.5))

for m in m_values:
    profile = compute_univariate_mp(ts, m)["mp"]
    midpoints = np.arange(len(profile)) + m // 2
    ax.plot(midpoints, profile, linewidth=1.1, label=f"m={m}")

ax.set(
    title="Window length sensitivity",
    xlabel="time index",
    ylabel="matrix profile distance",
)
ax.legend()
plt.show()
