diff --git a/.all-contributorsrc b/.all-contributorsrc
index ceaf4768928..eaac5917150 100644
--- a/.all-contributorsrc
+++ b/.all-contributorsrc
@@ -2405,6 +2405,15 @@
         "bug",
         "code"
       ]
+    },
+    {
+      "login": "pirnerjonas",
+      "name": "Jonas Pirner",
+      "avatar_url": "https://avatars.githubusercontent.com/u/48887249?v=4",
+      "profile": "https://github.com/pirnerjonas",
+      "contributions": [
+        "doc"
+      ]
     }
   ]
 }
diff --git a/.github/actions/test-base/action.yml b/.github/actions/test-base/action.yml
index 8c131fd4b75..78b802be054 100644
--- a/.github/actions/test-base/action.yml
+++ b/.github/actions/test-base/action.yml
@@ -4,6 +4,12 @@ inputs:
   python-version-identifier:
     description: python version to run tests
     required: true
+  sub-sample-estimators:
+    description: test only subset of estimators
+    required: true
+  test-affected-estimators:
+    description: test only modified estimators
+    required: true
 runs:
   using: composite
   steps:
@@ -17,7 +23,7 @@ runs:
       run: python3 -m pip install .[tests]
       shell: bash
     - name: unit test step
-      run: python3 -m pytest sktime/base
+      run: python3 -m pytest sktime/base --matrixdesign ${{ inputs.sub-sample-estimators }} --only_changed_modules ${{ inputs.test-affected-estimators }}
       shell: bash
     - name: test coverage step
       uses: codecov/codecov-action@v3
diff --git a/.github/actions/test-component/action.yml b/.github/actions/test-component/action.yml
index 22446c1eee8..6e089e5b737 100644
--- a/.github/actions/test-component/action.yml
+++ b/.github/actions/test-component/action.yml
@@ -7,6 +7,12 @@ inputs:
   python-version-identifier:
     description: python version to run tests
     required: true
+  sub-sample-estimators:
+    description: test only subset of estimators
+    required: true
+  test-affected-estimators:
+    description: test only modified estimators
+    required: true
 runs:
   using: composite
   steps:
@@ -20,7 +26,7 @@ runs:
       run: python3 -m pip install .[${{ inputs.sktime-component-identifier }},tests]
       shell: bash
     - name: unit test step
-      run: python3 -m pytest sktime/${{ inputs.sktime-component-identifier }}
+      run: python3 -m pytest sktime/${{ inputs.sktime-component-identifier }} --matrixdesign ${{ inputs.sub-sample-estimators }} --only_changed_modules ${{ inputs.test-affected-estimators }}
       shell: bash
     - name: test coverage step
       uses: codecov/codecov-action@v3
diff --git a/.github/workflows/test_all.yml b/.github/workflows/test_all.yml
index 5f3c0c33390..371cea14e22 100644
--- a/.github/workflows/test_all.yml
+++ b/.github/workflows/test_all.yml
@@ -24,6 +24,8 @@ jobs:
         uses: ./.github/actions/test-base
         with:
           python-version-identifier: ${{ matrix.python-version }}
+          sub-sample-estimators: "False"
+          test-affected-estimators: "False"
   test_components:
     name: test individual components
     strategy:
@@ -55,3 +57,5 @@ jobs:
         with:
           sktime-component-identifier: ${{ matrix.sktime-component }}
           python-version-identifier: ${{ matrix.python-version }}
+          sub-sample-estimators: "False"
+          test-affected-estimators: "False"
diff --git a/.github/workflows/test_base.yml b/.github/workflows/test_base.yml
index 70ff10eaa8d..1f577607faa 100644
--- a/.github/workflows/test_base.yml
+++ b/.github/workflows/test_base.yml
@@ -42,3 +42,5 @@ jobs:
         uses: ./.github/actions/test-base
         with:
           python-version-identifier: ${{ matrix.python-version }}
+          sub-sample-estimators: "True"
+          test-affected-estimators: "True"
diff --git a/.github/workflows/test_components.yml b/.github/workflows/test_components.yml
index 1642085db4f..5d56d7a096a 100644
--- a/.github/workflows/test_components.yml
+++ b/.github/workflows/test_components.yml
@@ -59,3 +59,5 @@ jobs:
         with:
           sktime-component-identifier: ${{ matrix.sktime-component }}
           python-version-identifier: ${{ matrix.python-version }}
+          sub-sample-estimators: "True"
+          test-affected-estimators: "True"
diff --git a/build_tools/docker/py37.dockerfile b/build_tools/docker/py37.dockerfile
deleted file mode 100644
index 230d23c029a..00000000000
--- a/build_tools/docker/py37.dockerfile
+++ /dev/null
@@ -1,8 +0,0 @@
-FROM python:3.7.16-bullseye
-
-WORKDIR /usr/src/sktime
-
-COPY . .
-
-RUN python -m pip install -U pip
-RUN python -m pip install .[all_extras,dev,binder]
diff --git a/docs/source/api_reference/alignment.rst b/docs/source/api_reference/alignment.rst
index d40a87b29cb..5306ed44cb4 100644
--- a/docs/source/api_reference/alignment.rst
+++ b/docs/source/api_reference/alignment.rst
@@ -38,6 +38,14 @@ Dynamic time warping
 
     AlignerDtwNumba
 
+.. currentmodule:: sktime.alignment.lucky
+
+.. autosummary::
+    :toctree: auto_generated/
+    :template: class.rst
+
+    AlignerLuckyDtw
+
 
 Edit distance based aligners
 ----------------------------
diff --git a/docs/source/api_reference/dists_kernels.rst b/docs/source/api_reference/dists_kernels.rst
index ed8bf29802f..c6201165bd2 100644
--- a/docs/source/api_reference/dists_kernels.rst
+++ b/docs/source/api_reference/dists_kernels.rst
@@ -113,6 +113,7 @@ Dynamic Time Warping Distances
     :template: class.rst
 
     DtwDist
+    DtwPythonDist
     DtwDistTslearn
     SoftDtwDistTslearn
 
@@ -124,6 +125,14 @@ Dynamic Time Warping Distances
 
     CtwDistTslearn
 
+.. currentmodule:: sktime.dists_kernels.lucky
+
+.. autosummary::
+    :toctree: auto_generated/
+    :template: class.rst
+
+    LuckyDtwDist
+
 Time warping distances can also be obtained by composing ``DistFromAligner`` with
 a time warping aligner, see docstring of ``DistFromAligner``:
 
diff --git a/docs/source/developer_guide/continuous_integration.rst b/docs/source/developer_guide/continuous_integration.rst
index 16b72bf7dfb..8b930bb8426 100644
--- a/docs/source/developer_guide/continuous_integration.rst
+++ b/docs/source/developer_guide/continuous_integration.rst
@@ -157,8 +157,6 @@ with the image of name ``PYTHON_VERSION`` based on the following python versions
 +----------------+----------------+
 | Python version | PYTHON_VERSION |
 +================+================+
-|     3.7.16     |      py37      |
-+----------------+----------------+
 |     3.8.16     |      py38      |
 +----------------+----------------+
 |     3.9.16     |      py39      |
@@ -171,8 +169,8 @@ with the image of name ``PYTHON_VERSION`` based on the following python versions
 The dockerized tests can be also executed via `make <https://www.gnu.org/software/make/>`_,
 via the command ``make dockertest PYTHON_VERSION=<python version>``.
 The ``PYTHON_VERSION`` argument specifies the python version and is the same string as in the table above.
-For example, to execute the tests in the Python version ``3.7.16``,
-use ``make dockertest PYTHON_VERSION=py37``.
+For example, to execute the tests in the Python version ``3.8.16``,
+use ``make dockertest PYTHON_VERSION=py38``.
 
 
 Continuous integration
diff --git a/examples/02_classification.ipynb b/examples/02_classification.ipynb
index 8106380adab..0851ebda101 100644
--- a/examples/02_classification.ipynb
+++ b/examples/02_classification.ipynb
@@ -1061,7 +1061,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Evaluation is simila to `sklearn` classifiers - we split a dataset and evaluate performance on the test set.\n",
+    "Evaluation is similar to `sklearn` classifiers - we split a dataset and evaluate performance on the test set.\n",
     "\n",
     "This includes as additional steps:\n",
     "\n",
diff --git a/pyproject.toml b/pyproject.toml
index dc991353821..a6348d26a3b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -80,7 +80,7 @@ forecasting = [
     "statsforecast>=0.5.2,<1.7",
     "statsmodels>=0.12.1,<0.15",
     "tbats>=1.1,<1.2",
-    "arch>=5.6.0,<6.2.0",
+    "arch>=5.6.0,<6.3.0",
 ]
 networks = [
     "keras-self-attention>=0.51,<0.52",
@@ -139,7 +139,7 @@ all_extras = [
     "seaborn>=0.11.0",
     "seasonal",
     "skpro>=2.0.0,<2.1.0",
-    "statsforecast>=0.5.2,<1.6.0; python_version < '3.11'",
+    "statsforecast>=0.5.2,<1.7.0",
     "statsmodels>=0.12.1",
     "stumpy>=1.5.1; python_version < '3.11'",
     "tbats>=1.1.0",
@@ -147,7 +147,7 @@ all_extras = [
     "tsfresh>=0.17.0",
     "tslearn>=0.5.2,<0.6.0; python_version < '3.11'",
     "xarray",
-    "arch>=5.6.0,<6.2.0",
+    "arch>=5.6.0,<6.3.0",
 ]
 
 all_extras_pandas2 = [
@@ -175,7 +175,7 @@ all_extras_pandas2 = [
     "seaborn>=0.11.0",
     "seasonal",
     "skpro>=2.0.0,<2.1.0",
-    "statsforecast>=0.5.2,<1.6.0; python_version < '3.11'",
+    "statsforecast>=0.5.2,<1.7.0",
     "statsmodels>=0.12.1",
     "stumpy>=1.5.1; python_version < '3.11'",
     "tbats>=1.1.0",
@@ -183,7 +183,7 @@ all_extras_pandas2 = [
     "tsfresh>=0.17.0",
     "tslearn>=0.5.2,<0.6.0; python_version < '3.11'",
     "xarray",
-    "arch>=5.6.0,<6.2.0",
+    "arch>=5.6.0,<6.3.0",
 ]
 
 cython_extras = [
diff --git a/sktime/alignment/base.py b/sktime/alignment/base.py
index 0ca7b98d932..111866835ee 100644
--- a/sktime/alignment/base.py
+++ b/sktime/alignment/base.py
@@ -337,4 +337,17 @@ def _get_distance_matrix(self):
         distmat: an (n x n) np.array of floats, where n is length of X passed to fit
             [i,j]-th entry is alignment distance between X[i] and X[j] passed to fit
         """
-        raise NotImplementedError
+        # the default implementation assumes
+        # that the aligner can only align two sequences
+        if self.get_tag("capability:multiple-alignment", False):
+            raise NotImplementedError
+
+        import numpy as np
+
+        dist = self.get_distance()
+
+        distmat = np.zeros((2, 2), dtype="float")
+        distmat[0, 1] = dist
+        distmat[1, 0] = dist
+
+        return distmat
diff --git a/sktime/alignment/dtw_numba.py b/sktime/alignment/dtw_numba.py
index 70648c6e36b..b466cec72e0 100644
--- a/sktime/alignment/dtw_numba.py
+++ b/sktime/alignment/dtw_numba.py
@@ -110,8 +110,8 @@ class AlignerDtwNumba(BaseAligner):
     >>> from sktime.utils._testing.series import _make_series
     >>> from sktime.alignment.dtw_numba import AlignerDtwNumba
     >>>
-    >>> X0 = _make_series()  # doctest: +SKIP
-    >>> X1 = _make_series()  # doctest: +SKIP
+    >>> X0 = _make_series(return_mtype="pd.DataFrame")  # doctest: +SKIP
+    >>> X1 = _make_series(return_mtype="pd.DataFrame")  # doctest: +SKIP
     >>> d = AlignerDtwNumba(weighted=True, derivative=True)  # doctest: +SKIP
     >>> align = d.fit([X0, X1]).get_alignment()  # doctest: +SKIP
     """
diff --git a/sktime/alignment/dtw_python.py b/sktime/alignment/dtw_python.py
index e868e3b628b..7334c2b73de 100644
--- a/sktime/alignment/dtw_python.py
+++ b/sktime/alignment/dtw_python.py
@@ -9,7 +9,6 @@
 import pandas as pd
 
 from sktime.alignment.base import BaseAligner
-from sktime.utils.validation._dependencies import _check_soft_dependencies
 
 
 class AlignerDTW(BaseAligner):
@@ -18,30 +17,31 @@ class AlignerDTW(BaseAligner):
     Behaviour: computes the full alignment between X[0] and X[1]
         assumes pairwise alignment (only two series) and univariate
         if multivariate series are passed:
-            alignment is computed on univariate series with variable_to_align;
-            if this is not set, defaults to the first variable of X[0]
+        alignment is computed on univariate series with variable_to_align;
+        if this is not set, defaults to the first variable of X[0]
         raises an error if variable_to_align is not present in X[0] or X[1]
 
     Parameters
     ----------
     dist_method : str, optional, default = "euclidean"
         distance function to use, a distance on real n-space
-            one of the functions in `scipy.spatial.distance.cdist`
+        one of the functions in `scipy.spatial.distance.cdist`
     step_pattern : str, optional, or dtw_python stepPattern object, optional
         step pattern to use in time warping
         one of: 'symmetric1', 'symmetric2' (default), 'asymmetric',
-                and dozens of other more non-standard step patterns;
-                list can be displayed by calling help(stepPattern) in dtw
+        and dozens of other more non-standard step patterns;
+        list can be displayed by calling help(stepPattern) in dtw
     window_type : string, the chosen windowing function
         "none", "itakura", "sakoechiba", or "slantedband"
-            "none" (default) - no windowing
-            "sakoechiba" - a band around main diagonal
-            "slantedband" - a band around slanted diagonal
-            "itakura" - Itakura parallelogram
-    open_begin, open_end : boolean, optional, default=False
+        "none" (default) - no windowing
+        "sakoechiba" - a band around main diagonal
+        "slantedband" - a band around slanted diagonal
+        "itakura" - Itakura parallelogram
+    open_begin : boolean, optional, default=False
+    open_end : boolean, optional, default=False
         whether to perform open-ended alignments
-            open_begin = whether alignment open ended at start (low index)
-            open_end = whether alignment open ended at end (high index)
+        open_begin = whether alignment open ended at start (low index)
+        open_end = whether alignment open ended at end (high index)
     variable_to_align : string, default = first variable in X[0] as passed to fit
         which variable to use for univariate alignment
     """
@@ -203,32 +203,30 @@ def get_test_params(cls, parameter_set="default"):
 class AlignerDTWfromDist(BaseAligner):
     """Aligner interface for dtw-python using pairwise transformer.
 
-        uses transformer for computation of distance matrix passed to alignment
+    Uses transformer for computation of distance matrix passed to alignment.
 
-    Components
+    Parameters
     ----------
     dist_trafo: estimator following the pairwise transformer template
         i.e., instance of concrete class implementing template BasePairwiseTransformer
-
-    Parameters
-    ----------
     step_pattern : str, optional, default = "symmetric2",
-            or dtw_python stepPattern object, optional
+        or dtw_python stepPattern object, optional
         step pattern to use in time warping
         one of: 'symmetric1', 'symmetric2' (default), 'asymmetric',
-                and dozens of other more non-standard step patterns;
-                list can be displayed by calling help(stepPattern) in dtw
+        and dozens of other more non-standard step patterns;
+        list can be displayed by calling help(stepPattern) in dtw
     window_type: str  optional, default = "none"
         the chosen windowing function
         "none", "itakura", "sakoechiba", or "slantedband"
-            "none" (default) - no windowing
-            "sakoechiba" - a band around main diagonal
-            "slantedband" - a band around slanted diagonal
-            "itakura" - Itakura parallelogram
-    open_begin, open_end: boolean, optional, default=False
+        "none" (default) - no windowing
+        "sakoechiba" - a band around main diagonal
+        "slantedband" - a band around slanted diagonal
+        "itakura" - Itakura parallelogram
+    open_begin : boolean, optional, default=False
+    open_end : boolean, optional, default=False
         whether to perform open-ended alignments
-            open_begin = whether alignment open ended at start (low index)
-            open_end = whether alignment open ended at end (high index)
+        open_begin = whether alignment open ended at start (low index)
+        open_end = whether alignment open ended at end (high index)
     """
 
     _tags = {
@@ -236,6 +234,7 @@ class AlignerDTWfromDist(BaseAligner):
         "capability:distance": True,  # does compute/return overall distance?
         "capability:distance-matrix": True,  # does compute/return distance matrix?
         "python_dependencies": "dtw-python",
+        "python_dependencies_alias": {"dtw-python": "dtw"},
     }
 
     def __init__(
@@ -246,16 +245,6 @@ def __init__(
         open_begin=False,
         open_end=False,
     ):
-        """Construct instance."""
-        # added manually since dtw-python has an import alias
-        # default check from super.__init__ does not allow aliases
-        _check_soft_dependencies(
-            "dtw-python",
-            package_import_alias={"dtw-python": "dtw"},
-            severity="error",
-            obj=self,
-            suppress_import_stdout=True,
-        )
         super().__init__()
 
         self.dist_trafo = dist_trafo
diff --git a/sktime/alignment/lucky.py b/sktime/alignment/lucky.py
new file mode 100644
index 00000000000..5fb25afcd14
--- /dev/null
+++ b/sktime/alignment/lucky.py
@@ -0,0 +1,162 @@
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Lucky sequence alignment."""
+
+import numpy as np
+import pandas as pd
+
+from sktime.alignment.base import BaseAligner
+
+
+class AlignerLuckyDtw(BaseAligner):
+    """Alignment path based on lucky dynamic time warping distance.
+
+    This aligner returns the alignment path produced by the lucky time warping
+    distance [1]_.
+    Pointwise cost is the squared Euclidean distance, also for multivariate data.
+
+    Based on code by Krisztian A Buza's research group.
+
+    Parameters
+    ----------
+    window : int, optional (default=None)
+        Maximum distance between indices of aligned series, aka warping window.
+        If None, defaults to max(len(ts1), len(ts2)), i.e., no warping window.
+        Values below abs(len(ts1) - len(ts2)) are widened to that length gap.
+
+    References
+    ----------
+    .. [1] Stephan Spiegel, Brijnesh-Johannes Jain, and Sahin Albayrak.
+        Fast time series classification under lucky time warping distance.
+        Proceedings of the 29th Annual ACM Symposium on Applied Computing. 2014.
+    """
+
+    _tags = {
+        "capability:multiple-alignment": False,  # can align more than two sequences?
+        "capability:distance": True,  # does compute/return overall distance?
+        "capability:distance-matrix": True,  # does compute/return distance matrix?
+        "alignment_type": "full",  # does the aligner produce full or partial alignment
+    }
+
+    def __init__(self, window=None):
+        self.window = window
+
+        super().__init__()
+
+    def _fit(self, X, Z=None):
+        """Fit alignment given series/sequences to align.
+
+        Greedy walk from (0, 0): each step takes the cheapest admissible move.
+
+        Parameters
+        ----------
+        X: list of pd.DataFrame (sequence) of length n - panel of series to align
+        Z: pd.DataFrame with n rows, optional; metadata, row correspond to indices of X
+        """
+        window = self.window
+
+        ts1, ts2 = X
+        ts1, ts2 = ts1.values, ts2.values
+        len_ts1, len_ts2 = len(ts1), len(ts2)
+
+        if window is None:
+            window = max(len_ts1, len_ts2)
+        # a window narrower than the length gap leaves no admissible move once the
+        # shorter series is exhausted (the walk would never finish), so widen it
+        window = max(window, abs(len_ts1 - len_ts2))
+
+        def vec_dist(x):
+            return np.linalg.norm(x) ** 2
+
+        d = vec_dist(ts1[0] - ts2[0])
+
+        i = j = 0
+        align_i = [i]
+        align_j = [j]
+
+        while i + 1 < len_ts1 or j + 1 < len_ts2:
+            d_best = np.inf
+
+            # diagonal move keeps i - j unchanged, so it needs no window check;
+            # strict "<" below resolves ties in order: diagonal, advance i, advance j
+            if i + 1 < len_ts1 and j + 1 < len_ts2:
+                d_best = vec_dist(ts1[i + 1] - ts2[j + 1])
+                new_i = i + 1
+                new_j = j + 1
+
+            if i + 1 < len_ts1 and abs(i + 1 - j) <= window:
+                d1 = vec_dist(ts1[i + 1] - ts2[j])
+                if d1 < d_best:
+                    d_best = d1
+                    new_i = i + 1
+                    new_j = j
+
+            if j + 1 < len_ts2 and abs(j + 1 - i) <= window:
+                d2 = vec_dist(ts1[i] - ts2[j + 1])
+                if d2 < d_best:
+                    d_best = d2
+                    new_i = i
+                    new_j = j + 1
+
+            d = d + d_best
+            i = new_i
+            j = new_j
+            align_i.append(i)
+            align_j.append(j)
+
+        self.align_i_ = align_i
+        self.align_j_ = align_j
+        self.dist_ = d
+
+        return self
+
+    def _get_alignment(self):
+        """Return alignment for sequences/series passed in fit (iloc indices).
+
+        Behaviour: returns an alignment for sequences in X passed to fit
+            model should be in fitted state, fitted model parameters read from self
+
+        Returns
+        -------
+        pd.DataFrame in alignment format, with columns 'ind'+str(i) for integer i
+            cols contain iloc index of X[i] mapped to alignment coordinate for alignment
+        """
+        align = pd.DataFrame({"ind0": self.align_i_, "ind1": self.align_j_})
+        return align
+
+    def _get_distance(self):
+        """Return overall distance of alignment.
+
+        Behaviour: returns overall distance corresponding to alignment
+            not all aligners will return or implement this (optional)
+        Accesses in self:
+            Fitted model attributes ending in "_".
+
+        Returns
+        -------
+        distance: float - overall distance between all elements of X passed to fit
+        """
+        return self.dist_
+
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return testing parameter settings for the estimator.
+
+        Parameters
+        ----------
+        parameter_set : str, default="default"
+            Name of the set of test parameters to return, for use in tests. If no
+            special parameters are defined for a value, will return `"default"` set.
+            There are currently no reserved values for aligners.
+
+        Returns
+        -------
+        params : dict or list of dict, default = {}
+            Parameters to create testing instances of the class
+            Each dict are parameters to construct an "interesting" test instance, i.e.,
+            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
+            `create_test_instance` uses the first (or only) dictionary in `params`
+        """
+        params0 = {}
+        params1 = {"window": 3}
+
+        return [params0, params1]
diff --git a/sktime/dists_kernels/__init__.py b/sktime/dists_kernels/__init__.py
index 5f384da981c..30a588a16e0 100644
--- a/sktime/dists_kernels/__init__.py
+++ b/sktime/dists_kernels/__init__.py
@@ -9,6 +9,7 @@
 from sktime.dists_kernels.dtw import DtwDist
 from sktime.dists_kernels.dummy import ConstantPwTrafoPanel
 from sktime.dists_kernels.edit_dist import EditDist
+from sktime.dists_kernels.lucky import LuckyDtwDist
 from sktime.dists_kernels.scipy_dist import ScipyDist
 from sktime.dists_kernels.signature_kernel import SignatureKernel
 
@@ -17,6 +18,7 @@
     "BasePairwiseTransformerPanel",
     "AggrDist",
     "DtwDist",
+    "LuckyDtwDist",
     "EditDist",
     "FlatDist",
     "ScipyDist",
diff --git a/sktime/dists_kernels/base/_delegate.py b/sktime/dists_kernels/base/_delegate.py
new file mode 100644
index 00000000000..d69527f09c0
--- /dev/null
+++ b/sktime/dists_kernels/base/_delegate.py
@@ -0,0 +1,64 @@
+"""Delegator mixin that delegates all methods to wrapped transformer.
+
+Useful for building estimators where all but one or a few methods are delegated.
+For that purpose, inherit from this estimator and then override only the methods that
+are not delegated.
+"""
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+
+__author__ = ["fkiraly"]
+__all__ = ["_DelegatedPairwiseTransformerPanel"]
+
+from sktime.dists_kernels.base import BasePairwiseTransformerPanel
+
+
+class _DelegatedPairwiseTransformerPanel(BasePairwiseTransformerPanel):
+    """Delegator mixin that forwards the inner transform to a wrapped transformer.
+
+    Inner transformer methods are forwarded to a wrapped estimator.
+        The wrapped estimator is the value of the attribute named self._delegate_name.
+        By default, this is "estimator_", i.e., calls go to self.estimator_
+        To change the delegation target, override _delegate_name in a child class.
+
+    Inner underscore methods that are delegated:
+        _transform
+
+    NOT delegated: get_params, set_params.
+        get_params, set_params will hence use one additional nesting level by default.
+
+    Tags are neither delegated nor copied; a child class should do so if required.
+    """
+
+    # name of the attribute holding the wrapped estimator, read by _get_delegate;
+    #     child classes override this to delegate to a differently named attribute
+    #     see further details in _DelegatedPairwiseTransformerPanel docstring
+    _delegate_name = "estimator_"
+
+    def _get_delegate(self):
+        """Return the wrapped estimator stored under ``self._delegate_name``."""
+        return getattr(self, self._delegate_name)
+
+    def _transform(self, X, X2=None):
+        """Compute distance/kernel matrix by calling the wrapped estimator.
+
+        Private core logic invoked from transform.
+
+        Behaviour: forwards X and X2 unchanged to the public ``transform`` of the
+            delegate and returns the resulting pairwise distance/kernel matrix
+
+        Parameters
+        ----------
+        X : guaranteed to be Series or Panel of mtype X_inner_mtype, n instances
+            if X_inner_mtype is list, _transform must support all types in it
+            Data to be transformed
+        X2 : guaranteed to be Series or Panel of mtype X_inner_mtype, m instances
+            if X_inner_mtype is list, _transform must support all types in it
+            Data to be transformed
+            default X2 = X
+
+        Returns
+        -------
+        distmat: np.array of shape [n, m]
+            (i,j)-th entry contains distance/kernel between X[i] and X2[j]
+        """
+        return self._get_delegate().transform(X, X2=X2)
diff --git a/sktime/dists_kernels/dtw/__init__.py b/sktime/dists_kernels/dtw/__init__.py
index 549da5cc4fd..bc1fa6b4672 100644
--- a/sktime/dists_kernels/dtw/__init__.py
+++ b/sktime/dists_kernels/dtw/__init__.py
@@ -1,7 +1,8 @@
 # copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
 """Dynamic time warping distances."""
 
-__all__ = ["DtwDist", "DtwDistTslearn", "SoftDtwDistTslearn"]
+__all__ = ["DtwDist", "DtwPythonDist", "DtwDistTslearn", "SoftDtwDistTslearn"]
 
-from sktime.dists_kernels.dtw._dtw_python import DtwDist
+from sktime.dists_kernels.dtw._dtw_python import DtwPythonDist
+from sktime.dists_kernels.dtw._dtw_sktime import DtwDist
 from sktime.dists_kernels.dtw._dtw_tslearn import DtwDistTslearn, SoftDtwDistTslearn
diff --git a/sktime/dists_kernels/dtw/_dtw_python.py b/sktime/dists_kernels/dtw/_dtw_python.py
index 67d03860286..adc448fe74b 100644
--- a/sktime/dists_kernels/dtw/_dtw_python.py
+++ b/sktime/dists_kernels/dtw/_dtw_python.py
@@ -1,202 +1,86 @@
-"""BaseEstimator interface to sktime dtw distances in distances module."""
+"""BaseEstimator interface to dynamic time warping distances in dtw_python."""
 
 __author__ = ["fkiraly"]
 
-from typing import Union
+from sktime.dists_kernels.base._delegate import _DelegatedPairwiseTransformerPanel
 
-import numpy as np
 
-from sktime.distances import pairwise_distance
-from sktime.dists_kernels.base import BasePairwiseTransformerPanel
+class DtwPythonDist(_DelegatedPairwiseTransformerPanel):
+    r"""Interface to dynamic time warping distances in the dtw-python package.
 
-
-class DtwDist(BasePairwiseTransformerPanel):
-    r"""Interface to sktime native dtw distances, with derivative or weighting.
-
-    Interface to simple dynamic time warping (DTW) distance,
-    and the following weighted/derivative versions:
-
-    * WDTW - weighted dynamic tyme warping - ``weighted=True, derivative=False`
-    * DDTW - derivative dynamic time warping - ``weighted=False, derivative=True``
-    * WDDTW - weighted derivative dynamic time
-      warping - ``weighted=True, derivative=True``
-
-    `sktime` interface to the efficient `numba` implementations
-    provided by ``pairwise_distance`` in `sktime.distances`.
-
-    This estimator provides performant implementation of time warping distances for:
-    * time series of equal length
-    * the Euclidean pairwise distance
-
-    For unequal length time series, use ``sktime.dists_kernels.DistFromAligner``
-    with a time warping aligner such as ``sktime.aligners.AlignerDTW``.
-    To use arbitrary pairwise distances, use ``sktime.aligners.AlignerDTWfromDist``.
-    (for derivative DTW, pipeline an alignment distance with ``Differencer``)
-
-    Note that the more flexible options above may be less performant.
-
-    The algorithms are also available as alignment estimators
-    ``sktime.alignmnent.dtw_numba``, producing alignments aka alignment paths.
-
-    DTW was originally proposed in [1]_, DTW computes the distance between two
-    time series by considering their alignments during the calculation.
-    This is done by measuring
-    the pointwise distance (normally using Euclidean) between all elements of the two
-    time series and then using dynamic programming to find the warping path
-    that minimises the total pointwise distance between realigned series.
-
-    DDTW is an adaptation of DTW originally proposed in [2]_. DDTW attempts to
-    improve on dtw by better account for the 'shape' of the time series.
-    This is done by considering y axis data points as higher level features of 'shape'.
-    To do this the first derivative of the sequence is taken, and then using this
-    derived sequence a dtw computation is done.
-
-    WDTW was first proposed in [3]_, it adds a multiplicative weight penalty based on
-    the warping distance. This means that time series with lower phase difference have
-    a smaller weight imposed (i.e less penalty imposed) and time series with larger
-    phase difference have a larger weight imposed (i.e. larger penalty imposed).
-
-    WDDTW was first proposed in [3]_ as an extension of DDTW. By adding a weight
-    to the derivative it means the alignment isn't only considering the shape of the
-    time series, but also the phase.
+    Computes the dynamic time warping distance between series, using
+    the dtw-python package.
 
     Parameters
     ----------
-    weighted : bool, optional, default=False
-        whether a weighted version of the distance is computed
-        False = unmodified distance, i.e., dtw distance or derivative dtw distance
-        True = weighted distance, i.e., weighted dtw or derivative weighted dtw
-    derivative : bool, optional, default=False
-        whether the distance or the derivative distance is computed
-        False = unmodified distance, i.e., dtw distance or weighted dtw distance
-        True = derivative distance, i.e., derivative dtw distance or derivative wdtw
-    window: int, defaults = None
-        Sakoe-Chiba window radius
-        one of three mutually exclusive ways to specify bounding matrix
-        if ``None``, does not use Sakoe-Chiba window
-        if ``int``, uses Sakoe-Chiba lower bounding window with radius ``window``.
-        If ``window`` is passed, ``itakura_max_slope`` will be ignored.
-    itakura_max_slope: float, between 0. and 1., default = None
-        Itakura parallelogram slope
-        one of three mutually exclusive ways to specify bounding matrix
-        if ``None``, does not use Itakura parallelogram lower bounding
-        if ``float``, uses Itakura parallelogram lower bounding,
-        with slope gradient ``itakura_max_slope``
-    bounding_matrix: optional, 2D np.ndarray, default=None
-        one of three mutually exclusive ways to specify bounding matrix
-        must be of shape ``(len(X), len(X2))``, ``len`` meaning number time points,
-        where ``X``, ``X2`` are the two time series passed in transform
-        Custom bounding matrix to use.
-        If provided, then ``window`` and ``itakura_max_slope`` are ignored.
-        The matrix should be structured so that indexes considered in
-        bound should be the value 0. and indexes outside the bounding matrix should
-        be infinity.
-    g: float, optional, default = 0. Used only if ``weighted=True``.
-        Constant that controls the curvature (slope) of the function;
-        that is, ``g`` controls the level of penalisation for the points
-        with larger phase difference.
-
-    References
-    ----------
-    .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for
-           spoken word recognition," IEEE Transactions on Acoustics, Speech and
-           Signal Processing, vol. 26(1), pp. 43--49, 1978.
-    .. [2] Keogh, Eamonn & Pazzani, Michael. (2002). Derivative Dynamic Time Warping.
-        First SIAM International Conference on Data Mining.
-        1. 10.1137/1.9781611972719.1.
-    .. [3] Young-Seon Jeong, Myong K. Jeong, Olufemi A. Omitaomu, Weighted dynamic time
-    warping for time series classification, Pattern Recognition, Volume 44, Issue 9,
-    2011, Pages 2231-2240, ISSN 0031-3203, https://doi.org/10.1016/j.patcog.2010.09.022.
-
-    Examples
-    --------
-    >>> from sktime.datasets import load_unit_test
-    >>> from sktime.dists_kernels.dtw import DtwDist
-    >>>
-    >>> X, _ = load_unit_test(return_type="pd-multiindex")  # doctest: +SKIP
-    >>> d = DtwDist(weighted=True, derivative=True)  # doctest: +SKIP
-    >>> distmat = d.transform(X)  # doctest: +SKIP
-
-    distances are also callable, this does the same:
-
-    >>> distmat = d(X)  # doctest: +SKIP
+    dist: str, or estimator following sktime BasePairwiseTransformer API
+        distance to use, a distance on real n-space, default = "euclidean"
+        if str, must be name of one of the functions in `scipy.spatial.distance.cdist`
+        if estimator, must follow sktime BasePairwiseTransformer API
+    step_pattern : str, optional, default = "symmetric2",
+        or dtw_python stepPattern object, optional
+        step pattern to use in time warping
+        one of: 'symmetric1', 'symmetric2' (default), 'asymmetric',
+        and dozens of other more non-standard step patterns;
+        list can be displayed by calling help(stepPattern) in dtw
+    window_type: str, optional, default = "none"
+        the chosen windowing function
+        "none", "itakura", "sakoechiba", or "slantedband"
+        "none" (default) - no windowing
+        "sakoechiba" - a band around main diagonal
+        "slantedband" - a band around slanted diagonal
+        "itakura" - Itakura parallelogram
+    open_begin : boolean, optional, default=False
+    open_end: boolean, optional, default=False
+        whether to perform open-ended alignments
+        open_begin = whether alignment open ended at start (low index)
+        open_end = whether alignment open ended at end (high index)
     """
 
     _tags = {
+        "pwtrafo_type": "distance",  # type of pw. transformer, "kernel" or "distance"
         "symmetric": True,  # all the distances are symmetric
-        "X_inner_mtype": "numpy3D",
-        "python_dependencies": "numba",
+        "capability:multivariate": True,  # can estimator handle multivariate data?
+        "capability:unequal_length": True,  # can dist handle unequal length panels?
+        "X_inner_mtype": "df-list",
+        "python_dependencies": "dtw-python",
+        "python_dependencies_alias": {"dtw-python": "dtw"},
     }
 
     def __init__(
         self,
-        weighted: bool = False,
-        derivative: bool = False,
-        window: Union[int, None] = None,
-        itakura_max_slope: Union[float, None] = None,
-        bounding_matrix: np.ndarray = None,
-        g: float = 0.0,
+        dist="euclidean",
+        step_pattern="symmetric2",
+        window_type="none",
+        open_begin=False,
+        open_end=False,
     ):
-        self.weighted = weighted
-        self.derivative = derivative
-        self.window = window
-        self.itakura_max_slope = itakura_max_slope
-        self.bounding_matrix = bounding_matrix
-        self.g = g
-
-        if not weighted and not derivative:
-            metric_key = "dtw"
-        elif not weighted and derivative:
-            metric_key = "ddtw"
-        elif weighted and not derivative:
-            metric_key = "wdtw"
-        elif weighted and derivative:
-            metric_key = "wddtw"
-
-        self.metric_key = metric_key
-
-        kwargs = {
-            "window": window,
-            "itakura_max_slope": itakura_max_slope,
-            "bounding_matrix": bounding_matrix,
-        }
-
-        # g is used only for weighted dtw
-        if weighted:
-            kwargs["g"] = g
-
-        self.kwargs = kwargs
+        self.dist = dist
+        self.step_pattern = step_pattern
+        self.window_type = window_type
+        self.open_begin = open_begin
+        self.open_end = open_end
 
         super().__init__()
 
-    def _transform(self, X, X2=None):
-        """Compute distance/kernel matrix.
-
-            Core logic
-
-        Behaviour: returns pairwise distance/kernel matrix
-            between samples in X and X2
-                if X2 is not passed, is equal to X
-                if X/X2 is a pd.DataFrame and contains non-numeric columns,
-                    these are removed before computation
-
-        Parameters
-        ----------
-        X: 3D np.array of shape [num_instances, num_vars, num_time_points]
-        X2: 3D np.array of shape [num_instances, num_vars, num_time_points], optional
-            default X2 = X
+        params = {
+            "step_pattern": step_pattern,
+            "window_type": window_type,
+            "open_begin": open_begin,
+            "open_end": open_end,
+        }
 
-        Returns
-        -------
-        distmat: np.array of shape [n, m]
-            (i,j)-th entry contains distance/kernel between X[i] and X2[j]
-        """
-        metric_key = self.metric_key
-        kwargs = self.kwargs
+        from sktime.alignment.dtw_python import AlignerDTW, AlignerDTWfromDist
+        from sktime.dists_kernels.compose_from_align import DistFromAligner
 
-        distmat = pairwise_distance(X, X2, metric=metric_key, **kwargs)
+        if isinstance(dist, str):
+            params["dist_method"] = dist
+            delegate = DistFromAligner(AlignerDTW(**params))
+        else:
+            params["dist_trafo"] = dist
+            delegate = DistFromAligner(AlignerDTWfromDist(**params))
 
-        return distmat
+        self.estimator_ = delegate
 
     @classmethod
     def get_test_params(cls, parameter_set="default"):
@@ -217,9 +101,11 @@ def get_test_params(cls, parameter_set="default"):
             `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
             `create_test_instance` uses the first (or only) dictionary in `params`
         """
+        from sktime.dists_kernels import ScipyDist
+
         params0 = {}
-        params1 = {"weighted": True}
-        params2 = {"derivative": True, "window": 0.2}
-        params3 = {"weighted": True, "derivative": True, "g": 0.05}
+        params1 = {"dist": "cityblock"}
+        params2 = {"dist": ScipyDist()}
+        params3 = {"dist": ScipyDist("cityblock"), "step_pattern": "symmetric1"}
 
         return [params0, params1, params2, params3]
diff --git a/sktime/dists_kernels/dtw/_dtw_sktime.py b/sktime/dists_kernels/dtw/_dtw_sktime.py
new file mode 100644
index 00000000000..67d03860286
--- /dev/null
+++ b/sktime/dists_kernels/dtw/_dtw_sktime.py
@@ -0,0 +1,225 @@
+"""BaseEstimator interface to sktime dtw distances in distances module."""
+
+__author__ = ["fkiraly"]
+
+from typing import Union
+
+import numpy as np
+
+from sktime.distances import pairwise_distance
+from sktime.dists_kernels.base import BasePairwiseTransformerPanel
+
+
+class DtwDist(BasePairwiseTransformerPanel):
+    r"""Interface to sktime native dtw distances, with derivative or weighting.
+
+    Interface to simple dynamic time warping (DTW) distance,
+    and the following weighted/derivative versions:
+
+    * WDTW - weighted dynamic time warping - ``weighted=True, derivative=False``
+    * DDTW - derivative dynamic time warping - ``weighted=False, derivative=True``
+    * WDDTW - weighted derivative dynamic time
+      warping - ``weighted=True, derivative=True``
+
+    `sktime` interface to the efficient `numba` implementations
+    provided by ``pairwise_distance`` in `sktime.distances`.
+
+    This estimator provides performant implementation of time warping distances for:
+    * time series of equal length
+    * the Euclidean pairwise distance
+
+    For unequal length time series, use ``sktime.dists_kernels.DistFromAligner``
+    with a time warping aligner such as ``AlignerDTW``; to use arbitrary pairwise
+    distances, use ``AlignerDTWfromDist`` (both in ``sktime.alignment.dtw_python``).
+    (for derivative DTW, pipeline an alignment distance with ``Differencer``)
+
+    Note that the more flexible options above may be less performant.
+
+    The algorithms are also available as alignment estimators
+    ``sktime.alignment.dtw_numba``, producing alignments aka alignment paths.
+
+    DTW was originally proposed in [1]_, DTW computes the distance between two
+    time series by considering their alignments during the calculation.
+    This is done by measuring
+    the pointwise distance (normally using Euclidean) between all elements of the two
+    time series and then using dynamic programming to find the warping path
+    that minimises the total pointwise distance between realigned series.
+
+    DDTW is an adaptation of DTW originally proposed in [2]_. DDTW attempts to
+    improve on dtw by better accounting for the 'shape' of the time series.
+    This is done by considering y axis data points as higher level features of 'shape'.
+    To do this the first derivative of the sequence is taken, and then using this
+    derived sequence a dtw computation is done.
+
+    WDTW was first proposed in [3]_, it adds a multiplicative weight penalty based on
+    the warping distance. This means that time series with lower phase difference have
+    a smaller weight imposed (i.e less penalty imposed) and time series with larger
+    phase difference have a larger weight imposed (i.e. larger penalty imposed).
+
+    WDDTW was first proposed in [3]_ as an extension of DDTW. By adding a weight
+    to the derivative it means the alignment isn't only considering the shape of the
+    time series, but also the phase.
+
+    Parameters
+    ----------
+    weighted : bool, optional, default=False
+        whether a weighted version of the distance is computed
+        False = unmodified distance, i.e., dtw distance or derivative dtw distance
+        True = weighted distance, i.e., weighted dtw or derivative weighted dtw
+    derivative : bool, optional, default=False
+        whether the distance or the derivative distance is computed
+        False = unmodified distance, i.e., dtw distance or weighted dtw distance
+        True = derivative distance, i.e., derivative dtw distance or derivative wdtw
+    window: int, defaults = None
+        Sakoe-Chiba window radius
+        one of three mutually exclusive ways to specify bounding matrix
+        if ``None``, does not use Sakoe-Chiba window
+        if ``int``, uses Sakoe-Chiba lower bounding window with radius ``window``.
+        If ``window`` is passed, ``itakura_max_slope`` will be ignored.
+    itakura_max_slope: float, between 0. and 1., default = None
+        Itakura parallelogram slope
+        one of three mutually exclusive ways to specify bounding matrix
+        if ``None``, does not use Itakura parallelogram lower bounding
+        if ``float``, uses Itakura parallelogram lower bounding,
+        with slope gradient ``itakura_max_slope``
+    bounding_matrix: optional, 2D np.ndarray, default=None
+        one of three mutually exclusive ways to specify bounding matrix
+        must be of shape ``(len(X), len(X2))``, ``len`` meaning number time points,
+        where ``X``, ``X2`` are the two time series passed in transform
+        Custom bounding matrix to use.
+        If provided, then ``window`` and ``itakura_max_slope`` are ignored.
+        The matrix should be structured so that indexes considered in
+        bound should be the value 0. and indexes outside the bounding matrix should
+        be infinity.
+    g: float, optional, default = 0. Used only if ``weighted=True``.
+        Constant that controls the curvature (slope) of the function;
+        that is, ``g`` controls the level of penalisation for the points
+        with larger phase difference.
+
+    References
+    ----------
+    .. [1] H. Sakoe, S. Chiba, "Dynamic programming algorithm optimization for
+           spoken word recognition," IEEE Transactions on Acoustics, Speech and
+           Signal Processing, vol. 26(1), pp. 43--49, 1978.
+    .. [2] Keogh, Eamonn & Pazzani, Michael. (2002). Derivative Dynamic Time Warping.
+        First SIAM International Conference on Data Mining.
+        1. 10.1137/1.9781611972719.1.
+    .. [3] Young-Seon Jeong, Myong K. Jeong, Olufemi A. Omitaomu, Weighted dynamic time
+    warping for time series classification, Pattern Recognition, Volume 44, Issue 9,
+    2011, Pages 2231-2240, ISSN 0031-3203, https://doi.org/10.1016/j.patcog.2010.09.022.
+
+    Examples
+    --------
+    >>> from sktime.datasets import load_unit_test
+    >>> from sktime.dists_kernels.dtw import DtwDist
+    >>>
+    >>> X, _ = load_unit_test(return_type="pd-multiindex")  # doctest: +SKIP
+    >>> d = DtwDist(weighted=True, derivative=True)  # doctest: +SKIP
+    >>> distmat = d.transform(X)  # doctest: +SKIP
+
+    distances are also callable, this does the same:
+
+    >>> distmat = d(X)  # doctest: +SKIP
+    """
+
+    _tags = {
+        "symmetric": True,  # all the distances are symmetric
+        "X_inner_mtype": "numpy3D",
+        "python_dependencies": "numba",
+    }
+
+    def __init__(
+        self,
+        weighted: bool = False,
+        derivative: bool = False,
+        window: Union[int, None] = None,
+        itakura_max_slope: Union[float, None] = None,
+        bounding_matrix: np.ndarray = None,
+        g: float = 0.0,
+    ):
+        self.weighted = weighted
+        self.derivative = derivative
+        self.window = window
+        self.itakura_max_slope = itakura_max_slope
+        self.bounding_matrix = bounding_matrix
+        self.g = g
+
+        if not weighted and not derivative:
+            metric_key = "dtw"
+        elif not weighted and derivative:
+            metric_key = "ddtw"
+        elif weighted and not derivative:
+            metric_key = "wdtw"
+        elif weighted and derivative:
+            metric_key = "wddtw"
+
+        self.metric_key = metric_key
+
+        kwargs = {
+            "window": window,
+            "itakura_max_slope": itakura_max_slope,
+            "bounding_matrix": bounding_matrix,
+        }
+
+        # g is used only for weighted dtw
+        if weighted:
+            kwargs["g"] = g
+
+        self.kwargs = kwargs
+
+        super().__init__()
+
+    def _transform(self, X, X2=None):
+        """Compute distance/kernel matrix.
+
+            Core logic
+
+        Behaviour: returns pairwise distance/kernel matrix
+            between samples in X and X2
+                if X2 is not passed, is equal to X
+                if X/X2 is a pd.DataFrame and contains non-numeric columns,
+                    these are removed before computation
+
+        Parameters
+        ----------
+        X: 3D np.array of shape [num_instances, num_vars, num_time_points]
+        X2: 3D np.array of shape [num_instances, num_vars, num_time_points], optional
+            default X2 = X
+
+        Returns
+        -------
+        distmat: np.array of shape [n, m]
+            (i,j)-th entry contains distance/kernel between X[i] and X2[j]
+        """
+        metric_key = self.metric_key
+        kwargs = self.kwargs
+
+        distmat = pairwise_distance(X, X2, metric=metric_key, **kwargs)
+
+        return distmat
+
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return testing parameter settings for the estimator.
+
+        Parameters
+        ----------
+        parameter_set : str, default="default"
+            Name of the set of test parameters to return, for use in tests. If no
+            special parameters are defined for a value, will return `"default"` set.
+            There are currently no reserved values for distance/kernel transformers.
+
+        Returns
+        -------
+        params : dict or list of dict, default = {}
+            Parameters to create testing instances of the class
+            Each dict are parameters to construct an "interesting" test instance, i.e.,
+            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
+            `create_test_instance` uses the first (or only) dictionary in `params`
+        """
+        params0 = {}
+        params1 = {"weighted": True}
+        params2 = {"derivative": True, "window": 0.2}
+        params3 = {"weighted": True, "derivative": True, "g": 0.05}
+
+        return [params0, params1, params2, params3]
diff --git a/sktime/dists_kernels/lucky.py b/sktime/dists_kernels/lucky.py
new file mode 100644
index 00000000000..00ed51605f9
--- /dev/null
+++ b/sktime/dists_kernels/lucky.py
@@ -0,0 +1,68 @@
+# copyright: sktime developers, BSD-3-Clause License (see LICENSE file)
+"""Lucky dynamic time warping distance."""
+
+from sktime.dists_kernels.base._delegate import _DelegatedPairwiseTransformerPanel
+
+
+class LuckyDtwDist(_DelegatedPairwiseTransformerPanel):
+    """Lucky dynamic time warping distance.
+
+    Implements lucky dynamic time warping distance [1]_.
+    Uses Euclidean distance for multivariate data.
+
+    Based on code by Krisztian A Buza's research group.
+
+    Parameters
+    ----------
+    window: int, optional (default=None)
+        Maximum distance between indices of aligned series, aka warping window.
+        If None, defaults to max(len(ts1), len(ts2)), i.e., no warping window.
+
+    References
+    ----------
+    .. [1] Stephan Spiegel, Brijnesh-Johannes Jain, and Sahin Albayrak.
+        Fast time series classification under lucky time warping distance.
+        Proceedings of the 29th Annual ACM Symposium on Applied Computing. 2014.
+    """
+
+    _tags = {
+        "symmetric": True,  # is the transformer symmetric, i.e., t(x,y)=t(y,x) always?
+        "capability:missing_values": False,  # can estimator handle missing data?
+        "capability:multivariate": True,  # can estimator handle multivariate data?
+        "capability:unequal_length": True,  # can dist handle unequal length panels?
+        "pwtrafo_type": "distance",  # type of pw. transformer, "kernel" or "distance"
+    }
+
+    def __init__(self, window=None):
+        self.window = window
+
+        super().__init__()
+
+        from sktime.alignment.lucky import AlignerLuckyDtw
+        from sktime.dists_kernels.compose_from_align import DistFromAligner
+
+        self.estimator_ = DistFromAligner(AlignerLuckyDtw(window=window))
+
+    @classmethod
+    def get_test_params(cls, parameter_set="default"):
+        """Return testing parameter settings for the estimator.
+
+        Parameters
+        ----------
+        parameter_set : str, default="default"
+            Name of the set of test parameters to return, for use in tests. If no
+            special parameters are defined for a value, will return `"default"` set.
+            There are currently no reserved values for distance/kernel transformers.
+
+        Returns
+        -------
+        params : dict or list of dict, default = {}
+            Parameters to create testing instances of the class
+            Each dict are parameters to construct an "interesting" test instance, i.e.,
+            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
+            `create_test_instance` uses the first (or only) dictionary in `params`
+        """
+        params0 = {}
+        params1 = {"window": 4}
+
+        return [params0, params1]
diff --git a/sktime/forecasting/base/adapters/_generalised_statsforecast.py b/sktime/forecasting/base/adapters/_generalised_statsforecast.py
index 7ac8db8dbf2..3055b7a0449 100644
--- a/sktime/forecasting/base/adapters/_generalised_statsforecast.py
+++ b/sktime/forecasting/base/adapters/_generalised_statsforecast.py
@@ -43,30 +43,77 @@ def __init__(self):
     def _get_statsforecast_class(self):
         raise NotImplementedError("abstract method")
 
-    def _get_statsforecast_params(self):
+    def _get_statsforecast_params(self) -> dict:
         return self.get_params()
 
     def _get_init_statsforecast_params(self):
+        """Return parameters in __init__ statsforecast forecaster.
+
+        Return a list of parameters in the __init__ method from
+        the statsforecast forecaster class used in the sktime adapter.
+        """
         statsforecast_class = self._get_statsforecast_class()
         return list(signature(statsforecast_class.__init__).parameters.keys())
 
-    def _get_validated_statsforecast_params(self):
-        sktime_params = self._get_statsforecast_params()
-        sktime_default_params = self.get_param_defaults().keys()
-        statsforecast_params = self._get_init_statsforecast_params()
-
-        for sktime_param in sktime_params.keys():
-            if sktime_param not in statsforecast_params:
-                sktime_params.pop(sktime_param)
-                if sktime_param not in sktime_default_params:
-                    warn(
-                        f"Keyword argument '{sktime_param}' will be omitted as it is"
-                        f" not found in the __init__ method "
-                        f"from {self._get_statsforecast_class()}. "
-                        f"Check your statsforecast version"
-                        f"to find out the right API parameters."
-                    )
-        return sktime_params
+    def _get_statsforecast_default_params(self) -> dict:
+        """Get default parameters for the statsforecast forecaster.
+
+        This will in general be different from self.get_param_defaults(),
+        as the set or names of inner parameters can differ.
+
+        For parameters without defaults, will use the parameter
+        of self instead.
+        """
+        self_params = self.get_params(deep=False)
+        self_default_params = self.get_param_defaults()
+        self_params.update(self_default_params)
+        cls_with_defaults = type(self)(**self_params)
+        return cls_with_defaults._get_statsforecast_params()
+
+    def _get_validated_statsforecast_params(self) -> dict:
+        """Return parameter dict with only parameters accepted by statsforecast API.
+
+        Checks if the parameters passed to the statsforecast forecaster
+        are valid in the __init__ method of the aforementioned forecaster.
+        If a parameter is not there, it is simply not passed. Furthermore,
+        if such a parameter was set to a non-default value by the sktime user,
+        a warning is raised that the parameter does not exist
+        in the version of statsforecast installed by the user.
+
+        """
+        params_sktime_to_statsforecast: dict = self._get_statsforecast_params()
+        params_sktime_to_statsforecast_default: dict = (
+            self._get_statsforecast_default_params()
+        )
+        statsforecast_init_params = set(self._get_init_statsforecast_params())
+
+        # Keep only the parameters accepted by the statsforecast class __init__
+        filtered_sktime_params = {
+            key: value
+            for key, value in params_sktime_to_statsforecast.items()
+            if key in statsforecast_init_params
+        }
+
+        non_default_params = [
+            p
+            for p in params_sktime_to_statsforecast
+            if params_sktime_to_statsforecast[p]
+            != params_sktime_to_statsforecast_default[p]
+        ]
+        # Find non-default parameters not accepted by the statsforecast __init__
+        param_diff = set(non_default_params) - statsforecast_init_params
+
+        if param_diff:
+            params_str = ", ".join([f'"{param}"' for param in param_diff])
+            warning_message = (
+                f"Keyword arguments {params_str} "
+                f"will be omitted as they are not found in the __init__ method from "
+                f"{self._get_statsforecast_class()}. Check your statsforecast version "
+                f"to find out the right API parameters."
+            )
+            warn(warning_message)
+
+        return filtered_sktime_params
 
     def _instantiate_model(self):
         cls = self._get_statsforecast_class()
@@ -400,10 +447,17 @@ def __init__(self, estimator):
         super().__init__()
 
         self.estimator = estimator
+        self.prediction_intervals = None
 
     def __repr__(self):
         return "StatsForecastBackAdapter"
 
+    def new(self):
+        """Make new instance of back-adapter."""
+        _self = type(self).__new__(type(self))
+        _self.__dict__.update(self.__dict__)
+        return _self
+
     def fit(self, y, X=None):
         """Fit to training data.
 
@@ -442,6 +496,12 @@ def predict(self, h, X=None, level=None):
         mean = self.estimator.predict(fh=range(1, h + 1), X=X)[:, 0]
         if level is None:
             return {"mean": mean}
+        # if a level is passed, and if prediction_intervals has not been instantiated
+        # yet
+        elif self.prediction_intervals is None:
+            from statsforecast.utils import ConformalIntervals
+
+            self.prediction_intervals = ConformalIntervals(h=h)
 
         level = sorted(level)
         coverage = [round(1 - (_l / 100), 2) for _l in level]
diff --git a/sktime/forecasting/compose/_reduce.py b/sktime/forecasting/compose/_reduce.py
index 4ccc3fe9e30..1a3b5fa6795 100644
--- a/sktime/forecasting/compose/_reduce.py
+++ b/sktime/forecasting/compose/_reduce.py
@@ -2453,6 +2453,15 @@ class YfromX(BaseForecaster, _ReducerMixin):
     >>> f.fit(y=y_train, X=X_train, fh=fh)
     YfromX(...)
     >>> y_pred = f.predict(X=X_test)
+
+    YfromX can also be used with skpro probabilistic regressors,
+    in this case the resulting forecaster will be capable of probabilistic forecasts:
+    >>> from skpro.regression.residual import ResidualDouble  # doctest: +SKIP
+    >>> reg_proba = ResidualDouble(LinearRegression())  # doctest: +SKIP
+    >>> f = YfromX(reg_proba)  # doctest: +SKIP
+    >>> f.fit(y=y_train, X=X_train, fh=fh)  # doctest: +SKIP
+    YfromX(...)
+    >>> y_pred = f.predict_interval(X=X_test)  # doctest: +SKIP
     """
 
     _tags = {
@@ -2461,6 +2470,7 @@ class YfromX(BaseForecaster, _ReducerMixin):
         "handles-missing-data": True,
         "X_inner_mtype": ["pd.DataFrame", "pd-multiindex", "pd_multiindex_hier"],
         "y_inner_mtype": ["pd.DataFrame", "pd-multiindex", "pd_multiindex_hier"],
+        "capability:pred_int": True,
     }
 
     def __init__(self, estimator, pooling="local"):
@@ -2468,6 +2478,23 @@ def __init__(self, estimator, pooling="local"):
         self.pooling = pooling
         super().__init__()
 
+        # self._est_type records which type of estimator was passed
+        if hasattr(estimator, "get_tags"):
+            _est_type = estimator.get_tag("object_type", "regressor", False)
+        else:
+            _est_type = "regressor"
+
+        if _est_type not in ["regressor", "regressor_proba"]:
+            raise TypeError(
+                "error in YfromX, estimator must be either an sklearn compatible "
+                "regressor, or an skpro probabilistic regressor."
+            )
+
+        # has probabilistic mode iff the estimator is of type regressor_proba
+        self.set_tags(**{"capability:pred_int": _est_type == "regressor_proba"})
+
+        self._est_type = _est_type
+
         if pooling == "local":
             mtypes = "pd.DataFrame"
         elif pooling == "global":
@@ -2505,17 +2532,27 @@ def _fit(self, y, X, fh):
         -------
         self : reference to self
         """
+        _est_type = self._est_type
+
         if X is None:
             from sklearn.dummy import DummyRegressor
 
+            if _est_type == "regressor":
+                estimator = DummyRegressor()
+            else:  # "regressor_proba"
+                from skpro.regression.residual import ResidualDouble
+
+                dummy = DummyRegressor()
+                estimator = ResidualDouble(dummy)
+
             X = _coerce_col_str(y)
-            estimator = DummyRegressor()
         else:
             X = _coerce_col_str(X)
             estimator = clone(self.estimator)
 
-        y = _coerce_col_str(y)
-        y = y.values.flatten()
+        if _est_type == "regressor":
+            y = _coerce_col_str(y)
+            y = y.values.flatten()
 
         estimator.fit(X, y)
         self.estimator_ = estimator
@@ -2541,7 +2578,173 @@ def _predict(self, X=None, fh=None):
         y_pred : pd.DataFrame, same type as y in _fit
             Point predictions
         """
+        _est_type = self._est_type
+
+        fh_idx = self._get_expected_pred_idx(fh=fh)
+
+        X_idx = self._get_pred_X(X=X, fh_idx=fh_idx)
+        y_pred = self.estimator_.predict(X_idx)
+
+        if _est_type == "regressor":
+            y_cols = self._y.columns
+            y_pred = pd.DataFrame(y_pred, index=fh_idx, columns=y_cols)
+
+        return y_pred
+
+    def _predict_quantiles(self, fh, X, alpha):
+        """Compute/return prediction quantiles for a forecast.
+
+        private _predict_quantiles containing the core logic,
+            called from predict_quantiles and possibly predict_interval
+
+        State required:
+            Requires state to be "fitted".
+
+        Accesses in self:
+            Fitted model attributes ending in "_"
+            self.cutoff
+
+        Parameters
+        ----------
+        fh : guaranteed to be ForecastingHorizon
+            The forecasting horizon with the steps ahead to predict.
+        X :  sktime time series object, optional (default=None)
+            guaranteed to be of an mtype in self.get_tag("X_inner_mtype")
+            Exogeneous time series for the forecast
+        alpha : list of float (guaranteed not None and floats in [0,1] interval)
+            A list of probabilities at which quantile forecasts are computed.
+
+        Returns
+        -------
+        quantiles : pd.DataFrame
+            Column has multi-index: first level is variable name from y in fit,
+                second level being the values of alpha passed to the function.
+            Row index is fh, with additional (upper) levels equal to instance levels,
+                    from y seen in fit, if y_inner_mtype is Panel or Hierarchical.
+            Entries are quantile forecasts, for var in col index,
+                at quantile probability in second col index, for the row index.
+        """
+        fh_idx = self._get_expected_pred_idx(fh=fh)
+        X_idx = self._get_pred_X(X=X, fh_idx=fh_idx)
+        y_pred = self.estimator_.predict_quantiles(X_idx, alpha=alpha)
+        return y_pred
+
+    def _predict_interval(self, fh, X, coverage):
+        """Compute/return prediction intervals for a forecast.
+
+        private _predict_interval containing the core logic,
+            called from predict_interval and possibly predict_quantiles
+
+        State required:
+            Requires state to be "fitted".
+
+        Accesses in self:
+            Fitted model attributes ending in "_"
+            self.cutoff
+
+        Parameters
+        ----------
+        fh : guaranteed to be ForecastingHorizon
+            The forecasting horizon with the steps ahead to predict.
+        X :  sktime time series object, optional (default=None)
+            guaranteed to be of an mtype in self.get_tag("X_inner_mtype")
+            Exogeneous time series for the forecast
+        coverage : list of float (guaranteed not None and floats in [0,1] interval)
+           nominal coverage(s) of predictive interval(s)
+
+        Returns
+        -------
+        pred_int : pd.DataFrame
+            Column has multi-index: first level is variable name from y in fit,
+                second level coverage fractions for which intervals were computed.
+                    in the same order as in input `coverage`.
+                Third level is string "lower" or "upper", for lower/upper interval end.
+            Row index is fh, with additional (upper) levels equal to instance levels,
+                from y seen in fit, if y_inner_mtype is Panel or Hierarchical.
+            Entries are forecasts of lower/upper interval end,
+                for var in col index, at nominal coverage in second col index,
+                lower/upper depending on third col index, for the row index.
+                Upper/lower interval end forecasts are equivalent to
+                quantile forecasts at alpha = 0.5 - c/2, 0.5 + c/2 for c in coverage.
+        """
+        fh_idx = self._get_expected_pred_idx(fh=fh)
+        X_idx = self._get_pred_X(X=X, fh_idx=fh_idx)
+        y_pred = self.estimator_.predict_interval(X_idx, coverage=coverage)
+        return y_pred
+
+    def _predict_var(self, fh, X=None, cov=False):
+        """Forecast variance at future horizon.
+
+        private _predict_var containing the core logic, called from predict_var
+
+        Parameters
+        ----------
+        fh : guaranteed to be ForecastingHorizon or None, optional (default=None)
+            The forecasting horizon with the steps ahead to predict.
+            If not passed in _fit, guaranteed to be passed here
+        X :  sktime time series object, optional (default=None)
+            guaranteed to be of an mtype in self.get_tag("X_inner_mtype")
+            Exogeneous time series for the forecast
+        cov : bool, optional (default=False)
+            if True, computes covariance matrix forecast.
+            if False, computes marginal variance forecasts.
+
+        Returns
+        -------
+        pred_var : pd.DataFrame, format dependent on `cov` variable
+            If cov=False:
+                Column names are exactly those of `y` passed in `fit`/`update`.
+                    For nameless formats, column index will be a RangeIndex.
+                Row index is fh, with additional levels equal to instance levels,
+                    from y seen in fit, if y_inner_mtype is Panel or Hierarchical.
+                Entries are variance forecasts, for var in col index.
+                A variance forecast for given variable and fh index is a predicted
+                    variance for that variable and index, given observed data.
+            If cov=True:
+                Column index is a multiindex: 1st level is variable names (as above)
+                    2nd level is fh.
+                Row index is fh, with additional levels equal to instance levels,
+                    from y seen in fit, if y_inner_mtype is Panel or Hierarchical.
+                Entries are (co-)variance forecasts, for var in col index, and
+                    covariance between time index in row and col.
+                Note: no covariance forecasts are returned between different variables.
+        """
+        fh_idx = self._get_expected_pred_idx(fh=fh)
+        X_idx = self._get_pred_X(X=X, fh_idx=fh_idx)
+        y_pred = self.estimator_.predict_var(X_idx)
+        return y_pred
+
+    def _predict_proba(self, fh, X, marginal=True):
+        """Compute/return fully probabilistic forecasts.
+
+        private _predict_proba containing the core logic, called from predict_proba
+
+        Parameters
+        ----------
+        fh : int, list, np.array or ForecastingHorizon (not optional)
+            The forecasting horizon encoding the time stamps to forecast at.
+            if has not been passed in fit, must be passed, not optional
+        X : sktime time series object, optional (default=None)
+                Exogeneous time series for the forecast
+            Should be of same scitype (Series, Panel, or Hierarchical) as y in fit
+            if self.get_tag("X-y-must-have-same-index"),
+                X.index must contain fh.index and y.index both
+        marginal : bool, optional (default=True)
+            whether returned distribution is marginal by time index
+
+        Returns
+        -------
+        pred_dist : sktime BaseDistribution
+            predictive distribution
+            if marginal=True, will be marginal distribution by time point
+            if marginal=False and implemented by method, will be joint
+        """
         fh_idx = self._get_expected_pred_idx(fh=fh)
+        X_idx = self._get_pred_X(X=X, fh_idx=fh_idx)
+        y_pred = self.estimator_.predict_proba(X_idx)
+        return y_pred
+
+    def _get_pred_X(self, X, fh_idx):
         y_cols = self._y.columns
 
         if X is not None and self._X is not None:
@@ -2556,11 +2759,7 @@ def _predict(self, X=None, fh=None):
         X_pool = _coerce_col_str(X_pool)
 
         X_idx = X_pool.loc[fh_idx]
-
-        y_pred = self.estimator_.predict(X_idx)
-        y_pred = pd.DataFrame(y_pred, index=fh_idx, columns=y_cols)
-
-        return y_pred
+        return X_idx
 
     @classmethod
     def get_test_params(cls, parameter_set="default"):
@@ -2583,6 +2782,8 @@ def get_test_params(cls, parameter_set="default"):
         from sklearn.ensemble import RandomForestRegressor
         from sklearn.linear_model import LinearRegression
 
+        from sktime.utils.validation._dependencies import _check_soft_dependencies
+
         params1 = {
             "estimator": LinearRegression(),
             "pooling": "local",
@@ -2593,4 +2794,15 @@ def get_test_params(cls, parameter_set="default"):
             "pooling": "global",  # all internal mtypes are tested across scenarios
         }
 
-        return [params1, params2]
+        params = [params1, params2]
+
+        if _check_soft_dependencies("skpro", severity="none"):
+            from skpro.regression.residual import ResidualDouble
+
+            params3 = {
+                "estimator": ResidualDouble.create_test_instance(),
+                "pooling": "global",
+            }
+            params = params + [params3]
+
+        return params
diff --git a/sktime/forecasting/statsforecast.py b/sktime/forecasting/statsforecast.py
index e1a639f14dc..21493ab634d 100644
--- a/sktime/forecasting/statsforecast.py
+++ b/sktime/forecasting/statsforecast.py
@@ -595,6 +595,12 @@ class StatsForecastMSTL(_GeneralisedStatsForecastAdapter):
     trend_forecaster : estimator, optional, default=StatsForecastAutoETS()
         Sktime estimator used to make univariate forecasts. Multivariate estimators are
         not supported.
+    stl_kwargs : dict, optional
+        Extra arguments to pass to [`statsmodels.tsa.seasonal.STL`]
+        (https://www.statsmodels.org/dev/generated/statsmodels.tsa.seasonal.STL.html#statsmodels.tsa.seasonal.STL).
+        The `period` and `seasonal` arguments are reserved.
+    pred_int_kwargs : dict, optional
+        Extra arguments to pass to [`statsforecast.utils.ConformalIntervals`].
 
     References
     ----------
@@ -623,6 +629,8 @@ def __init__(
         self,
         season_length: Union[int, List[int]],
         trend_forecaster=None,
+        stl_kwargs: Optional[Dict] = None,
+        pred_int_kwargs: Optional[Dict] = None,
     ):
         super().__init__()
 
@@ -634,6 +642,8 @@ def __init__(
             self._trend_forecaster = clone(trend_forecaster)
         else:
             self._trend_forecaster = StatsForecastAutoETS(model="ZZN")
+        self.stl_kwargs = stl_kwargs
+        self.pred_int_kwargs = pred_int_kwargs
 
         # checks if trend_forecaster is already wrapped with
         # StatsForecastBackAdapter
@@ -650,6 +660,14 @@ def __init__(
                     "forecaster."
                 )
 
+        # check if prediction interval kwargs are passed
+        if self.pred_int_kwargs:
+            from statsforecast.utils import ConformalIntervals
+
+            self._trend_forecaster.prediction_intervals = ConformalIntervals(
+                **self.pred_int_kwargs
+            )
+
     def _get_statsforecast_class(self):
         from statsforecast.models import MSTL
 
@@ -695,6 +713,12 @@ def get_test_params(cls, parameter_set="default"):
                 {
                     "season_length": 4,
                 },
+                {
+                    "season_length": 4,
+                    "pred_int_kwargs": {
+                        "n_windows": 2,
+                    },
+                },
             ]
         except ModuleNotFoundError:
             from sktime.forecasting.naive import NaiveForecaster
diff --git a/sktime/forecasting/tests/test_all_forecasters.py b/sktime/forecasting/tests/test_all_forecasters.py
index 8916dbd956f..8141ce25675 100644
--- a/sktime/forecasting/tests/test_all_forecasters.py
+++ b/sktime/forecasting/tests/test_all_forecasters.py
@@ -515,9 +515,10 @@ def test_predict_quantiles(self, estimator_instance, n_columns, fh_int_oos, alph
                 estimator_instance.predict_quantiles(fh=fh_int_oos, alpha=alpha)
 
     def _check_predict_proba(self, pred_dist, y_train, fh_int):
-        from sktime.proba.base import BaseDistribution
+        assert hasattr(pred_dist, "get_tag")
+        obj_type = pred_dist.get_tag("object_type", None, False)
+        assert obj_type == "distribution"
 
-        assert isinstance(pred_dist, BaseDistribution)
         pred_cols = pred_dist.columns
         pred_index = pred_dist.index
 
diff --git a/sktime/tests/test_all_estimators.py b/sktime/tests/test_all_estimators.py
index 82f20ef0a32..166a7275e37 100644
--- a/sktime/tests/test_all_estimators.py
+++ b/sktime/tests/test_all_estimators.py
@@ -38,6 +38,7 @@
     VALID_ESTIMATOR_TYPES,
     VALID_TRANSFORMER_TYPES,
 )
+from sktime.tests.test_switch import run_test_for_class
 from sktime.utils._testing._conditional_fixtures import (
     create_conditional_fixtures_and_names,
 )
@@ -50,12 +51,10 @@
     _list_required_methods,
 )
 from sktime.utils._testing.scenarios_getter import retrieve_scenarios
-from sktime.utils.git_diff import is_class_changed
 from sktime.utils.random_state import set_random_state
 from sktime.utils.sampling import random_partition
 from sktime.utils.validation._dependencies import (
     _check_dl_dependencies,
-    _check_estimator_deps,
     _check_soft_dependencies,
 )
 
@@ -220,10 +219,11 @@ def _all_estimators(self):
         if MATRIXDESIGN:
             est_list = subsample_by_version_os(est_list)
 
-        # this setting ensures that only estimators are tested that have changed
-        # in the sense that any line in the module is different from main
-        if ONLY_CHANGED_MODULES:
-            est_list = [est for est in est_list if is_class_changed(est)]
+        # run_test_for_class selects the estimators to run
+        # based on whether they have changed, and whether they have all dependencies
+        # internally, uses the ONLY_CHANGED_MODULES flag,
+        # and checks the python env against python_dependencies tag
+        est_list = [est for est in est_list if run_test_for_class(est)]
 
         return est_list
 
@@ -274,13 +274,6 @@ def _generate_estimator_class(self, test_name, **kwargs):
             if not self.is_excluded(test_name, est)
         ]
 
-        # exclude classes based on python version compatibility
-        estimator_classes_to_test = [
-            est
-            for est in estimator_classes_to_test
-            if _check_estimator_deps(est, severity="none")
-        ]
-
         estimator_names = [est.__name__ for est in estimator_classes_to_test]
 
         return estimator_classes_to_test, estimator_names
diff --git a/sktime/tests/test_switch.py b/sktime/tests/test_switch.py
index 84f9062bd12..1daf04cd692 100644
--- a/sktime/tests/test_switch.py
+++ b/sktime/tests/test_switch.py
@@ -3,6 +3,8 @@
 
 __author__ = ["fkiraly"]
 
+from inspect import getmro, isclass
+
 
 def run_test_for_class(cls):
     """Check if test should run for a class or function.
@@ -41,11 +43,25 @@ class for which to determine whether it should be tested
     from sktime.utils.validation._dependencies import _check_estimator_deps
 
     def _required_deps_present(obj):
+        """Check if all required soft dependencies are present, return bool."""
         if hasattr(obj, "get_class_tag"):
             return _check_estimator_deps(obj, severity="none")
         else:
             return True
 
+    def _is_class_changed_or_sktime_parents(cls):
+        """Check if class or any of its sktime parents have changed, return bool."""
+        # if cls is a function, not a class, default to is_class_changed
+        if not isclass(cls):
+            return is_class_changed(cls)
+
+        # now we know cls is a class, so has an mro
+        cls_and_parents = getmro(cls)
+        cls_and_sktime_parents = [
+            x for x in cls_and_parents if x.__module__.startswith("sktime")
+        ]
+        return any(is_class_changed(x) for x in cls_and_sktime_parents)
+
     # if any of the required soft dependencies are not present, do not run the test
     if not all(_required_deps_present(x) for x in cls):
         return False
@@ -53,7 +69,7 @@ def _required_deps_present(obj):
     # if ONLY_CHANGED_MODULES is on, run the test if and only if
     # any of the modules containing any of the classes in the list have changed
     if ONLY_CHANGED_MODULES:
-        return any(is_class_changed(x) for x in cls)
+        return any(_is_class_changed_or_sktime_parents(x) for x in cls)
 
     # otherwise
     # i.e., dependencies are present, and differential testing is disabled