Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Empty file.
430 changes: 430 additions & 0 deletions tsml_eval/_wip/unequal_length/_arsenal.py

Large diffs are not rendered by default.

295 changes: 295 additions & 0 deletions tsml_eval/_wip/unequal_length/_drcif.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,295 @@
"""DrCIF classifier.

Interval-based DrCIF classifier extracting catch22 features from random intervals on
periodogram and differences representations as well as the base series.
"""

__maintainer__ = []
__all__ = ["DrCIFClassifier"]

import numpy as np
from sklearn.preprocessing import FunctionTransformer

from aeon.classification.base import BaseClassifier
from aeon.classification.sklearn._continuous_interval_tree import ContinuousIntervalTree
from aeon.transformations.collection import PeriodogramTransformer
from aeon.transformations.collection.feature_based import Catch22
from aeon.utils.numba.general import first_order_differences_3d
from aeon.utils.numba.stats import (
row_iqr,
row_mean,
row_median,
row_numba_max,
row_numba_min,
row_slope,
row_std,
)


# NOTE(review): ``BaseIntervalForest`` is not imported in this file's visible
# import block — presumably it comes from a sibling module in this _wip
# package (or aeon's interval-forest base); confirm the import exists.
class DrCIFClassifier(BaseIntervalForest, BaseClassifier):
    """
    Diverse Representation Canonical Interval Forest Classifier (DrCIF).

    Extension of the CIF algorithm using multiple representations. Implementation of the
    interval-based forest making use of the catch22 feature set on randomly selected
    intervals on the base series, periodogram representation and differences
    representation described in the HIVE-COTE 2.0 paper Middlehurst et al (2021). [1]_

    Overview: Input "n" series with "d" dimensions of length "m".
    For each tree
        - Sample n_intervals intervals per representation of random position and length
        - Subsample att_subsample_size catch22 or summary statistic attributes randomly
        - Randomly select dimension for each interval
        - Calculate attributes for each interval from its representation, concatenate
          to form new data set
        - Build a decision tree on new data set
    Ensemble the trees with averaged probability estimates

    Parameters
    ----------
    base_estimator : BaseEstimator or None, default=None
        scikit-learn BaseEstimator used to build the interval ensemble. If None, use a
        simple decision tree.
    n_estimators : int, default=200
        Number of estimators to build for the ensemble.
    n_intervals : int, str, list or tuple, default=(4, "sqrt-div")
        Number of intervals to extract per tree for each series_transformers series.

        An int input will extract that number of intervals from the series, while a str
        input will return a function of the series length (may differ per
        series_transformers output) to extract that number of intervals.
        Valid str inputs are:
            - "sqrt": square root of the series length.
            - "sqrt-div": sqrt of series length divided by the number
                of series_transformers.

        A list or tuple of ints and/or strs will extract the number of intervals using
        the above rules and sum the results for the final n_intervals. i.e. [4, "sqrt"]
        will extract sqrt(n_timepoints) + 4 intervals.

        Different number of intervals for each series_transformers series can be
        specified using a nested list or tuple. Any list or tuple input containing
        another list or tuple must be the same length as the number of
        series_transformers.

        While random interval extraction will extract the n_intervals intervals total
        (removing duplicates), supervised intervals will run the supervised extraction
        process n_intervals times, returning more intervals than specified.
    min_interval_length : int, float, list, or tuple, default=3
        Minimum length of intervals to extract from series. float inputs take a
        proportion of the series length to use as the minimum interval length.

        Different minimum interval lengths for each series_transformers series can be
        specified using a list or tuple. Any list or tuple input must be the same length
        as the number of series_transformers.
    max_interval_length : int, float, list, or tuple, default=0.5
        Maximum length of intervals to extract from series. float inputs take a
        proportion of the series length to use as the maximum interval length.

        Different maximum interval lengths for each series_transformers series can be
        specified using a list or tuple. Any list or tuple input must be the same length
        as the number of series_transformers.

        Ignored for supervised interval_selection_method inputs.
    att_subsample_size : int, float, list, tuple or None, default=10
        The number of attributes to subsample for each estimator. If None, use all.

        If int, use that number of attributes for all estimators. If float, use that
        proportion of attributes for all estimators.

        Different subsample sizes for each series_transformers series can be specified
        using a list or tuple. Any list or tuple input must be the same length as the
        number of series_transformers.
    time_limit_in_minutes : int or None, default=None
        Time contract to limit build time in minutes, overriding n_estimators.
        Default of None means n_estimators are used.
    contract_max_n_estimators : int, default=500
        Max number of estimators when time_limit_in_minutes is set.
    use_pycatch22 : bool, optional, default=False
        Wraps the C based pycatch22 implementation for aeon.
        (https://github.com/DynamicsAndNeuralSystems/pycatch22). This requires the
        ``pycatch22`` package to be installed if True.
    random_state : int, RandomState instance or None, default=None
        If `int`, random_state is the seed used by the random number generator;
        If `RandomState` instance, random_state is the random number generator;
        If `None`, the random number generator is the `RandomState` instance used
        by `np.random`.
    n_jobs : int, default=1
        The number of jobs to run in parallel for both `fit` and `predict`.
        ``-1`` means using all processors.
    parallel_backend : str, ParallelBackendBase instance or None, default=None
        Specify the parallelisation backend implementation in joblib, if None a 'prefer'
        value of "threads" is used by default.
        Valid options are "loky", "multiprocessing", "threading" or a custom backend.
        See the joblib Parallel documentation for more details.

    Attributes
    ----------
    n_cases_ : int
        The number of train cases in the training set.
    n_channels_ : int
        The number of dimensions per case in the training set.
    n_timepoints_ : int
        The length of each series in the training set.
    n_classes_ : int
        Number of classes. Extracted from the data.
    classes_ : ndarray of shape (n_classes_)
        Holds the label for each class.
    total_intervals_ : int
        Total number of intervals per tree from all representations.
    estimators_ : list of shape (n_estimators) of BaseEstimator
        The collections of estimators trained in fit.
    intervals_ : list of shape (n_estimators) of TransformerMixin
        Stores the interval extraction transformer for all estimators.

    See Also
    --------
    DrCIFRegressor
    CanonicalIntervalForestClassifier

    Notes
    -----
    For the Java version, see
    `TSML <https://github.com/uea-machine-learning/tsml/blob/master/src/main/java
    /tsml/classifiers/interval_based/DrCIF.java>`_.

    References
    ----------
    .. [1] Middlehurst, Matthew, James Large, Michael Flynn, Jason Lines, Aaron Bostrom,
       and Anthony Bagnall. "HIVE-COTE 2.0: a new meta ensemble for time series
       classification." arXiv preprint arXiv:2104.07551 (2021).

    Examples
    --------
    >>> from aeon.classification.interval_based import DrCIFClassifier
    >>> from aeon.testing.data_generation import make_example_3d_numpy
    >>> X, y = make_example_3d_numpy(n_cases=10, n_channels=1, n_timepoints=12,
    ...                              return_y=True, random_state=0)
    >>> clf = DrCIFClassifier(n_estimators=10, random_state=0)
    >>> clf.fit(X, y)
    DrCIFClassifier(n_estimators=10, random_state=0)
    >>> clf.predict(X)
    array([0, 1, 0, 1, 0, 0, 1, 1, 1, 0])
    """

    _tags = {
        "capability:multivariate": True,
        "capability:train_estimate": True,
        "capability:contractable": True,
        "capability:multithreading": True,
        "algorithm_type": "interval",
    }

    def __init__(
        self,
        base_estimator=None,
        n_estimators=200,
        n_intervals=(4, "sqrt-div"),
        min_interval_length=3,
        max_interval_length=0.5,
        att_subsample_size=10,
        time_limit_in_minutes=None,
        contract_max_n_estimators=500,
        use_pycatch22=False,
        random_state=None,
        n_jobs=1,
        parallel_backend=None,
    ):
        self.use_pycatch22 = use_pycatch22

        # ContinuousIntervalTree handles NaN attribute values natively, so keep
        # them (flag "nan"); any other estimator gets NaNs replaced with 0.
        if isinstance(base_estimator, ContinuousIntervalTree):
            replace_nan = "nan"
        else:
            replace_nan = 0

        # The three DrCIF representations: the base series (None = identity),
        # first-order differences, and the periodogram of each series.
        series_transformers = [
            None,
            FunctionTransformer(func=first_order_differences_3d, validate=False),
            PeriodogramTransformer(),
        ]

        # Candidate per-interval attributes to subsample from: the catch22
        # feature set plus seven simple summary statistics.
        interval_features = [
            Catch22(outlier_norm=True, use_pycatch22=use_pycatch22),
            row_mean,
            row_std,
            row_slope,
            row_median,
            row_iqr,
            row_numba_min,
            row_numba_max,
        ]

        # Configure the shared interval-forest machinery; interval selection is
        # always random for DrCIF.
        super().__init__(
            base_estimator=base_estimator,
            n_estimators=n_estimators,
            interval_selection_method="random",
            n_intervals=n_intervals,
            min_interval_length=min_interval_length,
            max_interval_length=max_interval_length,
            interval_features=interval_features,
            series_transformers=series_transformers,
            att_subsample_size=att_subsample_size,
            replace_nan=replace_nan,
            time_limit_in_minutes=time_limit_in_minutes,
            contract_max_n_estimators=contract_max_n_estimators,
            random_state=random_state,
            n_jobs=n_jobs,
            parallel_backend=parallel_backend,
        )

        # Record the soft dependency only when the C implementation is requested.
        if use_pycatch22:
            self.set_tags(**{"python_dependencies": "pycatch22"})

    # Thin wrappers delegating to BaseIntervalForest so its implementations are
    # used for BaseClassifier's template methods (resolves the multiple
    # inheritance explicitly).
    def _fit(self, X, y):
        return super()._fit(X, y)

    def _predict(self, X) -> np.ndarray:
        return super()._predict(X)

    def _predict_proba(self, X) -> np.ndarray:
        return super()._predict_proba(X)

    def _fit_predict(self, X, y) -> np.ndarray:
        return super()._fit_predict(X, y)

    def _fit_predict_proba(self, X, y) -> np.ndarray:
        return super()._fit_predict_proba(X, y)

    @classmethod
    def _get_test_params(cls, parameter_set="default"):
        """Return testing parameter settings for the estimator.

        Parameters
        ----------
        parameter_set : str, default="default"
            Name of the set of test parameters to return, for use in tests. If no
            special parameters are defined for a value, will return `"default"` set.
            DrCIFClassifier provides the following special sets:
                "results_comparison" - used in some classifiers to compare against
                    previously generated results where the default set of parameters
                    cannot produce suitable probability estimates
                "contracting" - used in classifiers that set the
                    "capability:contractable" tag to True to test contracting
                    functionality
                "train_estimate" - used in some classifiers that set the
                    "capability:train_estimate" tag to True to allow for more efficient
                    testing when relevant parameters are available

        Returns
        -------
        params : dict or list of dict, default={}
            Parameters to create testing instances of the class.
            Each dict are parameters to construct an "interesting" test instance, i.e.,
            `MyClass(**params)` or `MyClass(**params[i])` creates a valid test instance.
        """
        if parameter_set == "results_comparison":
            return {"n_estimators": 10, "n_intervals": 2, "att_subsample_size": 4}
        elif parameter_set == "contracting":
            return {
                "time_limit_in_minutes": 5,
                "contract_max_n_estimators": 2,
                "n_intervals": 2,
                "att_subsample_size": 2,
            }
        else:
            return {"n_estimators": 2, "n_intervals": 2, "att_subsample_size": 2}
Loading
Loading