/
utils.py
190 lines (148 loc) · 5.12 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
# -*- coding: utf-8 -*-
#
# Author: Taylor Smith <taylor.smith@alkaline-ml.com>
#
# Common ARIMA functions
from __future__ import absolute_import
from sklearn.utils.validation import column_or_1d
import numpy as np
from ..utils import get_callable
from ..utils.array import diff, check_endog
from ..compat.numpy import DTYPE
from .stationarity import KPSSTest, ADFTest, PPTest
from .seasonality import CHTest, OCSBTest
# Names exported by a star-import of this module.
__all__ = ['is_constant', 'ndiffs', 'nsdiffs']

# Maps the ``test`` string accepted by ``ndiffs`` to the stationarity
# (unit root) test class it selects.
VALID_TESTS = dict(kpss=KPSSTest, adf=ADFTest, pp=PPTest)

# Maps the ``test`` string accepted by ``nsdiffs`` to the seasonality
# test class it selects.
VALID_STESTS = dict(ocsb=OCSBTest, ch=CHTest)
def is_constant(x):
    """Check whether every element of ``x`` is the same value.

    The input is coerced with ``column_or_1d`` and each element is
    compared against the first one, so ``x`` must contain at least one
    element.

    Parameters
    ----------
    x : array-like, shape=(n_samples,)
        The time series vector.

    Returns
    -------
    constant : bool
        True if all elements compare equal to the first element,
        False otherwise.

    Examples
    --------
    >>> import numpy as np
    >>> x = np.array([1, 2, 3])
    >>> y = np.ones(3)
    >>> [is_constant(x), is_constant(y)]
    [False, True]
    """
    values = column_or_1d(x)  # type: np.ndarray
    # The first element is the reference every other element must match.
    reference = values[0]
    matches = values == reference
    return matches.all()
def nsdiffs(x, m, max_D=2, test='ocsb', **kwargs):
    """Estimate the seasonal differencing term, ``D``.

    Repeatedly applies a test of seasonality, seasonally differencing the
    series (with lag ``m``) each time the test asks for it, until the test
    no longer requests a difference, the differenced series becomes
    constant, or ``max_D`` differences have been taken.

    Parameters
    ----------
    x : array-like, shape=(n_samples, [n_features])
        The array to difference.

    m : int
        The number of seasonal periods (i.e., frequency of the
        time series).

    max_D : int, optional (default=2)
        Maximum number of seasonal differences allowed. Must
        be a positive integer. The estimated value of ``D`` will not
        exceed ``max_D``.

    test : str, optional (default='ocsb')
        Type of unit root test of seasonality to use in order
        to detect seasonal periodicity. Valid tests include ("ocsb", "ch").
        Note that the CHTest is very slow for large data.

    **kwargs : keyword args, optional
        Passed, together with ``m``, to the callable selected by ``test``.

    Returns
    -------
    D : int
        The estimated seasonal differencing term. This is the maximum value
        of ``D`` such that ``D <= max_D`` and the time series is judged
        seasonally stationary. If the time series is constant, will return 0.

    Raises
    ------
    ValueError
        If ``max_D`` is not greater than zero.
    """
    if max_D <= 0:
        raise ValueError('max_D must be a positive integer')

    # Build the seasonality test first - this validates m internally
    seasonal_test = get_callable(test, VALID_STESTS)(m, **kwargs)
    estimate_term = seasonal_test.estimate_seasonal_differencing_term

    series = check_endog(x, dtype=DTYPE, copy=False)
    n_diffs = 0

    # A constant series needs no differencing at all
    if is_constant(series):
        return n_diffs

    # Keep differencing while the test asks for one more seasonal
    # difference (signalled by 1) and the cap has not been reached.
    while estimate_term(series) == 1 and n_diffs < max_D:
        n_diffs += 1
        series = diff(series, lag=m)

        # Once the differenced series is constant there is nothing
        # left to test, so stop here.
        if is_constant(series):
            return n_diffs

    return n_diffs
def ndiffs(x, alpha=0.05, test='kpss', max_d=2, **kwargs):
    """Estimate ARIMA differencing term, ``d``.

    Repeatedly applies a test of stationarity, differencing the series
    each time the test says it should be differenced, until the test no
    longer requests a difference, the differenced series becomes constant,
    the test yields a NaN p-value, or ``max_d`` differences have been
    taken.

    Parameters
    ----------
    x : array-like, shape=(n_samples, [n_features])
        The array (time series) to difference.

    alpha : float, optional (default=0.05)
        Level of the test. This is the value below which the P-value
        will be deemed significant.

    test : str, optional (default='kpss')
        Type of unit root test of stationarity to use in order to
        test the stationarity of the time-series. One of
        ('kpss', 'adf', 'pp')

    max_d : int, optional (default=2)
        Maximum number of non-seasonal differences allowed. Must
        be a positive integer. The estimated value of ``d`` will not
        exceed ``max_d``.

    **kwargs : keyword args, optional
        Passed, together with ``alpha``, to the callable selected by
        ``test``.

    Returns
    -------
    d : int
        The estimated differencing term. This is the maximum value of ``d``
        such that ``d <= max_d`` and the time series is judged stationary.
        If the time series is constant, will return 0. If the test on the
        undifferenced series produces a NaN p-value, 0 is returned; if a
        later test produces NaN, the last differencing level with a
        non-NaN p-value is returned.

    Raises
    ------
    ValueError
        If ``max_d`` is not greater than zero.

    References
    ----------
    .. [1] R's auto_arima ndiffs function: https://bit.ly/2Bu8CHN
    """
    if max_d <= 0:
        raise ValueError('max_d must be a positive integer')

    # Resolve the requested test and keep a handle on its decision method
    stationarity_test = get_callable(test, VALID_TESTS)(alpha, **kwargs)
    should_diff = stationarity_test.should_diff

    series = check_endog(x, dtype=DTYPE, copy=False)

    # A constant series never needs differencing
    if is_constant(series):
        return 0

    # Test the undifferenced series; a NaN p-value here means there is
    # no usable result, so report zero differences.
    p_value, needs_diff = should_diff(series)
    if np.isnan(p_value):
        return 0

    n_diffs = 0
    while needs_diff and n_diffs < max_d:
        series = diff(series)
        n_diffs += 1

        # Nothing left to test once the differenced series is constant
        if is_constant(series):
            return n_diffs

        p_value, needs_diff = should_diff(series)

        # A NaN p-value at this level invalidates it; fall back to the
        # previous level, whose p-value was not NaN.
        if np.isnan(p_value):
            return n_diffs - 1

    # Either the test no longer asks for a difference or max_d was reached
    return n_diffs