/
sequence.py
162 lines (142 loc) · 6.62 KB
/
sequence.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# Copyright 2015 The TensorFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Utilities for preprocessing sequence data.
"""
# pylint: disable=invalid-name
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from keras_preprocessing import sequence
from tensorflow.python.keras.utils import data_utils
from tensorflow.python.util.tf_export import keras_export
make_sampling_table = sequence.make_sampling_table
skipgrams = sequence.skipgrams
# TODO(fchollet): consider making `_remove_long_seq` public.
_remove_long_seq = sequence._remove_long_seq # pylint: disable=protected-access
@keras_export('keras.preprocessing.sequence.TimeseriesGenerator')
class TimeseriesGenerator(sequence.TimeseriesGenerator, data_utils.Sequence):
"""Utility class for generating batches of temporal data.
This class takes in a sequence of data-points gathered at
equal intervals, along with time series parameters such as
stride, length of history, etc., to produce batches for
training/validation.
# Arguments
data: Indexable generator (such as list or Numpy array)
containing consecutive data points (timesteps).
The data should be at 2D, and axis 0 is expected
to be the time dimension.
targets: Targets corresponding to timesteps in `data`.
It should have same length as `data`.
length: Length of the output sequences (in number of timesteps).
sampling_rate: Period between successive individual timesteps
within sequences. For rate `r`, timesteps
`data[i]`, `data[i-r]`, ... `data[i - length]`
are used for create a sample sequence.
stride: Period between successive output sequences.
For stride `s`, consecutive output samples would
be centered around `data[i]`, `data[i+s]`, `data[i+2*s]`, etc.
start_index: Data points earlier than `start_index` will not be used
in the output sequences. This is useful to reserve part of the
data for test or validation.
end_index: Data points later than `end_index` will not be used
in the output sequences. This is useful to reserve part of the
data for test or validation.
shuffle: Whether to shuffle output samples,
or instead draw them in chronological order.
reverse: Boolean: if `true`, timesteps in each output sample will be
in reverse chronological order.
batch_size: Number of timeseries samples in each batch
(except maybe the last one).
# Returns
A [Sequence](/utils/#sequence) instance.
# Examples
```python
from keras.preprocessing.sequence import TimeseriesGenerator
import numpy as np
data = np.array([[i] for i in range(50)])
targets = np.array([[i] for i in range(50)])
data_gen = TimeseriesGenerator(data, targets,
length=10, sampling_rate=2,
batch_size=2)
assert len(data_gen) == 20
batch_0 = data_gen[0]
x, y = batch_0
assert np.array_equal(x,
np.array([[[0], [2], [4], [6], [8]],
[[1], [3], [5], [7], [9]]]))
assert np.array_equal(y,
np.array([[10], [11]]))
```
"""
pass
@keras_export('keras.preprocessing.sequence.pad_sequences')
def pad_sequences(sequences, maxlen=None, dtype='int32',
padding='pre', truncating='pre', value=0.):
"""Pads sequences to the same length.
This function transforms a list (of length `num_samples`)
of sequences (lists of integers)
into a 2D Numpy array of shape `(num_samples, num_timesteps)`.
`num_timesteps` is either the `maxlen` argument if provided,
or the length of the longest sequence in the list.
Sequences that are shorter than `num_timesteps`
are padded with `value` until they are `num_timesteps` long.
Sequences longer than `num_timesteps` are truncated
so that they fit the desired length.
The position where padding or truncation happens is determined by
the arguments `padding` and `truncating`, respectively.
Pre-padding or removing values from the beginning of the sequence is the
default.
>>> sequence = [[1], [2, 3], [4, 5, 6]]
>>> tf.keras.preprocessing.sequence.pad_sequences(sequence)
array([[0, 0, 1],
[0, 2, 3],
[4, 5, 6]], dtype=int32)
>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, value=-1)
array([[-1, -1, 1],
[-1, 2, 3],
[ 4, 5, 6]], dtype=int32)
>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, padding='post')
array([[1, 0, 0],
[2, 3, 0],
[4, 5, 6]], dtype=int32)
>>> tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=2)
array([[0, 1],
[2, 3],
[5, 6]], dtype=int32)
Arguments:
sequences: List of sequences (each sequence is a list of integers).
maxlen: Optional Int, maximum length of all sequences. If not provided,
sequences will be padded to the length of the longest individual
sequence.
dtype: (Optional, defaults to int32). Type of the output sequences.
To pad sequences with variable length strings, you can use `object`.
padding: String, 'pre' or 'post' (optional, defaults to 'pre'):
pad either before or after each sequence.
truncating: String, 'pre' or 'post' (optional, defaults to 'pre'):
remove values from sequences larger than
`maxlen`, either at the beginning or at the end of the sequences.
value: Float or String, padding value. (Optional, defaults to 0.)
Returns:
Numpy array with shape `(len(sequences), maxlen)`
Raises:
ValueError: In case of invalid values for `truncating` or `padding`,
or in case of invalid shape for a `sequences` entry.
"""
return sequence.pad_sequences(
sequences, maxlen=maxlen, dtype=dtype,
padding=padding, truncating=truncating, value=value)
keras_export(
'keras.preprocessing.sequence.make_sampling_table')(make_sampling_table)
keras_export('keras.preprocessing.sequence.skipgrams')(skipgrams)