-
Notifications
You must be signed in to change notification settings - Fork 1
/
toy_datasets.py
204 lines (160 loc) · 7.1 KB
/
toy_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
"""
Toy video datasets
let's base these off of the TIMIT sequence dataset here: https://github.com/vdumoulin/research/blob/master/code/pylearn2/datasets/timit.py
:development:
(1) you can't create a shared variable from a matrix of vectors with differing lengths, though there are some alternatives. One is to use a mask.
:questions:
(1) how does the iterator use the data_specs?
(2) at what point are the features actually labeled with the string 'features'? Does this ever even occur?
"""
import sys
import numpy as np
import functools
import theano
from pylearn2.datasets.dataset import Dataset
from pylearn2.utils.iteration import FiniteDatasetIterator, resolve_iterator_class
from pylearn2.sandbox.rnn.utils.iteration import SequenceDatasetIterator
from pylearn2.sandbox.rnn.space import SequenceSpace, SequenceDataSpace
from pylearn2.space import VectorSpace, CompositeSpace, VectorSequenceSpace
import chest_accel
def hard_data_generator(n_classes, n_examples, n_frames, n_features):
    """
    :description: generates 'hard' classification data: Gaussian-noise features paired with class labels that are drawn independently of the features.

    :type n_classes: int
    :param n_classes: labels are drawn uniformly from the integers in [0, n_classes)

    :type n_examples: int
    :param n_examples: the number of examples to generate

    :type n_frames: int
    :param n_frames: the number of frames or time steps in each example

    :type n_features: int
    :param n_features: the number of features in each time step

    :returns: tuple (X, y); X is a float array of shape (n_examples, n_frames, n_features), y is an integer array of shape (n_examples, 1)
    """
    # Both draws come from the same seeded generator so that repeated calls
    # return the same dataset. X is drawn first, so its values are unaffected
    # by the labels now sharing the stream.
    rng = np.random.RandomState(seed=42)
    X = rng.normal(size=(n_examples, n_frames, n_features))
    y = rng.randint(0, n_classes, (n_examples, 1))
    return X, y
def hard_data_generator_reconstruct(n_classes, n_examples, n_frames, n_features):
    """
    :description: generates two independent Gaussian-noise arrays of identical shape, one used as the features and one as the targets. n_classes is accepted for signature compatibility with the other generators but is not used.

    :type n_examples: int
    :param n_examples: the number of examples to generate

    :type n_frames: int
    :param n_frames: the number of frames or time steps in each example

    :type n_features: int
    :param n_features: the number of features in each time step

    :returns: tuple (X, y), both float arrays of shape (n_examples, n_frames, n_features)
    """
    dims = (n_examples, n_frames, n_features)
    generator = np.random.RandomState(seed=42)
    # Draw order matters: features first, then targets, from one seeded stream.
    features = generator.normal(size=dims)
    targets = generator.normal(size=dims)
    return features, targets
def hard_data_generator_random_reconstruct(n_classes, n_examples, n_frames, n_features):
    """
    :description: generates Gaussian-noise features and returns the very same array as the target, i.e. a reconstruction task on random data. n_classes is accepted for signature compatibility with the other generators but is not used.

    :type n_examples: int
    :param n_examples: the number of examples to generate

    :type n_frames: int
    :param n_frames: the number of frames or time steps in each example

    :type n_features: int
    :param n_features: the number of features in each time step

    :returns: tuple (X, X); both entries are the same float array of shape (n_examples, n_frames, n_features)
    """
    noise = np.random.RandomState(seed=42).normal(
        size=(n_examples, n_frames, n_features))
    # The same object is returned twice (not a copy).
    return noise, noise
def easy_data_generator(n_classes, n_examples, n_frames, n_features):
    """
    :description: generates 'easy' classification data in which each example spells out its own class label in binary, most significant bit first, one bit per frame. A frame is all zeros for a 0 bit and all ones for a 1 bit.

    :type n_classes: int
    :param n_classes: labels are drawn uniformly from the integers in [0, n_classes)

    :type n_examples: int
    :param n_examples: the number of examples to generate

    :type n_frames: int
    :param n_frames: ignored; the number of frames is recomputed from n_classes (see below)

    :type n_features: int
    :param n_features: the number of features in each time step

    :returns: tuple (X, y); X is a float array of shape (n_examples, n_bits, n_features) where n_bits is the number of binary digits of n_classes, y is an integer array of shape (n_examples,)
    """
    # Labels come from the seeded generator so repeated calls are reproducible.
    rng = np.random.RandomState(seed=42)
    y = rng.randint(0, n_classes, n_examples)
    # n_frames is the maximum number of binary digits needed to represent a class label.
    # It is derived from n_classes itself, so when n_classes is a power of two
    # the leading frame is always zero.
    n_frames = len(bin(n_classes)) - 2
    # Extract the bits of every label at once, most significant bit first.
    shifts = np.arange(n_frames - 1, -1, -1)
    bits = (y[:, np.newaxis] >> shifts) & 1
    # Copy each bit across the feature axis; the result stays 3-dimensional
    # even when n_examples is 0.
    X = np.repeat(bits[:, :, np.newaxis].astype(np.float64), n_features, axis=2)
    return X, y
def easy_data_generator_reconstruct(n_classes, n_examples, n_frames, n_features):
    """
    :description: generates 'easy' reconstruction data. A hidden class label is drawn for every example and written out in binary, most significant bit first, one bit per frame (all zeros for a 0 bit, all ones for a 1 bit). The resulting array is returned as both the features and the targets; the labels themselves are not returned.

    :type n_classes: int
    :param n_classes: hidden labels are drawn uniformly from the integers in [0, n_classes)

    :type n_examples: int
    :param n_examples: the number of examples to generate

    :type n_frames: int
    :param n_frames: ignored; the number of frames is recomputed from n_classes (see below)

    :type n_features: int
    :param n_features: the number of features in each time step

    :returns: tuple (X, X); both entries are the same float array of shape (n_examples, n_bits, n_features) where n_bits is the number of binary digits of n_classes
    """
    # Labels come from the seeded generator so repeated calls are reproducible.
    rng = np.random.RandomState(seed=42)
    labels = rng.randint(0, n_classes, (n_examples, 1))
    # n_frames is the maximum number of binary digits needed to represent a class label.
    # It is derived from n_classes itself, so when n_classes is a power of two
    # the leading frame is always zero.
    n_frames = len(bin(n_classes)) - 2
    # Broadcasting the (n_examples, 1) labels against the shifts yields one
    # bit per frame without converting 1-element arrays to Python ints.
    shifts = np.arange(n_frames - 1, -1, -1)
    bits = (labels >> shifts) & 1
    # Copy each bit across the feature axis; the result stays 3-dimensional
    # even when n_examples is 0.
    X = np.repeat(bits[:, :, np.newaxis].astype(np.float64), n_features, axis=2)
    return X, X
def super_easy_data_generator_reconstruct(n_classes, n_examples, n_frames, n_features):
    """
    :description: generates the simplest reconstruction data: an array filled entirely with ones, returned as both the features and the targets. n_classes is accepted for signature compatibility with the other generators but is not used.

    :type n_examples: int
    :param n_examples: the number of examples to generate

    :type n_frames: int
    :param n_frames: the number of frames or time steps in each example

    :type n_features: int
    :param n_features: the number of features in each time step

    :returns: tuple (X, X); both entries are the same array of ones with shape (n_examples, n_frames, n_features)
    """
    shape = (n_examples, n_frames, n_features)
    all_ones = np.ones(shape)
    # The same object is returned twice (not a copy).
    return all_ones, all_ones
def easy_softmax_data_generator(n_classes, n_examples, n_frames=None, n_features=None):
    """
    :description: returns one-hot vectors that exactly correspond to the target values: row i of X has a single 1 at column y[i].
    example: with n_classes = 2, y = [1, 0] gives X = [[0, 1, 0], [1, 0, 0]]

    :type n_classes: int
    :param n_classes: targets are drawn from the integers in [0, n_classes] (inclusive), so X has n_classes + 1 columns

    :type n_examples: int
    :param n_examples: the number of examples to generate

    :param n_frames: not used
    :param n_features: not used

    :returns: tuple (X, y); X is a float array of shape (n_examples, n_classes + 1), y is an integer array of shape (n_examples,)
    """
    width = n_classes + 1
    # Targets come from numpy's global random state (no fixed seed here).
    y = np.random.randint(0, width, (n_examples))
    X = np.zeros((n_examples, width))
    # Set the single hot entry of every row in one indexed assignment.
    X[np.arange(n_examples), y] = 1
    return X, y
def chest_accel_data_loader(n_classes, n_examples, n_frames, n_features):
    """
    :description: adapter exposing chest_accel.load_data_from_dir through the same signature as the data generator functions. All four arguments are accepted but not used; whatever load_data_from_dir returns is passed straight back to the caller.
    """
    loaded = chest_accel.load_data_from_dir()
    return loaded
def get_sequence_lengths(data):
    """
    :description: computes the length of every sample in data.

    :type data: iterable of sized objects
    :param data: the samples to measure; len() is applied to each one

    :returns: numpy array holding len(sample) for each sample, in order
    """
    lengths = list(map(len, data))
    return np.array(lengths)
class ToyVideoDataset(Dataset):
    """
    Class for toy video datasets.

    :description: This class represents a toy video dataset. That is, a set of examples where each example is a sequence of values where those values may be lists. The class uses a passed in function to generate the data.

    Data in the format (examples, time, data) such that:

    >>> X[0,:,:]
    gives the first example

    >>> X[0,0,:]
    gives the first time step of the first example

    >>> X[0,0,0]
    gives the first feature from the first time step of the first example
    """

    def __init__(self, data_generator=None, n_classes=101, n_examples=10, n_frames=10, n_features=4096):
        """
        :type data_generator: function
        :param data_generator: function used to generate data in the form of X, y tuple. X is a 3-dimensional array with dimensions (examples, frames/time, features). y is a 2-dimensional array with dimensions (examples, target values). It is called as data_generator(n_classes, n_examples, n_frames, n_features). Optional value defaults to hard_data_generator, i.e. random and therefore 'hard' data.

        :type n_classes: int
        :param n_classes: the number of possible target values or n_classes

        :type n_examples: int
        :param n_examples: the number of examples to be generated in the dataset

        :type n_frames: int
        :param n_frames: the number of frames or time steps in each example

        :type n_features: int
        :param n_features: the number of features in each time step
        """
        self.n_features = n_features
        # Stored as passed in; get_num_examples() and iterator() report this
        # value rather than the length of the generated data.
        self.n_examples = n_examples

        if data_generator is None:
            data_generator = hard_data_generator
        self.data_generator = data_generator
        self.X, self.y = self.data_generator(n_classes, n_examples, n_frames, n_features)

        features_space = VectorSequenceSpace(dim=self.n_features)
        # features_space = SequenceDataSpace(VectorSpace(dim=self.n_features))
        targets_space = VectorSequenceSpace(dim=1)
        # targets_space = SequenceDataSpace(VectorSpace(dim=1))

        space_components = [features_space, targets_space]
        space = CompositeSpace(space_components)
        source = ('features', 'targets')
        self.data_specs = (space, source)

        # Defaults used by iterator(): shuffled sequential traversal over a
        # (features, targets) composite space.
        self._iter_mode = resolve_iterator_class('shuffled_sequential')
        self._iter_data_specs = (CompositeSpace((features_space, targets_space)), source)

    def get_data_specs(self):
        """
        :description: returns the (space, source) tuple built in __init__, where source is ('features', 'targets').
        """
        return self.data_specs

    def get_num_examples(self):
        """
        :description: returns the n_examples value this dataset was constructed with.
        """
        return self.n_examples

    def get_data(self):
        """
        :description: returns the generated data as the tuple (X, y).
        """
        return self.X, self.y

    def get_data_as_shared(self):
        """
        :description: returns the data wrapped in theano shared variables as the tuple (shared_x, shared_y). X is first converted to an array of dtype theano.config.floatX; y is shared as it is.
        """
        shared_x = theano.shared(np.asarray(self.X, dtype=theano.config.floatX), borrow=True)
        shared_y = theano.shared(self.y, borrow=True)
        return shared_x, shared_y

    def get(self, source, index):
        """
        :description: the iterator object repeatedly calls this 'get' function with different indices.

        :type source: list of strings or string
        :param source: this is passed in b/c X and y are generally referred to as 'sources' and retrieved that way. It is not consulted here: both X and y are always returned. The below approach is really a workaround to allow for appropriately indexing X (or might be a hack that comes back to get me)

        :param index: the position(s) to fetch; observed to arrive as a list (reason unknown, see the note below)

        :returns: tuple (X[index[0]], y[index])
        """
        # index[0] for X b/c index is itself a list, and using it as is results
        # in the returned value from X being 1 dimension greater than it should
        # be. I am not sure why index is a list.
        # does this need to change?
        rval = (self.X[index[0]], self.y[index])
        return rval

    @functools.wraps(Dataset.iterator)
    def iterator(self):
        """
        :description: returns an iterator object over this dataset. The iteration mode and data specs are the defaults fixed in __init__, and the batch size is always 1.
        """
        data_specs = self._iter_data_specs
        mode = self._iter_mode
        batch_size = 1
        # NOTE(review): the two trailing None arguments are passed positionally;
        # presumably num_batches and rng of the subset iterator -- confirm
        # against pylearn2.utils.iteration.
        return FiniteDatasetIterator(self, mode(self.n_examples, batch_size, None, None), data_specs=data_specs)