-
Notifications
You must be signed in to change notification settings - Fork 29
/
Copy pathdata_utils.py
78 lines (65 loc) · 3.88 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
import numpy as np
def get_var_size(data):
num_records, num_vars = data.shape
node_size = np.zeros((num_vars,), dtype='int')
for var in range(num_vars):
node_size[var] = data[:, var].max() + 1
# node_size = data.max(axis=0)+1 # number of node states (smallest state 0)
return node_size
def calc_stats(data, var_size, weights=None):
"""
Calculate the counts of instances in the data
:param data: a dataset of categorical features
:param var_size: a vector defining the cardinalities of the features
:param weights: a vector of non-negative weights for data samples.
:return:
"""
sz_cum_prod = [1]
for node_i in range(len(var_size) - 1):
sz_cum_prod += [sz_cum_prod[-1] * var_size[node_i]]
sz_prod = sz_cum_prod[-1] * var_size[-1]
data_idx = np.dot(data, sz_cum_prod)
try:
# hist_count, _ = np.histogram(data_idx, np.arange(sz_prod + 1), weights=weights)
hist_count = np.bincount(data_idx, minlength=sz_prod, weights=weights)
except MemoryError as error:
print('Out of memory')
return None
return hist_count
def unroll_temporal_data(data_full, observed_nodes_list, window_len, t_step=1, reverse_temporal_order=False):
"""
Unroll temporally sorted data samples into the defined time window. For example, if the time window is 2, then
each two consecutive data samples are concatenated into a single sample. The new samples are sorted temporally.
:param data_full: Temporally sorted data sample. Each sample consists of jointly measured values.
:param observed_nodes_list: indexes of columns in the original data that correspond to 'observed' variables.
:param window_len: Length (number of time-stamps) of the unrolling window (current-time + window_len-1 past-steps).
:param t_step: Nuber of time-step skips between unrolled samples (default is 1).
:param reverse_temporal_order: if True, then indexes [0..n] represent the latest instances and
higher indexes represent past instances. If False, [0..n] represent indexes of the earliest instances and
higher indexes represent future instances.
:return: An unrolled temporally sorted data samples.
"""
n_samples = data_full.shape[0] # number of data samples
n_contemporaneous_nodes = data_full.shape[1] # number of contemporaneous (jointly measured) variables
num_nodes_unrolled = window_len * n_contemporaneous_nodes # number of variables in a single unrolled sample
# calculate the starting time index for each unrolled sample
starts = [xid * n_contemporaneous_nodes for xid in np.arange(0, n_samples + 1 - window_len, t_step)]
# create unrolled data
data_full_unrolled = np.zeros((len(starts), num_nodes_unrolled)) # initialize unrolled data
for i, st in enumerate(starts):
data_full_unrolled[i] = data_full.flat[st:st + num_nodes_unrolled]
# indexes of variables in the unrolled data
_nodes_sets_list_full = np.reshape(range(num_nodes_unrolled), (window_len, n_contemporaneous_nodes))
_nodes_sets_list = [_nodes_sets_list_full[i, observed_nodes_list].tolist() for i in range(window_len)] # observed
if reverse_temporal_order:
# reverse time order such that indexes [0..n_contemporaneous_nodes-1] represent the latest instances,
# whereas larger indexes represent earlier instances.
data_full_unrolled_reversed = np.zeros_like(data_full_unrolled)
for idxes, idxes_rev in zip(_nodes_sets_list, reversed(_nodes_sets_list)):
data_full_unrolled_reversed[:, idxes] = data_full_unrolled[:, idxes_rev]
data_full_unrolled = data_full_unrolled_reversed
else:
# increasing time order such that indexes [0..n_contemporaneous_nodes-1] represent the earliest instances,
# whereas larger indexes represent future instances.
pass
return data_full_unrolled, _nodes_sets_list_full, _nodes_sets_list