-
Notifications
You must be signed in to change notification settings - Fork 5
/
xarray_zarr_array.hpp
278 lines (249 loc) · 11.3 KB
/
xarray_zarr_array.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
/*
* Copyright (c) 2024 MPI-M, Clara Bayley
*
*
* ----- CLEO -----
* File: xarray_zarr_array.hpp
* Project: zarr
* Created Date: Monday 18th March 2024
* Author: Clara Bayley (CB)
* Additional Contributors:
* -----
* Last Modified: Wednesday 22nd May 2024
* Modified By: CB
* -----
* License: BSD 3-Clause "New" or "Revised" License
* https://opensource.org/licenses/BSD-3-Clause
* -----
* File Description:
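* Structure to create an array obeying the Zarr storage specification version 2
* (https://zarr.readthedocs.io/en/stable/spec/v2.html) in a given memory store, with the
* additional metadata and shape constraints needed for compatibility with Xarray and NetCDF.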
*/
#ifndef LIBS_ZARR_XARRAY_ZARR_ARRAY_HPP_
#define LIBS_ZARR_XARRAY_ZARR_ARRAY_HPP_
#include <Kokkos_Core.hpp>
#include <Kokkos_Pair.hpp>
#include <algorithm>
#include <cassert>
#include <string>
#include <string_view>
#include <unordered_map>
#include <vector>
#include "./xarray_metadata.hpp"
#include "./zarr_array.hpp"
/**
 * @brief Store an attributes string under the ".zattrs" key of an array.
 *
 * Assigns `attrs` to the entry of `store` whose key is "<name>/.zattrs". The attrs data can be
 * any string; a typical use is the JSON holding the extra metadata that Xarray and NetCDF expect
 * when opening a Zarr dataset, e.g. the names of an array's dimensions:
 * "{\"_ARRAY_DIMENSIONS\": [\"dimension_name\"]}".
 *
 * @tparam Store The type of the store object; it must support `store[key] = attrs`.
 * @param store The store object in which the attributes are written.
 * @param name The name of the array, used as the prefix of the ".zattrs" key.
 * @param attrs The attributes data to write under the ".zattrs" key.
 */
template <typename Store>
inline void write_zattrs_json(Store& store, std::string_view name, std::string_view attrs) {
  const auto zattrs_key = std::string(name) + "/.zattrs";
  store[zattrs_key] = attrs;
}
/**
 * @brief Get the shape of an array along every dimension except its outermost one.
 *
 * Looks up, in the dataset's dimensions, the size of each of the array's named dimensions apart
 * from the first (outermost) name, and returns those sizes in the same order as the names. The
 * outermost name itself is never looked up. An array with fewer than two named dimensions has an
 * empty reduced shape.
 *
 * @param datasetdims An unordered map from the name to the size of each dimension of the dataset.
 * @param dimnames A vector containing the names of the dimensions of the array (ordered from
 * outermost->innermost).
 * @return A vector of size_t with one entry per dimension after the outermost one.
 * @throws std::out_of_range if a name after the outermost one is not a key of datasetdims.
 */
inline std::vector<size_t> reduced_arrayshape_from_dims(
    const std::unordered_map<std::string, size_t>& datasetdims,
    const std::vector<std::string>& dimnames) {
  std::vector<size_t> innershape;
  if (dimnames.size() < 2) {
    return innershape;  // no dimensions beyond the outermost one
  }

  innershape.reserve(dimnames.size() - 1);
  for (auto name = dimnames.cbegin() + 1; name != dimnames.cend(); ++name) {
    innershape.push_back(datasetdims.at(*name));  // number of elements along this dimension
  }
  return innershape;
}
/**
* @brief Zarr array with additional metadata and functions to constrain the shape of array to the
* shape of its dimensions in a dataset in order to ensure Zarr array is compatibile with NetCDF
* and Xarray conventions.
*
* @tparam Store The type of the store object where the array will be stored.
* @tparam T The data type of the array.
*/
template <typename Store, typename T>
class XarrayZarrArray {
private:
using viewh_buffer = Buffer<T>::viewh_buffer;
ZarrArray<Store, T> zarr; /**< zarr array in store */
std::vector<std::string> dimnames; /**< ordered list of names of each dimenion of array */
std::vector<size_t> arrayshape; /**< current size of the array along each of its dimensions */
size_t last_totnchunks; /**< Number of chunks of array since arrayshape last written */
/**
* @brief Sets shape of array along each dimension to be the same size as each of its dimensions
* according to the dataset. Returns boolean for whether shape has changed (true) or not (false).
*
* The order of the dimensions in the array's shape is the order of dimensions in dimnames
* (outermost -> innermost). Setting the shape to be conistent with the size of the dataset's
* dimensions makes zarr array also consistent with Xarray and NetCDF conventions. Boolean
* returns true if the shape of the array along any of its dimensions has changed.
*
* @param datasetdims Dictionary like object for the dimensions of the dataset.
* @return bool = true if arrayshape along any of its dimensions has changed, false otherwise.
*/
bool set_arrayshape(const std::unordered_map<std::string, size_t>& datasetdims) {
auto ischange = std::vector<int>(arrayshape.size(), 0);
for (size_t aa = 0; aa < dimnames.size(); ++aa) {
const auto dsize = datasetdims.at(dimnames.at(aa));
ischange.at(aa) = dsize - arrayshape.at(aa);
arrayshape.at(aa) = dsize;
}
return std::any_of(ischange.begin(), ischange.end(), [](bool b) { return b; });
}
/**
* @brief Sets shape of array along each dimension to be the same as shape according to zarr.
*
* Useful when writing a ragged arrray in a dataset (meaning length of dimensions if not length of
* array)
*
*/
bool set_ragged_arrayshape() {
const auto raggedarrayshape = std::vector<size_t>{zarr.get_totalndata()};
const auto ischange = (arrayshape != raggedarrayshape);
arrayshape = raggedarrayshape;
return ischange;
}
public:
/**
* @brief Constructs a new XarrayZarrArray object.
*
* @param store The store where the array will be stored.
* @param datasetdims Dictionary like object for the dimensions of the dataset.
* @param name The name of the array.
* @param units The units of the array data.
* @param scale_factor The scale factor of array data.
* @param chunkshape The shape of the array chunks.
* @param dimnames The names of each dimension of the array (in order outermost->innermost).
*/
XarrayZarrArray(Store& store, const std::unordered_map<std::string, size_t>& datasetdims,
const std::string_view name, const std::string_view units,
const double scale_factor, const std::vector<size_t>& chunkshape,
const std::vector<std::string>& dimnames)
: zarr(store, name, chunkshape, true, reduced_arrayshape_from_dims(datasetdims, dimnames)),
dimnames(dimnames),
arrayshape(dimnames.size(), 0),
last_totnchunks(0) {
assert((chunkshape.size() == dimnames.size()) &&
"number of named dimensions of array must match number dimensions of chunks");
write_arrayshape(datasetdims);
write_zattrs_json(store, name, xarray_metadata<T>(units, scale_factor, dimnames));
}
/**
* @brief Constructs a new XarrayZarrArray object with additional variable called
* "sample_dimension" in the metadata .zattrs json and initially no set arrayshape.
*
* @param store The store where the array will be stored.
* @param datasetdims Dictionary like object for the dimensions of the dataset.
* @param name The name of the array.
* @param units The units of the array data.
* @param scale_factor The scale factor of array data.
* @param chunkshape The shape of the array chunks.
* @param dimnames The names of each dimension of the array (in order outermost->innermost).
* @param sampledimname The name of the dimension the ragged count samples.
*/
XarrayZarrArray(Store& store, const std::unordered_map<std::string, size_t>& datasetdims,
const std::string_view name, const std::string_view units,
const double scale_factor, const std::vector<size_t>& chunkshape,
const std::vector<std::string>& dimnames, const std::string_view sampledimname)
: zarr(store, name, chunkshape, true, reduced_arrayshape_from_dims(datasetdims, dimnames)),
dimnames(dimnames),
arrayshape(dimnames.size(), 0),
last_totnchunks(0) {
assert((chunkshape.size() == dimnames.size()) &&
"number of named dimensions of array must match number dimensions of chunks");
write_zattrs_json(store, name,
xarray_metadata<T>(units, scale_factor, dimnames, sampledimname));
}
~XarrayZarrArray() { zarr.write_arrayshape(arrayshape); }
/**
* @brief Returns the name and size of the dimensions of the array (unordered).
*
* @return An unordered map containing the current dimensions of the array.
*/
std::unordered_map<std::string, size_t> get_arraydims() const {
auto arraydims = std::unordered_map<std::string, size_t>();
for (size_t aa = 0; aa < dimnames.size(); ++aa) {
arraydims.insert({dimnames.at(aa), arrayshape.at(aa)});
}
return arraydims;
}
/**
* @brief Writes data from Kokkos view in host memory to chunks of a Zarr array in a store
* via a buffer. Function does *not* write metadata to zarray .json file.
*
* Calls ZarrArray's write_to_array function to write data from Kokkos view in host memory to
* chunks of a Zarr array in a store.
*
* @param h_data The data in a Kokkos view in host memory which should be written to the array
* in a store.
*/
void write_to_array(const viewh_buffer h_data) { zarr.write_to_array(h_data); };
/**
* @brief Writes 1 data element to a Zarr array in a store.
* Function does *not* write metadata to zarray .json file.
*
* Calls ZarrArray's write_to_array function to write data to a Zarr array in a store (in chunks
* via a buffer).
*
* @param data The data element which should be written to the array in a store.
*/
void write_to_array(const T data) { zarr.write_to_array(data); };
/**
* @brief Sets shape of array along each dimension to be the same size as each of its dimensions
* according to the dataset.
*
* The order of the dimensions in the array's shape is the order of dimensions in dimnames
* (outermost -> innermost). Setting the shape to be conistent with the size of the dataset's
* dimensions makes zarr array also consistent with Xarray and NetCDF conventions. If chunks have
* been written since last writing of the arrayshape, and the shape of the array has changed, then
* function also overwrites the .zarray json file with metadata containing the new shape of the
* array.
*
* @param datasetdims Dictionary like object for the dimensions of the dataset.
*/
void write_arrayshape(const std::unordered_map<std::string, size_t>& datasetdims) {
auto ischange = set_arrayshape(datasetdims);
if (last_totnchunks != zarr.get_totnchunks() && ischange) {
zarr.write_arrayshape(arrayshape);
last_totnchunks = zarr.get_totnchunks();
}
}
/**
* @brief Sets shape of array along each dimension to be the same a expected for a 1-D ragged
* array.
*
* Expected shape is 1-D array with size of the total number of elements written to a zarr array.
*
* If chunks have been written since last writing of the arrayshape, then function also overwrites
* the .zarray json file with metadata containing the new shape of the array.
*
*/
void write_ragged_arrayshape() {
auto ischange = set_ragged_arrayshape();
if (last_totnchunks != zarr.get_totnchunks() && ischange) {
zarr.write_arrayshape(arrayshape);
last_totnchunks = zarr.get_totnchunks();
}
}
};
#endif // LIBS_ZARR_XARRAY_ZARR_ARRAY_HPP_