## Description

If your dataset is very large, you can split it into several TFRecords files called shards. This also improves the random shuffling, because the Dataset API only shuffles within a smaller buffer of e.g. 1024 elements loaded into RAM. So if you have e.g. 100 TFRecords files, the randomization will be much better than with a single TFRecords file.

## Libraries

In [1]:
import numpy as np
import os
import sys
import tensorflow as tf

  from ._conv import register_converters as _register_converters


## Load subjects

In [2]:
# Directory that holds the subject list and the TFRecords files.
# The trailing slash is kept so the value of `path` itself is unchanged.
path = 'D:/mimicdb/180331_10/'
# Build the file name with os.path.join, like the TFRecords paths below,
# instead of string concatenation.
filename = os.path.join(path, 'filelist.csv')
# The file list is a comma-separated list of integer subject ids.
subjects = np.loadtxt(filename, delimiter=',', dtype=np.int16)
print(subjects)

[ 39  55 211 212 213 216 220 224 225 226 230 237 240 248 252 259 262 276
 281 284 401 404 408 410 411 413 417 427 430 437 438 439 443 446 449 450
 452 456 466 471 472 474 476 477 480 482 484 485]


In [3]:
# Full path of the training TFRecords file inside the data directory.
path_tfrecords_train = os.path.join(path, "train.tfrecords")
# Bare expression: echoes the path as the notebook cell output.
path_tfrecords_train

'D:/mimicdb/180331_10/train.tfrecords'

In [4]:
# Full path of the test TFRecords file inside the data directory.
path_tfrecords_test = os.path.join(path, "test.tfrecords")
# Bare expression: echoes the path as the notebook cell output.
path_tfrecords_test

'D:/mimicdb/180331_10/test.tfrecords'

## Load data

In [161]:
def parse(serialized):
    """Decode one serialized TFRecords example into its four tensors.

    Args:
        serialized: A single serialized example read from a TFRecords file.

    Returns:
        The tuple (ecg, ppg, label, index) taken from the parsed example.
    """
    # Names, fixed lengths and dtypes of the entries expected in each example.
    schema = {
        'ecg': tf.FixedLenFeature([2000], tf.float32),
        'ppg': tf.FixedLenFeature([2000], tf.float32),
        'label': tf.FixedLenFeature([1], tf.int64),
        'index': tf.FixedLenFeature([8], tf.int64),
    }

    # Turn the serialized bytes into a dict keyed by the names above.
    example = tf.parse_single_example(serialized=serialized, features=schema)

    ecg = example['ecg']
    ppg = example['ppg']
    label = example['label']
    index = example['index']
    return ecg, ppg, label, index

In [137]:
def input_fn(filenames, train, batch_size=32, buffer_size=2048):
    """Build the TFRecords input pipeline and return the next-batch tensors.

    Args:
        filenames:   Filenames for the TFRecords files.
        train:       Boolean whether training (True) or testing (False).
        batch_size:  Return batches of this size.
        buffer_size: Size of the shuffle buffer used when training. The random
                     shuffling is done on this buffer, so it must be big
                     enough.

    Returns:
        The tensors yielded by the iterator's get_next(): the
        (ecg, ppg, label, index) tuple produced by `parse`, batched.
    """
    # Create a TensorFlow Dataset-object which has functionality for reading
    # and shuffling data from TFRecords files.
    dataset = tf.data.TFRecordDataset(filenames=filenames)

    # Parse the serialized data in the TFRecords files.
    # This returns TensorFlow tensors.
    dataset = dataset.map(parse)

    if train:
        # If training then read a buffer of the given size and
        # randomly shuffle it.
        dataset = dataset.shuffle(buffer_size=buffer_size)

        # Allow infinite reading of the data.
        num_repeat = None
    else:
        # If testing then don't shuffle the data.
        # Only go through the data once.
        num_repeat = 1

    # Repeat the dataset the given number of times.
    # Shuffling happens only in the training branch above, before repeat();
    # an unconditional shuffle at this point would also reorder the test data
    # and mix examples across epoch boundaries when training.
    dataset = dataset.repeat(num_repeat)
    dataset = dataset.batch(batch_size)

    # Create an iterator for the dataset and the above modifications.
    iterator = dataset.make_one_shot_iterator()

    # Get the next batch of parsed examples.
    return iterator.get_next()

In [131]:
def train_input_fn():
    """Return the next-batch tensors for the training TFRecords file."""
    next_batch = input_fn(train=True, filenames=path_tfrecords_train)
    return next_batch

In [132]:
def test_input_fn():
    """Return the next-batch tensors for the test TFRecords file."""
    next_batch = input_fn(train=False, filenames=path_tfrecords_test)
    return next_batch

In [162]:
# Open a TensorFlow session; it is kept in `sess` and not closed in this cell.
sess = tf.Session()
# Run the global-variables initializer op before evaluating anything else.
sess.run(tf.global_variables_initializer())
# Build the training input pipeline and evaluate one batch.
# The four results follow the order returned by parse(): ecg, ppg, label, index.
# Each call to train_input_fn() creates a new one-shot iterator.
feature1, feature2, labels, indices = sess.run(train_input_fn())

In [163]:
print(feature1)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
