# Read Data
---

# Imports

In [2]:
import os
import tensorflow as tf

In [3]:
data_dir = os.environ["YT8M_DATA"]

# Frame-level

In [4]:
def parse_sequence_example(s):
    return tf.train.SequenceExample.FromString(s.numpy())

def extract_id(s):
    se = parse_sequence_example(s)
    return se.context.feature["id"].bytes_list.value[0]

def extract_labels(s):
    se = parse_sequence_example(s)
    return se.context.feature["labels"].int64_list.value

def extract_frames_features(s, feature):
    se = parse_sequence_example(s)
    frames = se.feature_lists.feature_list[feature].feature    

    return tf.expand_dims(
        tf.concat([
            tf.expand_dims(
                tf.cast(
                    tf.io.decode_raw(frame.bytes_list.value[0], tf.uint8),
                    tf.float64
                ),
                axis=0
            ) for frame in frames
        ]
        , axis=0)
    , axis=0 )


def parse(s):
    
    vid = tf.py_function(
        extract_id,
        (s,),
        tf.string
    )
    
    labels = tf.py_function(
        extract_labels,
        (s,),
        tf.int64
    )
    
    rgb = tf.py_function(
        lambda x: extract_frames_features(x, "rgb"),
        (s,),
        tf.float64
    )
    
    audio = tf.py_function(
        lambda x: extract_frames_features(x, "audio"),
        (s,),
        tf.float64
    )
      
    return vid, labels, rgb, audio

In [5]:
tf_records = os.path.join(data_dir, "frame/train00.tfrecord")

In [6]:
dataset = tf.data.TFRecordDataset(tf_records).map(parse)

In [7]:
for r in dataset.take(1):
    vid, labels, rgb, audio = r 

W0709 16:19:27.922098 140286130886400 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0709 16:19:27.927050 140286130886400 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0709 16:19:27.930767 140286130886400 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string
W0709 16:19:28.046549 140286130886400 backprop.py:842] The dtype of the watched tensor must be floating (e.g. tf.float32), got tf.string


In [8]:
vid

<tf.Tensor: id=2390, shape=(), dtype=string, numpy=b'op00'>

In [9]:
labels

<tf.Tensor: id=2391, shape=(4,), dtype=int64, numpy=array([ 82, 103, 346, 350])>

In [10]:
rgb

<tf.Tensor: id=2392, shape=(1, 234, 1024), dtype=float64, numpy=
array([[[179., 158., 143., ...,  14., 135., 123.],
        [201., 141., 141., ...,  49., 134.,  73.],
        [138., 169., 115., ...,  35., 215., 103.],
        ...,
        [133., 174.,  84., ..., 119.,  82.,  91.],
        [121., 127., 100., ...,  56., 199., 165.],
        [112., 122.,  77., ..., 141., 195., 245.]]])>

In [11]:
audio

<tf.Tensor: id=2393, shape=(1, 234, 128), dtype=float64, numpy=
array([[[119., 123., 161., ..., 204., 190., 130.],
        [ 80., 126., 141., ..., 144.,  61.,  43.],
        [126., 114., 182., ..., 185.,  11., 184.],
        ...,
        [121., 202., 249., ...,  74.,  59., 255.],
        [ 46., 255., 213., ...,   0., 131., 167.],
        [ 49., 230., 129., ..., 150.,   0.,   0.]]])>