In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.3.1
sys.version_info(major=3, minor=6, micro=7, releaselevel='final', serial=0)
matplotlib 3.3.2
numpy 1.18.4
pandas 1.1.4
sklearn 0.23.2
tensorflow 2.3.1
tensorflow.keras 2.4.0


In [3]:
# tfrecord 文件格式
# -> tf.train.Example
#   -> tf.train.Features -> {"key": tf.train.Features}
#     -> tf.train.Feature.ByteList/FloatList/Int64List

favorite_books = [name.encode("utf-8") for name in ["machine learning", "cc150"]]
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
print(favorite_books_bytelist)

value: "machine learning"
value: "cc150"



In [5]:
hours_floatlist = tf.train.FloatList(value=[15.5, 9.5, 5.])
print(hours_floatlist)

value: 15.5
value: 9.5
value: 5.0



In [6]:
age_int64list = tf.train.Int64List(value=[15, 9, 5])
print(age_int64list)

value: 15
value: 9
value: 5



In [8]:
features = tf.train.Features(
feature = {
    "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
    "hours": tf.train.Feature(float_list=hours_floatlist),
    "age": tf.train.Feature(int64_list=age_int64list)
})
print(features)

feature {
  key: "age"
  value {
    int64_list {
      value: 15
      value: 9
      value: 5
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 15.5
      value: 9.5
      value: 5.0
    }
  }
}



In [9]:
example = tf.train.Example(features=features)
print(example)

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 15
        value: 9
        value: 5
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 9.5
        value: 5.0
      }
    }
  }
}



In [10]:
serialized_example = example.SerializePartialToString()
print(serialized_example)

b'\nZ\n\x0e\n\x03age\x12\x07\x1a\x05\n\x03\x0f\t\x05\n\x19\n\x05hours\x12\x10\x12\x0e\n\x0c\x00\x00xA\x00\x00\x18A\x00\x00\xa0@\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150'


In [11]:
output_dir = "tfrecord_basic"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    
filename = "test.tfrecords"
filename_fullpath = os.path.join(output_dir, filename)
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [13]:
# Read data
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    print(serialized_example_tensor)

tf.Tensor(b'\nZ\n\x0e\n\x03age\x12\x07\x1a\x05\n\x03\x0f\t\x05\n\x19\n\x05hours\x12\x10\x12\x0e\n\x0c\x00\x00xA\x00\x00\x18A\x00\x00\xa0@\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150', shape=(), dtype=string)
tf.Tensor(b'\nZ\n\x0e\n\x03age\x12\x07\x1a\x05\n\x03\x0f\t\x05\n\x19\n\x05hours\x12\x10\x12\x0e\n\x0c\x00\x00xA\x00\x00\x18A\x00\x00\xa0@\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150', shape=(), dtype=string)
tf.Tensor(b'\nZ\n\x0e\n\x03age\x12\x07\x1a\x05\n\x03\x0f\t\x05\n\x19\n\x05hours\x12\x10\x12\x0e\n\x0c\x00\x00xA\x00\x00\x18A\x00\x00\xa0@\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150', shape=(), dtype=string)


In [15]:
# Deserialize
expected_features = {
    "favorite_books": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.FixedLenFeature([], dtype=tf.int64)
}
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features)
    books = tf.sparse.to_dense(example["favorite_books"],
                               default_value=b"")
    for book in books:
        print(book.numpy().decode("UTF-8"))

InvalidArgumentError: Feature: age_int64list (data type: int64) is required but could not be found. [Op:ParseExampleV2]

In [16]:
filename_fullpath_zip = filename_fullpath + '.zip'
options = tf.io.TFRecordOptions(compression_type = "GZIP")
with tf.io.TFRecordWriter(filename_fullpath_zip, options) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [17]:
dataset_zip = tf.data.TFRecordDataset([filename_fullpath_zip], 
                                      compression_type= "GZIP")
for serialized_example_tensor in dataset_zip:
    example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features)
    books = tf.sparse.to_dense(example["favorite_books"],
                               default_value=b"")
    for book in books:
        print(book.numpy().decode("UTF-8"))

InvalidArgumentError: Feature: age_int64list (data type: int64) is required but could not be found. [Op:ParseExampleV2]