In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

import numpy as np
import IPython.display as display

# Turn on eager execution so the cells below can iterate over datasets in plain
# Python loops and call `.numpy()` on tensors directly.
tf.compat.v1.enable_eager_execution()

# Echo the installed TensorFlow version as the cell's output.
tf.__version__

'1.13.1'

**Cách load dữ liệu truyền thống**
<img src="./images/traditional_load_data.png">

**Cách load dữ liệu bằng cách Pipeline**
<img src="./images/pipeline_load_data.png">
Trong quá trình làm việc với TensorFlow, cách dễ dàng nhất để cấu hình một input pipeline hiệu quả là sử dụng định dạng TFRecord, một định dạng dữ liệu kiểu nhị phân được hỗ trợ bởi TensorFlow.

### Protocol Buffers & TFRecord
**Protocol Buffers** định nghĩa cách serialize những dữ liệu có cấu trúc thành dữ liệu nhị phân. <br>
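Cấu trúc dữ liệu được viết dưới dạng protocol buffer message và lưu thành tệp .proto. Có thể coi protocol buffer là một phiên bản tối ưu hơn của XML, dùng để định nghĩa cấu trúc dữ liệu. Ví dụ, message `Feature` của TensorFlow được định nghĩa như sau:

```
message Feature {
  oneof kind {
    BytesList bytes_list = 1;
    FloatList float_list = 2;
    Int64List int64_list = 3;
  }
};
```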

**TFRecord** là một định dạng đơn giản để lưu trữ một chuỗi các bản ghi nhị phân. Một tệp TFRecord chứa một chuỗi các record, mỗi record ứng với một observation. Các tệp chỉ có thể được đọc tuần tự.<br>

TensorFlow sử dụng Protocol Buffers để định nghĩa cấu trúc của dữ liệu dạng TFRecord. <br>
Một mẫu dữ liệu (ví dụ: ảnh đầu vào cùng với nhãn của ảnh đó) được gọi là một Example<br>

### Một tf.Example là một ánh xạ {"string": tf.train.Feature} <br>
**Data types for tf.Example**

<img src="./images/Tf_Feature_datatype.png">

In order to convert a standard TensorFlow type to a tf.Example-compatible tf.train.Feature, you can use the shortcut functions below. Note that each function takes a scalar input value and returns a tf.train.Feature containing one of the three list types above:

In [2]:
# The following functions can be used to convert a value to a type compatible
# with tf.Example.

def _bytes_feature(value):
    """Wrap a single byte string in a `tf.train.Feature` backed by a `BytesList`.

    Args:
        value: A `bytes` object, or an eager tensor holding one.

    Returns:
        A `tf.train.Feature` whose `bytes_list` contains exactly that one value.
    """
    eager_tensor_type = type(tf.constant(0))
    # BytesList won't unpack a string from an EagerTensor, so extract the raw
    # value first; anything else is passed through untouched.
    raw = value.numpy() if isinstance(value, eager_tensor_type) else value
    payload = tf.train.BytesList(value=[raw])
    return tf.train.Feature(bytes_list=payload)

def _float_feature(value):
    """Wrap a single float / double in a `tf.train.Feature` backed by a `FloatList`.

    Args:
        value: The scalar to store.

    Returns:
        A `tf.train.Feature` whose `float_list` contains exactly that one value.
    """
    payload = tf.train.FloatList(value=[value])
    return tf.train.Feature(float_list=payload)

def _int64_feature(value):
    """Wrap a single bool / enum / int / uint in a `tf.train.Feature` backed by an `Int64List`.

    Args:
        value: The scalar to store.

    Returns:
        A `tf.train.Feature` whose `int64_list` contains exactly that one value.
    """
    payload = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=payload)

In [3]:
# One demo feature per helper / input kind; each proto is printed in turn.
demo_features = (
    _bytes_feature(b'test_string'),
    _bytes_feature(u'test_bytes'.encode('utf-8')),
    _float_feature(np.exp(1)),
    _int64_feature(True),
    _int64_feature(1),
)
for demo_feature in demo_features:
    print(demo_feature)

bytes_list {
  value: "test_string"
}

bytes_list {
  value: "test_bytes"
}

float_list {
  value: 2.7182817459106445
}

int64_list {
  value: 1
}

int64_list {
  value: 1
}



In [4]:
# Build a float feature and show its binary protobuf encoding as the cell output.
feature = _float_feature(np.exp(1))
feature.SerializeToString()

b'\x12\x06\n\x04T\xf8-@'

### Example

In this notebook, you will create a dataset using NumPy.

This dataset will have 4 features:
- a boolean feature, False or True with equal probability
- an integer feature uniformly randomly chosen from [0, 4]
- a string feature generated from a string table by using the integer feature as an index
- a float feature from a standard normal distribution

Consider a sample consisting of 10,000 independently and identically distributed observations from each of the above distributions:

In [5]:
# Seed NumPy's global RNG so that Restart & Run All regenerates the same data.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# The number of observations in the dataset.
n_observations = int(1e4)

# Boolean feature, encoded as False or True.
feature0 = np.random.choice([False, True], n_observations)

# Integer feature, uniformly random in 0..4 (randint's upper bound 5 is exclusive).
feature1 = np.random.randint(0, 5, n_observations)

# String feature: feature1 is used as an index into the table below, so
# feature2[i] == strings[feature1[i]] for every observation.
strings = np.array([b'cat', b'dog', b'chicken', b'horse', b'goat'])
feature2 = strings[feature1]

# Float feature, from a standard normal distribution
feature3 = np.random.randn(n_observations)

### Tổng quát quá trình tạo 1 tf.Example message

1. Within each observation, each value needs to be converted to a tf.train.Feature containing one of the 3 compatible types, using one of the functions above.
2. You create a map (dictionary) from the feature name string to the encoded feature value produced in #1.
3. The map produced in step 2 is converted to a Features message.

In [6]:
# Serialize one observation into a tf.Example byte string.
def serialize_example(feature0, feature1, feature2, feature3):
    """Encode one observation as a serialized `tf.train.Example`.

    Args:
        feature0: Value stored under 'feature0' via `_int64_feature`.
        feature1: Value stored under 'feature1' via `_int64_feature`.
        feature2: Value stored under 'feature2' via `_bytes_feature`.
        feature3: Value stored under 'feature3' via `_float_feature`.

    Returns:
        The Example message serialized with `SerializeToString()`.
    """
    # Map each feature name to its tf.train.Feature encoding.
    encoded = {}
    encoded['feature0'] = _int64_feature(feature0)
    encoded['feature1'] = _int64_feature(feature1)
    encoded['feature2'] = _bytes_feature(feature2)
    encoded['feature3'] = _float_feature(feature3)

    # Wrap the map in a Features message, then in an Example, and serialize it.
    features_msg = tf.train.Features(feature=encoded)
    example_msg = tf.train.Example(features=features_msg)
    return example_msg.SerializeToString()

# Try the serializer on one hand-written observation and print the resulting bytes.
example = serialize_example(False, 4, b'goat', 0.9876)
print(example)

b'\nR\n\x14\n\x08feature2\x12\x08\n\x06\n\x04goat\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04[\xd3|?\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x04'


## TFRecord files using tf.data
### **Writing a TFRecord file**<br>
Cách đơn giản nhất để tạo dataset từ các features là sử dụng method **from_tensor_slices**<br>
Từ đó có thể can thiệp để tiền xử lý thông qua method **tf.data.Dataset.map**, áp dụng cho mỗi element trong Dataset.


In [7]:
# Build a dataset from the four parallel feature arrays; the loops further down
# unpack each element as a (feature0, feature1, feature2, feature3) tuple.
dataset = tf.data.Dataset.from_tensor_slices((feature0, feature1, feature2, feature3))

In [8]:
# Serialize the first dataset element and print it.
# The loop targets are named f0..f3 on purpose: unpacking into feature0..feature3
# would overwrite the NumPy arrays of the same names, and re-running the cell that
# builds `dataset` would then slice single tensors instead of the full arrays.
for f0, f1, f2, f3 in dataset.take(1):
    serialized_example = serialize_example(f0, f1, f2, f3)
    print(serialized_example)

Instructions for updating:
Colocations handled automatically by placer.
b'\nU\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x01\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x02\n\x17\n\x08feature2\x12\x0b\n\t\n\x07chicken\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04\xa6y\xbc?'


In order to write the data into a TFRecord file, we need to convert each data point into a byte string following the above process and write it to the file using a tf.io.TFRecordWriter.

In [11]:
file_path = 'data.tfrecords'

# Write every observation to the TFRecord file, one serialized Example per record.
# The `with` block closes the writer once the loop finishes.
with tf.io.TFRecordWriter(file_path) as writer:
    # Loop targets are f0..f3 rather than feature0..feature3 so that the NumPy
    # arrays of those names are not overwritten by the last element's tensors.
    for f0, f1, f2, f3 in dataset:
        serialized_example = serialize_example(f0, f1, f2, f3)
        writer.write(serialized_example)

### **Read a TFRecord file**<br>

In [13]:
# Open the file written above as a dataset of serialized records; the bare
# `raw_dataset` on the last line displays the dataset's repr as the cell output.
file_paths = [file_path] # We have only one file
raw_dataset = tf.data.TFRecordDataset(file_paths)
raw_dataset

<TFRecordDatasetV1 shapes: (), types: tf.string>

In [14]:
# Peek at the first ten records; each one is still an unparsed, serialized string tensor.
for raw_record in raw_dataset.take(10):
    print(repr(raw_record))

<tf.Tensor: id=90071, shape=(), dtype=string, numpy=b'\nU\n\x17\n\x08feature2\x12\x0b\n\t\n\x07chicken\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04\xa6y\xbc?\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x01\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x02'>
<tf.Tensor: id=90073, shape=(), dtype=string, numpy=b'\nS\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x03\n\x15\n\x08feature2\x12\t\n\x07\n\x05horse\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04\xb33\x97\xbf'>
<tf.Tensor: id=90075, shape=(), dtype=string, numpy=b'\nQ\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x00\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x01\n\x13\n\x08feature2\x12\x07\n\x05\n\x03dog\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04\xad\xc9\x91\xbf'>
<tf.Tensor: id=90077, shape=(), dtype=string, numpy=b'\nR\n\x11\n\x08feature0\x12\x05\x1a\x03\n\x01\x01\n\x11\n\x08feature1\x12\x05\x1a\x03\n\x01\x04\n\x14\n\x08feature2\x12\x08\n\x06\n\x04goat\n\x14\n\x08feature3\x12\x08\x12\x06\n\x04\x03\xecL\xbd

In [15]:
# Parsing schema: one entry per feature name written by `serialize_example`.
# Every feature is declared as a scalar (shape []) of the dtype it was stored
# with, plus a default value to use when a record lacks that key.
feature_description = {
    'feature0': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'feature1': tf.io.FixedLenFeature([], tf.int64, default_value=0),
    'feature2': tf.io.FixedLenFeature([], tf.string, default_value=''),
    'feature3': tf.io.FixedLenFeature([], tf.float32, default_value=0.0),
}

def _parse_function(example_proto):
    """Decode one serialized `tf.Example` using `feature_description`.

    Args:
        example_proto: A scalar string tensor holding one serialized Example.

    Returns:
        Whatever `tf.io.parse_single_example` yields for the schema: a dict
        keyed by the feature names in `feature_description`.
    """
    parsed_features = tf.io.parse_single_example(
        serialized=example_proto,
        features=feature_description,
    )
    return parsed_features

In [16]:
# Apply the parser to every record of the raw dataset; the bare `parsed_dataset`
# on the last line displays the resulting dataset's repr as the cell output.
parsed_dataset = raw_dataset.map(_parse_function)
parsed_dataset

<DatasetV1Adapter shapes: {feature0: (), feature1: (), feature2: (), feature3: ()}, types: {feature0: tf.int64, feature1: tf.int64, feature2: tf.string, feature3: tf.float32}>

In [17]:
# Peek at the first ten parsed records; each one is a dict of feature name -> tensor.
for parsed_record in parsed_dataset.take(10):
    print(repr(parsed_record))

{'feature0': <tf.Tensor: id=90119, shape=(), dtype=int64, numpy=1>, 'feature1': <tf.Tensor: id=90120, shape=(), dtype=int64, numpy=2>, 'feature2': <tf.Tensor: id=90121, shape=(), dtype=string, numpy=b'chicken'>, 'feature3': <tf.Tensor: id=90122, shape=(), dtype=float32, numpy=1.4724624>}
{'feature0': <tf.Tensor: id=90127, shape=(), dtype=int64, numpy=0>, 'feature1': <tf.Tensor: id=90128, shape=(), dtype=int64, numpy=3>, 'feature2': <tf.Tensor: id=90129, shape=(), dtype=string, numpy=b'horse'>, 'feature3': <tf.Tensor: id=90130, shape=(), dtype=float32, numpy=-1.1812652>}
{'feature0': <tf.Tensor: id=90135, shape=(), dtype=int64, numpy=0>, 'feature1': <tf.Tensor: id=90136, shape=(), dtype=int64, numpy=1>, 'feature2': <tf.Tensor: id=90137, shape=(), dtype=string, numpy=b'dog'>, 'feature3': <tf.Tensor: id=90138, shape=(), dtype=float32, numpy=-1.1389672>}
{'feature0': <tf.Tensor: id=90143, shape=(), dtype=int64, numpy=1>, 'feature1': <tf.Tensor: id=90144, shape=(), dtype=int64, numpy=4>, 'f