In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

# Record the interpreter and library versions this notebook was executed with.
print(tf.__version__)
print(sys.version_info)
for module in (mpl, np, pd, sklearn, tf, keras):
    # Same "<name> <version>" line that print(a, b) would produce.
    print(f"{module.__name__} {module.__version__}")

2.0.0-beta1
sys.version_info(major=3, minor=6, micro=8, releaselevel='final', serial=0)
matplotlib 3.1.1
numpy 1.16.4
pandas 0.24.2
sklearn 0.21.2
tensorflow 2.0.0-beta1
tensorflow.python.keras.api._v2.keras 2.2.4-tf


In [5]:
'''
TFRecord file layout:
    tf.train.Example {
        tf.train.Features {                  # dict-like: {"key": tf.train.Feature}
            key: tf.train.Feature {
                tf.train.BytesList / FloatList / Int64List
            }
        }
    }
'''

# Bytes feature: every string has to be encoded to bytes first.
favorite_books = [
    title.encode('utf-8') for title in ("machine learning", "cc150")
]
print(favorite_books)
favorite_books_bytelist = tf.train.BytesList(value=favorite_books)
print(favorite_books_bytelist)

# Float feature. Precision caveat: the printed values come back as
# 2.299999952316284 and 4.400000095367432 rather than exactly 2.3 and 4.4.
hours_floatlist = tf.train.FloatList(value=[1.5, 2.3, 3.5, 4.4])
print(hours_floatlist)

# Integer feature.
age_int64list = tf.train.Int64List(value=[11, 12, 13, 14])
print(age_int64list)

# Wrap each typed list in a tf.train.Feature and collect them under their keys.
features = tf.train.Features(feature={
    "favorite_books": tf.train.Feature(bytes_list=favorite_books_bytelist),
    "hours": tf.train.Feature(float_list=hours_floatlist),
    "age": tf.train.Feature(int64_list=age_int64list),
})
print(features)

[b'machine learning', b'cc150']
value: "machine learning"
value: "cc150"

value: 1.5
value: 2.299999952316284
value: 3.5
value: 4.400000095367432

value: 11
value: 12
value: 13
value: 14

feature {
  key: "age"
  value {
    int64_list {
      value: 11
      value: 12
      value: 13
      value: 14
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 1.5
      value: 2.299999952316284
      value: 3.5
      value: 4.400000095367432
    }
  }
}



In [36]:
# Wrap the Features message in a tf.train.Example, the unit that gets
# serialized into a TFRecord file, and show its text representation.
example = tf.train.Example(features=features)
print(example)

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 11
        value: 12
        value: 13
        value: 14
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 1.5
        value: 2.299999952316284
        value: 3.5
        value: 4.400000095367432
      }
    }
  }
}



In [37]:
# Serialize the Example protocol message to a bytes string; this is the
# payload written as one record of the TFRecord file.
serialized_example = example.SerializeToString()
print(serialized_example)

b'\n_\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0f\n\x03age\x12\x08\x1a\x06\n\x04\x0b\x0c\r\x0e\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00\xc0?33\x13@\x00\x00`@\xcd\xcc\x8c@'


In [41]:
# Write the same serialized Example three times into an uncompressed
# TFRecord file under ./tfrecord_basic.
output_path = 'tfrecord_basic'
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(output_path, exist_ok=True)
filename = 'test.tf.records'
filename_fullpath = os.path.join(output_path, filename)
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [44]:
# Reading the file back yields one scalar string tensor per record; each
# tensor holds the bytes of a serialized Example.
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    print(serialized_example_tensor)
    print('\n')

tf.Tensor(b'\n_\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0f\n\x03age\x12\x08\x1a\x06\n\x04\x0b\x0c\r\x0e\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00\xc0?33\x13@\x00\x00`@\xcd\xcc\x8c@', shape=(), dtype=string)


tf.Tensor(b'\n_\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0f\n\x03age\x12\x08\x1a\x06\n\x04\x0b\x0c\r\x0e\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00\xc0?33\x13@\x00\x00`@\xcd\xcc\x8c@', shape=(), dtype=string)


tf.Tensor(b'\n_\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0f\n\x03age\x12\x08\x1a\x06\n\x04\x0b\x0c\r\x0e\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00\xc0?33\x13@\x00\x00`@\xcd\xcc\x8c@', shape=(), dtype=string)




In [59]:
"""
expected_features maps each feature key to the spec used to parse it.
tf.io.VarLenFeature declares the dtype of a variable-length feature; the
parsed value is a sparse tensor.
"""

expected_features = {
    "favorite_books": tf.io.VarLenFeature(dtype=tf.string),
    "hours": tf.io.VarLenFeature(dtype=tf.float32),
    "age": tf.io.VarLenFeature(dtype=tf.int64),
}

for serialized_example_tensor in dataset:
    # Use a dedicated name: rebinding `example` here would clobber the
    # tf.train.Example proto built earlier and break re-running the
    # cell that calls example.SerializeToString().
    parsed_example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features)
    # Converting sparse -> dense requires a default fill value.
    hours_dense = tf.sparse.to_dense(parsed_example["hours"], default_value=0.0)
    print(hours_dense, "\n")
    

tf.Tensor([1.5 2.3 3.5 4.4], shape=(4,), dtype=float32) 

tf.Tensor([1.5 2.3 3.5 4.4], shape=(4,), dtype=float32) 

tf.Tensor([1.5 2.3 3.5 4.4], shape=(4,), dtype=float32) 



In [64]:
## GZIP-compressed TFRecord file
# NOTE: the ".zip" suffix is only part of the file name; the records are
# written with GZIP compression, so readers must pass compression_type="GZIP".
filename_fullpath_zip = filename_fullpath + '.zip'
options = tf.io.TFRecordOptions(compression_type='GZIP')
# Pass the options object to the writer so the compression setting has a
# single source instead of being repeated as a separate string literal.
with tf.io.TFRecordWriter(filename_fullpath_zip, options) as writer:
    for i in range(3):
        writer.write(serialized_example)
print("done")

done


In [68]:
# Read the GZIP-compressed file back; the compression type must be given
# explicitly when constructing the dataset.
dataset_zip = tf.data.TFRecordDataset(
    [filename_fullpath_zip], compression_type="GZIP")
for serialized_example_tensor in dataset_zip:
    # Dedicated name so the global `example` (the tf.train.Example proto
    # built earlier) is not overwritten by a dict of parsed tensors.
    parsed_example = tf.io.parse_single_example(
        serialized_example_tensor,
        expected_features)
    # String features are parsed as sparse tensors; densify with an empty
    # bytes default, then decode each entry for display.
    books = tf.sparse.to_dense(
        parsed_example["favorite_books"], default_value=b"")
    for book in books:
        print(book.numpy().decode("UTF-8"))

machine learning
cc150
machine learning
cc150
machine learning
cc150


In [11]:
# IPython introspection: a trailing `?` displays the signature, docstring,
# defining file and type of tf.train.Example (development aid only).
tf.train.Example?

Init signature: tf.train.Example(self, /, *args, **kwargs)
Docstring:      A ProtocolMessage
File:           /anaconda3/envs/tylab/lib/python3.6/site-packages/tensorflow/core/example/example_pb2.py
Type:           GeneratedProtocolMessageType
Subclasses:     
