# tf.TFRecord API使用

In [1]:
# 导入
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf
from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl,np,pd,sklearn,tf,keras:
    print(module.__name__,module.__version__)

2.0.0
sys.version_info(major=3, minor=6, micro=10, releaselevel='final', serial=0)
matplotlib 3.1.2
numpy 1.18.1
pandas 0.25.3
sklearn 0.22.1
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


## tfrecord 基础API使用

tfrecord 是一种文件格式

-> tf.train.Example

    -> tf.train.Features -> {"key": tf.train.Feature}
   
        -> tf.train.Feature -> tf.train.ByteList/FloatList/Int64List

In [2]:
# tf.train.ByteList/FloatList/Int64List
favorite_books = [name.encode('utf-8') for name in ['machine learning', 'cc150']]
favorite_books_bytelist = tf.train.BytesList(value = favorite_books)
print(favorite_books_bytelist)

hours_floatlist = tf.train.FloatList(value = [15.5, 9.5, 7.0, 8.0])
print(hours_floatlist)

age_int64list = tf.train.Int64List(value = [42])
print(age_int64list)

value: "machine learning"
value: "cc150"

value: 15.5
value: 9.5
value: 7.0
value: 8.0

value: 42



In [3]:
# tf.train.Features and tf.train.Feature
features = tf.train.Features(
    feature = {
        'favorite_books': tf.train.Feature(bytes_list = favorite_books_bytelist),
        'hours': tf.train.Feature(float_list = hours_floatlist),
        'age': tf.train.Feature(int64_list = age_int64list)
    }
)
print(features)

feature {
  key: "age"
  value {
    int64_list {
      value: 42
    }
  }
}
feature {
  key: "favorite_books"
  value {
    bytes_list {
      value: "machine learning"
      value: "cc150"
    }
  }
}
feature {
  key: "hours"
  value {
    float_list {
      value: 15.5
      value: 9.5
      value: 7.0
      value: 8.0
    }
  }
}



In [4]:
# tf.train.Example
example = tf.train.Example(features=features)
print(example)

# 序列化，压缩大小
serialized_example = example.SerializeToString()
print(serialized_example)

features {
  feature {
    key: "age"
    value {
      int64_list {
        value: 42
      }
    }
  }
  feature {
    key: "favorite_books"
    value {
      bytes_list {
        value: "machine learning"
        value: "cc150"
      }
    }
  }
  feature {
    key: "hours"
    value {
      float_list {
        value: 15.5
        value: 9.5
        value: 7.0
        value: 8.0
      }
    }
  }
}

b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A'


### 对tfrecord进行保存、读取

In [5]:
output_dir = os.path.join('tfrecord_basic')
if not  os.path.exists(output_dir):
    os.mkdir(output_dir)
filename = 'test.tfrecords'
filename_fullpath = os.path.join(output_dir, filename)

# 保存example
with tf.io.TFRecordWriter(filename_fullpath) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [6]:
# 读取
dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    print(serialized_example_tensor)

tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A', shape=(), dtype=string)
tf.Tensor(b'\n\\\n-\n\x0efavorite_books\x12\x1b\n\x19\n\x10machine learning\n\x05cc150\n\x0c\n\x03age\x12\x05\x1a\x03\n\x01*\n\x1d\n\x05hours\x12\x14\x12\x12\n\x10\x00\x00xA\x00\x00\x18A\x00\x00\xe0@\x00\x00\x00A', shape=(), dtype=string)


In [7]:
# 解析后读取
expected_features = {
    'favorite_books': tf.io.VarLenFeature(dtype=tf.string),
    'hours': tf.io.VarLenFeature(dtype=tf.float32),
    'age': tf.io.FixedLenFeature([], dtype=tf.int64),
}

dataset = tf.data.TFRecordDataset([filename_fullpath])
for serialized_example_tensor in dataset:
    example = tf.io.parse_single_example(serialized_example_tensor, expected_features)
    print(example)
    books = tf.sparse.to_dense(example['favorite_books'], default_value=b'')
    for book in books:
        print(book.numpy().decode('UTF-8'))

{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022FA01D36D8>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022F8872D908>, 'age': <tf.Tensor: id=46, shape=(), dtype=int64, numpy=42>}
machine learning
cc150
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022FA961AA90>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022FA9614AC8>, 'age': <tf.Tensor: id=65, shape=(), dtype=int64, numpy=42>}
machine learning
cc150
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022FA01D35F8>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022F88717278>, 'age': <tf.Tensor: id=84, shape=(), dtype=int64, numpy=42>}
machine learning
cc150


In [8]:
# 压缩zip保存example
filename_fullpath_zip = filename_fullpath + '.zip'
options = tf.io.TFRecordOptions(compression_type='GZIP')
with tf.io.TFRecordWriter(filename_fullpath_zip, options) as writer:
    for i in range(3):
        writer.write(serialized_example)

In [9]:
# 对压缩后的tfrecord进行读取
dataset_zip = tf.data.TFRecordDataset([filename_fullpath_zip], compression_type='GZIP')
for serialized_example_tensor in dataset_zip:
    example = tf.io.parse_single_example(serialized_example_tensor, expected_features)
    print(example)
    books = tf.sparse.to_dense(example['favorite_books'], default_value=b'')
    for book in books:
        print(book.numpy().decode('UTF-8'))

{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022FA9614DD8>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022FA9640240>, 'age': <tf.Tensor: id=120, shape=(), dtype=int64, numpy=42>}
machine learning
cc150
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022F88717F98>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022FA01D3F28>, 'age': <tf.Tensor: id=139, shape=(), dtype=int64, numpy=42>}
machine learning
cc150
{'favorite_books': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022FA961A8D0>, 'hours': <tensorflow.python.framework.sparse_tensor.SparseTensor object at 0x0000022FA9640438>, 'age': <tf.Tensor: id=158, shape=(), dtype=int64, numpy=42>}
machine learning
cc150


## 生成tfrecords文件
将csv文件进行转化生成tfrecord文件

In [10]:
source_dir = 'generate_csv'
print(os.listdir(source_dir))

['test_00.csv', 'test_01.csv', 'test_02.csv', 'test_03.csv', 'test_04.csv', 'test_05.csv', 'test_06.csv', 'test_07.csv', 'test_08.csv', 'test_09.csv', 'train_00.csv', 'train_01.csv', 'train_02.csv', 'train_03.csv', 'train_04.csv', 'train_05.csv', 'train_06.csv', 'train_07.csv', 'train_08.csv', 'train_09.csv', 'train_10.csv', 'train_11.csv', 'train_12.csv', 'train_13.csv', 'train_14.csv', 'train_15.csv', 'train_16.csv', 'train_17.csv', 'train_18.csv', 'train_19.csv', 'valid_00.csv', 'valid_01.csv', 'valid_02.csv', 'valid_03.csv', 'valid_04.csv', 'valid_05.csv', 'valid_06.csv', 'valid_07.csv', 'valid_08.csv', 'valid_09.csv']


In [11]:
# 根据前缀划分文件
def get_filename_by_prefix(source_dir, prefix_name):
    all_files = os.listdir(source_dir)
    results = []
    for filename in all_files:
        if filename.startswith(prefix_name):
            results.append(os.path.join(source_dir, filename))
    return results

train_filenames = get_filename_by_prefix(source_dir, 'train')
valid_filenames = get_filename_by_prefix(source_dir, 'valid')
test_filenames = get_filename_by_prefix(source_dir, 'test')

import pprint
pprint.pprint(train_filenames)
pprint.pprint(valid_filenames)
pprint.pprint(test_filenames)

['generate_csv\\train_00.csv',
 'generate_csv\\train_01.csv',
 'generate_csv\\train_02.csv',
 'generate_csv\\train_03.csv',
 'generate_csv\\train_04.csv',
 'generate_csv\\train_05.csv',
 'generate_csv\\train_06.csv',
 'generate_csv\\train_07.csv',
 'generate_csv\\train_08.csv',
 'generate_csv\\train_09.csv',
 'generate_csv\\train_10.csv',
 'generate_csv\\train_11.csv',
 'generate_csv\\train_12.csv',
 'generate_csv\\train_13.csv',
 'generate_csv\\train_14.csv',
 'generate_csv\\train_15.csv',
 'generate_csv\\train_16.csv',
 'generate_csv\\train_17.csv',
 'generate_csv\\train_18.csv',
 'generate_csv\\train_19.csv']
['generate_csv\\valid_00.csv',
 'generate_csv\\valid_01.csv',
 'generate_csv\\valid_02.csv',
 'generate_csv\\valid_03.csv',
 'generate_csv\\valid_04.csv',
 'generate_csv\\valid_05.csv',
 'generate_csv\\valid_06.csv',
 'generate_csv\\valid_07.csv',
 'generate_csv\\valid_08.csv',
 'generate_csv\\valid_09.csv']
['generate_csv\\test_00.csv',
 'generate_csv\\test_01.csv',
 'generate

In [12]:
# 读取csv文件
def parse_csv_line(line, n_fields=9):
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1]) # 将前n-1条数据转化为x向量
    y = tf.stack(parsed_fields[-1:]) # 将最后一条转化为y向量
    return x, y

def csv_reader_dateset(filenames, n_readers=5, batch_size=32,
                       n_parse_threads=5, shuffle_buffer_size=10000):
    # 1.filename -> filename dataset
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat() # 将数据重复无限次
    # 2.filename dataset -> text dataset 一对多的关系
    dataset = dataset.interleave(
        lambda filename : tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers
    )
    dataset.shuffle(shuffle_buffer_size) # 将数据进行混排
    # 3.parse csv 一对一关系
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

batch_size = 32
train_set = csv_reader_dateset(train_filenames, batch_size=batch_size)
valid_set = csv_reader_dateset(valid_filenames, batch_size=batch_size)
test_set = csv_reader_dateset(test_filenames, batch_size=batch_size)

In [13]:
# 将csv读取出来的dataset转化为tfrecord格式
def serialize_example(x, y):
    """Converts x,y to tf.train.Example and serialize"""
    input_features = tf.train.FloatList(value = x)
    lable = tf.train.FloatList(value = y)
    features = tf.train.Features(
        feature = {
            'input_features': tf.train.Feature(float_list = input_features),
            'lable': tf.train.Feature(float_list = lable)
        }
    )
    example = tf.train.Example(features = features)
    return example.SerializeToString()

def csv_dataset_to_tfrecord(base_filename, dataset, n_shards,
                            steps_per_shard, compression_type = None):
    options = tf.io.TFRecordOptions(compression_type=compression_type)
    all_filenames = []
    
    for shard_id in range(n_shards):
        filename_fullpath = '{}_{:05d}-of-{:05d}'.format(
            base_filename, shard_id, n_shards)
        with tf.io.TFRecordWriter(filename_fullpath, options) as writer:
            # 从当前样本取出前steps_per_shard个
            for x_batch, y_batch in dataset.take(steps_per_shard): 
                # 对获得的每个batch进行解包
                for x_example, y_example in zip(x_batch, y_batch): 
                    # 封装成序列化的example，写到文件中
                    writer.write(serialize_example(x_example, y_example))
        all_filenames.append(filename_fullpath)
    return all_filenames


In [14]:
n_shards = 20
train_steps_per_shard = 11610 // batch_size // n_shards
valid_steps_per_shard = 3880 // batch_size // n_shards
test_steps_per_shard = 5170 // batch_size // n_shards

output_dir = 'generate_tfrecords'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    
train_basename = os.path.join(output_dir, 'train')
valid_basename = os.path.join(output_dir, 'valid')
test_basename = os.path.join(output_dir, 'test')

train_tfrecord_filenames = csv_dataset_to_tfrecord(
    train_basename, train_set, n_shards, train_steps_per_shard, None)
valid_tfrecord_filenames = csv_dataset_to_tfrecord(
    valid_basename, valid_set, n_shards, valid_steps_per_shard, None)
test_tfrecord_filenames = csv_dataset_to_tfrecord(
    test_basename, test_set, n_shards, test_steps_per_shard, None)

In [15]:
# 保存成压缩后的
n_shards = 20
train_steps_per_shard = 11610 // batch_size // n_shards
valid_steps_per_shard = 3880 // batch_size // n_shards
test_steps_per_shard = 5170 // batch_size // n_shards

output_dir = 'generate_tfrecords_zip'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    
train_basename = os.path.join(output_dir, 'train')
valid_basename = os.path.join(output_dir, 'valid')
test_basename = os.path.join(output_dir, 'test')

train_tfrecord_filenames = csv_dataset_to_tfrecord(
    train_basename, train_set, n_shards, train_steps_per_shard, compression_type='GZIP')
valid_tfrecord_filenames = csv_dataset_to_tfrecord(
    valid_basename, valid_set, n_shards, valid_steps_per_shard, compression_type='GZIP')
test_tfrecord_filenames = csv_dataset_to_tfrecord(
    test_basename, test_set, n_shards, test_steps_per_shard, compression_type='GZIP')

## 读取tfrecord文件

In [16]:
pprint.pprint(train_tfrecord_filenames)
pprint.pprint(valid_tfrecord_filenames)
pprint.pprint(test_tfrecord_filenames)

['generate_tfrecords_zip\\train_00000-of-00020',
 'generate_tfrecords_zip\\train_00001-of-00020',
 'generate_tfrecords_zip\\train_00002-of-00020',
 'generate_tfrecords_zip\\train_00003-of-00020',
 'generate_tfrecords_zip\\train_00004-of-00020',
 'generate_tfrecords_zip\\train_00005-of-00020',
 'generate_tfrecords_zip\\train_00006-of-00020',
 'generate_tfrecords_zip\\train_00007-of-00020',
 'generate_tfrecords_zip\\train_00008-of-00020',
 'generate_tfrecords_zip\\train_00009-of-00020',
 'generate_tfrecords_zip\\train_00010-of-00020',
 'generate_tfrecords_zip\\train_00011-of-00020',
 'generate_tfrecords_zip\\train_00012-of-00020',
 'generate_tfrecords_zip\\train_00013-of-00020',
 'generate_tfrecords_zip\\train_00014-of-00020',
 'generate_tfrecords_zip\\train_00015-of-00020',
 'generate_tfrecords_zip\\train_00016-of-00020',
 'generate_tfrecords_zip\\train_00017-of-00020',
 'generate_tfrecords_zip\\train_00018-of-00020',
 'generate_tfrecords_zip\\train_00019-of-00020']
['generate_tfrecords

In [17]:
# 定义期望的特征
expected_features = {
    'input_features': tf.io.FixedLenFeature([8], dtype=tf.float32),
    'lable': tf.io.FixedLenFeature([1], dtype=tf.float32)
}

# 解析example
def parse_example(serialize_example):
    example = tf.io.parse_single_example(serialize_example, expected_features)
    return example['input_features'], example['lable']

# 从文件列表到dataset的转变
def tfrecords_reader_dateset(filenames, n_readers=5, batch_size=32,
                             n_parse_threads=5, shuffle_buffer_size=10000):
    # 1.filename -> filename dataset
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat() # 将数据重复无限次
    # 2.filename dataset -> tfrecord dataset 一对多的关系
    dataset = dataset.interleave(
        lambda filename : tf.data.TFRecordDataset(filename, compression_type='GZIP'),
        cycle_length=n_readers
    )
    dataset.shuffle(shuffle_buffer_size) # 将数据进行混排
    # 3.parse csv 一对一关系
    dataset = dataset.map(parse_example, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

In [18]:
# test
tfrecords_train = tfrecords_reader_dateset(train_tfrecord_filenames, batch_size=3)
for x_batch, y_batch in tfrecords_train.take(2):
    print(x_batch)
    print(y_batch)

tf.Tensor(
[[ 0.4240821   0.91296333 -0.04437482 -0.15297213 -0.24727628 -0.10539167
   0.86126745 -1.335779  ]
 [ 0.04326301 -1.0895426  -0.38878718 -0.10789865 -0.68186635 -0.0723871
  -0.8883662   0.8213992 ]
 [ 0.04326301 -1.0895426  -0.38878718 -0.10789865 -0.68186635 -0.0723871
  -0.8883662   0.8213992 ]], shape=(3, 8), dtype=float32)
tf.Tensor(
[[3.955]
 [1.426]
 [1.426]], shape=(3, 1), dtype=float32)
tf.Tensor(
[[-1.0591781   1.3935647  -0.02633197 -0.1100676  -0.6138199  -0.09695935
   0.3247131  -0.03747724]
 [ 0.8015443   0.27216142 -0.11624393 -0.20231152 -0.5430516  -0.02103962
  -0.5897621  -0.08241846]
 [-1.0775077  -0.4487407  -0.5680568  -0.14269263 -0.09666677  0.12326469
  -0.31448638 -0.4818959 ]], shape=(3, 8), dtype=float32)
tf.Tensor(
[[0.672]
 [3.226]
 [0.978]], shape=(3, 1), dtype=float32)


In [19]:
# 生成训练中dataset
batch_size = 32
tfrecords_train_set = tfrecords_reader_dateset(
    train_tfrecord_filenames, batch_size=batch_size)
tfrecords_valid_set = tfrecords_reader_dateset(
    valid_tfrecord_filenames, batch_size=batch_size)
tfrecords_test_set = tfrecords_reader_dateset(
    test_tfrecord_filenames, batch_size=batch_size)

## 在keras中使用

In [20]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu',input_shape=[8]),
    keras.layers.Dense(1)
])
model.summary()

model.compile(loss='mean_squared_error',optimizer='adam')

callbacks = [keras.callbacks.EarlyStopping(patience=5,min_delta=1e-2)]
history = model.fit(tfrecords_train_set,
                    validation_data = tfrecords_valid_set,
                    steps_per_epoch = 11160 // batch_size,
                    validation_steps = 3870 // batch_size,
                    epochs = 100,
                    callbacks = callbacks)

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 30)                270       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 31        
Total params: 301
Trainable params: 301
Non-trainable params: 0
_________________________________________________________________
Train for 348 steps, validate for 120 steps
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100


In [21]:
model.evaluate(tfrecords_test_set, steps = 5160 // batch_size)



0.3255322992524005