# tf.dataset的使用

In [1]:
import matplotlib as mlt
import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import pandas as pd

import sys
import os
import time

import tensorflow as tf
from tensorflow import keras
print(tf.__version__)

2.0.0


tf.data.Dataset.from_tensor_slices

In [2]:
# 从numpy中构建
dataset = tf.data.Dataset.from_tensor_slices(np.arange(10))
print(dataset)
for item in dataset:
    print(item)

<TensorSliceDataset shapes: (), types: tf.int32>
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


In [3]:
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [4]:
dataset2 = dataset.interleave(lambda v: tf.data.Dataset.from_tensor_slices(v),
                             cycle_length=5,
                             block_length=5)
for item in dataset2:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(0, shape=(), dtype

In [6]:
# 从元组中构建

x = np.array([[1., 2.], [3., 4.], [5., 6.]])
y = np.array(["cat", "dog", "fox"])
dataset3 = tf.data.Dataset.from_tensor_slices((x, y))
print(dataset3)
for item_x, item_y in dataset3:
    print(item_x.numpy(), item_y.numpy())

<TensorSliceDataset shapes: ((2,), ()), types: (tf.float64, tf.string)>
[1. 2.] b'cat'
[3. 4.] b'dog'
[5. 6.] b'fox'


In [8]:
# 从字典中构建
dataset4 = tf.data.Dataset.from_tensor_slices({"feature":x, "label": y})
for item in dataset4:
    print(item["feature"].numpy(), item["label"].numpy())

[1. 2.] b'cat'
[3. 4.] b'dog'
[5. 6.] b'fox'


从csv文件中读取数据

In [11]:
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
housing = fetch_california_housing()
x_train_all, x_test, y_train_all, y_test = train_test_split(housing.data,
                                                           housing.target,
                                                           test_size=0.25,
                                                           random_state=7)
x_train, x_val, y_train, y_val = train_test_split(x_train_all,
                                                  y_train_all,
                                                  test_size=0.25,
                                                  random_state=7)
print("train shape: ", x_train.shape)
print("validation shape: ", x_val.shape)
print("test shape: ", x_test.shape)

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_val_scaled = scaler.transform(x_val)
x_test_scaled = scaler.transform(x_test)

train shape:  (11610, 8)
validation shape:  (3870, 8)
test shape:  (5160, 8)


In [17]:
output_dir = "generate csv"
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    
def save_to_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    file_names = []
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        file_names.append(part_csv)
        with open(part_csv, "wt", encoding='utf-8') as f:
            if header:
                f.write(header+"\n")
            for row_idx in row_indices:
                f.write(",".join(repr(col) for col in data[row_idx]))
                f.write("\n")
    return file_names

train_data = np.c_[x_train_scaled, y_train]
val_data = np.c_[x_val_scaled, y_val]
test_data = np.c_[x_test_scaled, y_test]
header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ",".join(header_cols)

train_files = save_to_csv(output_dir, train_data, "train", header_str, n_parts=20)
val_files = save_to_csv(output_dir, val_data, "val", header_str, n_parts=10)
test_files = save_to_csv(output_dir, test_data, "test", header_str, n_parts=10)

tf.io.decode_csv的使用

tensorflow中读完系列文件的过程
1. 将所有需要读取的文件的文件名写入dataset
2. 对于文件名生成的dataset去读取文件内容形成文件内容的satasets
3. 解析文件中的内容（tf.io.decoder_csv()）

In [19]:
# 针对读取文件名有一个专门的api
# keras.data.Dataset.list_files
filename_dataset = tf.data.Dataset.list_files(train_files)
for filename in filename_dataset:
    print(filename)

tf.Tensor(b'generate csv\\train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_08.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_09.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generate csv\

In [25]:
# interleave会遍历filename_dataset的每一个元素，进行操作，然后将操作的结果组合起来
# tf.data.TextLineDataset会按行去去读文件中的文本内容

n_readers = 5
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1),# 加skip是跳过第一行，避免header被读取到dataset中
                                    cycle_length=n_readers)
for item in dataset.take(15):
    print(item.numpy())

b'-0.7628576678376784,0.8238167412601372,0.10041321355962941,-0.06324778224149417,-0.2639595301791804,-0.06119686943141775,0.5297853063545601,-0.1085175850218531,0.719'
b'0.8095613521801492,0.34518428537112195,0.22347987936517727,-0.10669826574327357,-0.17047084206891044,0.008573079922849763,-0.841838842223145,0.739231082317489,2.349'
b'-0.45521966900167576,-0.2929923224808983,-0.3987316967655819,0.048342385355644925,0.8612736888777626,-0.03406098870043033,-0.6873555763423803,0.5996018900498323,2.042'
b'-0.5153188550167574,1.6215375010751625,-0.5674383914596058,-0.09192162409389779,-0.6101203483172071,-0.1362930022629114,1.0353669037825188,-1.335259774230543,2.113'
b'0.24184624203240349,1.6215375010751625,0.009664811030936506,-0.05126299611752959,0.0030306872168157756,-0.12848071177796017,-0.7575752426518185,0.5547210782495127,5.00001'
b'-0.8602543240469787,0.5845005133156296,-0.5401804329592943,-0.29451363578528666,-0.18984237203770513,-0.10174016540818816,-0.6920368874296753,0.574668

In [32]:
 def parse_csv_line(line, n_fields=9):
        defs = [tf.constant(np.nan)] * n_fields
        parse_line = tf.io.decode_csv(line, record_defaults=defs)
        x = tf.stack(parse_line[:-1])
        y = tf.stack(parse_line[-1:])
        return x, y
    
parse_csv_line(b'-0.7628576678376784,0.8238167412601372,0.10041321355962941,-0.06324778224149417,-0.2639595301791804,-0.06119686943141775,0.5297853063545601,-0.1085175850218531,0.719',
              n_fields=9)

(<tf.Tensor: id=492, shape=(8,), dtype=float32, numpy=
 array([-0.7628577 ,  0.8238167 ,  0.10041321, -0.06324778, -0.26395953,
        -0.06119687,  0.52978534, -0.10851759], dtype=float32)>,
 <tf.Tensor: id=493, shape=(1,), dtype=float32, numpy=array([0.719], dtype=float32)>)

In [33]:
# 将前面的整合一下，定义出一个完整的函数
def csv_reader_dataset(filenames, n_readers=5, batch_size=32, n_parse_thread=5,
                      shuffler_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
                                cycle_length=n_readers)
    dataset.shuffle(shuffler_buffer_size)
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_thread)
    dataset = dataset.batch(batch_size)
    return dataset

train_set = csv_reader_dataset(train_files, batch_size=3)
for x_batch, y_batch in train_set.take(5):
    print("x: ",x_batch)
    print("y: ",y_batch)

x:  tf.Tensor(
[[-0.5153189   1.6215374  -0.56743836 -0.09192163 -0.61012036 -0.13629301
   1.0353669  -1.3352598 ]
 [ 1.1558194  -0.13344817  0.6085721   0.04220942  1.167007    0.04454635
  -0.73885     0.9087808 ]
 [-0.7826967  -0.7716248  -0.0938434   0.14471786  0.18748134 -0.02488174
   0.20677485  0.33031702]], shape=(3, 8), dtype=float32)
y:  tf.Tensor(
[[2.113]
 [2.148]
 [0.9  ]], shape=(3, 1), dtype=float32)
x:  tf.Tensor(
[[-1.6167846  -0.45253646 -0.02914891  0.0877159  -0.31786293  0.02373059
   1.007279   -1.3452333 ]
 [ 1.1680932  -1.4895735   0.3828349  -0.23364641  4.9141345   0.00602934
   1.08218    -1.3352598 ]
 [-1.2395775   1.8608537  -1.1683363   0.02267936  0.21611752 -0.06696921
   0.9979164  -1.4050744 ]], shape=(3, 8), dtype=float32)
y:  tf.Tensor(
[[1.125]
 [2.724]
 [2.375]], shape=(3, 1), dtype=float32)
x:  tf.Tensor(
[[ 0.8274959  -0.3727644  -0.02113135 -0.2520319   1.9907181   0.11129898
   0.810664   -1.1756835 ]
 [ 0.16571708  0.5845005  -0.03475306 -0

In [36]:
train_set = csv_reader_dataset(train_files)
val_set = csv_reader_dataset(val_files)
test_set = csv_reader_dataset(test_files)

In [41]:
model = keras.models.Sequential([
    keras.layers.Input(shape=[8]),
    keras.layers.Dense(300, activation='relu'),
    keras.layers.Dense(300, activation="relu"),
    keras.layers.Dense(1)
])
model.compile("adam",loss='mse')
history = model.fit(train_set,
                epochs=10,
                steps_per_epoch = 11160 // 32,
                validation_steps = 3870 // 32,
                validation_data=val_set)

Train for 348 steps, validate for 120 steps
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
model.evaluate(test_set, steps=5160//32)



0.32561101787578983