## **2.1 生成csv文件**

In [1]:
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os, sys, time
import tensorflow as tf
from tensorflow import keras

In [2]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
from sklearn.model_selection import train_test_split
X_train_all, X_test, y_train_all, y_test = train_test_split(housing.data, housing.target, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train_all, y_train_all, random_state=1)
from sklearn.preprocessing import StandardScaler
std_scaler = StandardScaler()
X_train_scaled = std_scaler.fit_transform(X_train)
X_val_scaled = std_scaler.transform(X_val)
X_test_scaled = std_scaler.transform(X_test)

In [3]:
output_dir = './generate_csv'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)

def save_to_csv(output_dir, data, name_prefix,
                header=None, n_parts=10):
    path_format = os.path.join(output_dir, '{}_{:02d}.csv')
    filenames = []
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, 'wt', encoding='utf-8') as f:
            if header is not None:
                f.write(header + '\n')
            for row_index in row_indices:
                f.write(','.join(repr(col) for col in data[row_index]))
                f.write('\n')
    return filenames

train_data = np.c_[X_train_scaled, y_train]
valid_data = np.c_[X_val_scaled, y_val]
header_cols = housing.feature_names + ['HouseValue']
header_str = ','.join(header_cols)
train_filenames = save_to_csv(output_dir, train_data, 'train', header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, 'valid', header_str, n_parts=10)

## **2.2 生成csv文件备用**

- 将filenames生成dataset
- 处理filenames里面的每一个元素生成一个dataset
- 解析csv文件
- 合并dataset

In [4]:
filename_dataset = tf.data.Dataset.list_files(train_filenames) # list_files是生成文件名的dataset
for filename in filename_dataset:
    print(filename)

tf.Tensor(b'.\\generate_csv\\train_01.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_16.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_00.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_09.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_06.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_05.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_17.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_15.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_07.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_04.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_12.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_03.csv', shape=(), dtype=string)
tf.Tensor(b'.\\generate_csv\\train_08.csv', shap

In [6]:
n_readers = 5 
dataset = filename_dataset.interleave(
    lambda filename : tf.data.TextLineDataset(filename).skip(1), # 不要第一行
    cycle_length = n_readers # 同时读取的文件数目
)
for line in dataset.take(15): # 只读取15个
    print(line.numpy())

b'-0.10747626276770873,-1.0840355345979926,0.12989204022795742,-0.1668859748014119,0.23316772384066173,-0.04095668660096209,-0.8867623414130551,1.3184673846634425,1.128'
b'-0.23298982509632754,-1.0840355345979926,-0.9899744288246921,0.06264406361177728,-0.08647345824305973,0.002382582082986825,-0.9289442903575075,0.8255079469585076,1.825'
b'5.050926250280891,1.0684891794361802,1.4715610386986226,-0.20637501366819685,-0.36120637507534925,-0.005365283042446355,1.025486010735378,-1.3156290046891994,5.00001'
b'-1.2015300182851458,-0.6854198468138866,0.2897638681536601,0.28082489006718947,-0.37705635104644286,-0.08999487244467719,-0.8867623414130551,1.3433643259616774,1.4'
b'1.4257436156065406,1.2279354545498224,0.46399594111632975,-0.3395701268491613,-0.44662013447513155,-0.008433700384836625,-0.7086607792031541,0.7657552878427553,5.00001'
b'-0.427853702315602,-0.3665272965866017,-0.5398090826289635,0.08950194203089089,0.17152892839751985,-0.035610228149806446,-0.9336311735735566,0.8255079

In [8]:
# tf.io.decode_csv(str, record_defaults)字符串， 默认值与类型(tf类型)
sample_str = ' 1, 2, 3, 4, 5'
record_defaults = [tf.constant(0, dtype=tf.int32)] * 5
parsed_fields = tf.io.decode_csv(sample_str, record_defaults=record_defaults)
print(parsed_fields)

[<tf.Tensor: id=137, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=138, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=139, shape=(), dtype=int32, numpy=3>, <tf.Tensor: id=140, shape=(), dtype=int32, numpy=4>, <tf.Tensor: id=141, shape=(), dtype=int32, numpy=5>]


## **2.3 利用`tf.io.decode_csv`读取csv**

In [15]:
def parse_csv_line(line, n_fields = 9):
    # 解析单行数据
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1:])
    return x, y

In [17]:
def read_csv_to_dataset(filenames, n_readers=5, batch_size=32, n_parse_threads=5,
                        shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename : tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers
    )
    dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads) # 注意interleave(1->n)和map(1->1)的区别
    dataset = dataset.batch(batch_size)
    return dataset

train_set = read_csv_to_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print('x:')
    print(x_batch)
    print('y:')
    print(y_batch)

x:
tf.Tensor(
[[ 0.62138504  0.11181153  0.31693795 -0.33014098 -0.6456254  -0.07007352
  -0.6383575   0.17818747]
 [-0.10794911 -0.60569674 -0.50696576 -0.11578695  1.5953851  -0.08225997
  -0.8633279   0.6263324 ]
 [ 0.90188605 -1.0043124   0.56083    -0.22492805 -0.3647286  -0.01525014
   1.2832646  -1.5596191 ]], shape=(3, 8), dtype=float32)
y:
tf.Tensor(
[[2.343]
 [2.711]
 [2.53 ]], shape=(3, 1), dtype=float32)
x:
tf.Tensor(
[[-0.8278264   0.59015036 -0.3133547  -0.1278954   0.63646156 -0.00710292
   0.7817681  -0.43925667]
 [-1.0335131   1.2279354  -0.8085153   0.03061798  1.546074    0.1289988
  -0.8680148   0.7010232 ]
 [-1.0686085   0.74959666 -0.78302515 -0.1515431  -0.16308168 -0.12044221
   1.3770022  -0.9471543 ]], shape=(3, 8), dtype=float32)
y:
tf.Tensor(
[[0.683]
 [1.475]
 [1.141]], shape=(3, 1), dtype=float32)


In [18]:
train_set = read_csv_to_dataset(train_filenames, batch_size=32)
valid_set = read_csv_to_dataset(valid_filenames, batch_size=32)

## **2.4 使用dataset训练**

In [19]:
model = keras.models.Sequential()
model.add(keras.layers.Dense(128, activation='relu', input_shape=[8]))
model.add(keras.layers.Dense(1, activation='relu'))
model.summary()
model.compile(loss='mean_squared_error', optimizer='Adam')

callbacks = [keras.callbacks.EarlyStopping(min_delta=1e-3, patience=5),]

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 128)               1152      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 129       
Total params: 1,281
Trainable params: 1,281
Non-trainable params: 0
_________________________________________________________________


In [24]:
history = model.fit(train_set, 
                    steps_per_epoch = 11160 // 32, # 再repeat情况下，训练不知道什么是一个epoch， 需要指定steps
                    epochs=20, 
                    validation_data=valid_set,
                    validation_steps=3870 // 32, # 验证集的steps，evaluate时也需要指定steps
                    callbacks=callbacks)

Train for 348 steps, validate for 120 steps
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
