In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

In [2]:
# 1.获取文件名
def get_filenames_by_prefix(source_file, prefix):
    all_files = os.listdir(source_file)
    result = []
    for file in all_files:
        if file.startswith(prefix):
            result.append(os.path.join(source_file, file))
    return result
            
source_file = "generate_csv"
train_name = get_filenames_by_prefix(source_file, "train")
valid_name = get_filenames_by_prefix(source_file, "valid")
test_name = get_filenames_by_prefix(source_file, "test")

import pprint
pprint.pprint(train_name)
pprint.pprint(valid_name)
pprint.pprint(test_name)

['generate_csv\\train_00.csv',
 'generate_csv\\train_01.csv',
 'generate_csv\\train_02.csv',
 'generate_csv\\train_03.csv',
 'generate_csv\\train_04.csv',
 'generate_csv\\train_05.csv',
 'generate_csv\\train_06.csv',
 'generate_csv\\train_07.csv',
 'generate_csv\\train_08.csv',
 'generate_csv\\train_09.csv',
 'generate_csv\\train_10.csv',
 'generate_csv\\train_11.csv',
 'generate_csv\\train_12.csv',
 'generate_csv\\train_13.csv',
 'generate_csv\\train_14.csv',
 'generate_csv\\train_15.csv',
 'generate_csv\\train_16.csv',
 'generate_csv\\train_17.csv',
 'generate_csv\\train_18.csv',
 'generate_csv\\train_19.csv']
['generate_csv\\valid_00.csv',
 'generate_csv\\valid_01.csv',
 'generate_csv\\valid_02.csv',
 'generate_csv\\valid_03.csv',
 'generate_csv\\valid_04.csv',
 'generate_csv\\valid_05.csv',
 'generate_csv\\valid_06.csv',
 'generate_csv\\valid_07.csv',
 'generate_csv\\valid_08.csv',
 'generate_csv\\valid_09.csv']
['generate_csv\\test_00.csv',
 'generate_csv\\test_01.csv',
 'generate

In [3]:
# 2.解析csv数据
def parse_to_csv(records, colums=9):
    default_records = [tf.constant(1.0)] * colums
    data = tf.io.decode_csv(records, default_records)
    x = tf.stack(data[0:-1])
    y = tf.stack(data[-1])
    return x, y

In [4]:
# 3.csv转dataset
def csv_to_dataset(filenames, batchsize):
    # a.读入文件名
    dataset = tf.data.Dataset.list_files(filenames)
    
    # b.读入文件中的数据  TextLineDataset
    dataset = dataset.interleave(lambda file: tf.data.TextLineDataset(file).skip(1), cycle_length = 5)
    
    # c.对文件中的数据转换
    dataset = dataset.map(parse_to_csv)
    
    # d.生成batch_size的文件
    dataset = dataset.batch(batchsize)
    
    return dataset

In [5]:
train_set = csv_to_dataset(train_name, 32)
valid_set = csv_to_dataset(valid_name, 32)
test_set = csv_to_dataset(test_name, 32)

In [6]:
# 4.序列化
def serialize_example(x, y):
    # a.生成tf.train.Feature(bytes_list, float_list, int64_list)
    input_features = tf.train.FloatList(value = x)
    label = tf.train.FloatList(value = y)
    
    # b.生成tf.train.Features(feature = {"key": XXXX})
    features = tf.train.Features(feature={
        "input_features": tf.train.Feature(float_list = input_features),
        "label": tf.train.Feature(float_list = label)
    })
    
    # c.生成tf.train.Example
    example = tf.train.Example(features = features)
    
    # d.example的序列化
    serialize_example = example.SerializeToString()
    
    return serialize_example

In [7]:
# 5.csv转tfrecord，并写到文件中
# nshapes：存多少个文件，steps_per_shard：遍历多少步
def csv_dataset_to_tfrecords(base_filename, dataset, nshapes, steps_per_shard, compression_type=None):
    options = tf.io.TFRecordOptions(compression_type=compression_type)
    result = []
    for shape_id in range(nshapes):
        
        fullfile_name = "{}_{:05d}-of-{:05d}".format(base_filename, shape_id, nshapes)
        
        with tf.io.TFRecordWriter(path = fullfile_name, options=options) as writer:
            
            for x_batch, y_batch in dataset.take(steps_per_shard):
                for x, y in zip(x_batch, y_batch):
                    writer.write(serialize_example(x.numpy(), [y.numpy()]))
                    
        result.append(fullfile_name)
        
    return result

In [8]:
# 6.生成tfrecords文件
base_filename = "generate_tfrecord"
batch_size = 32
nshapes = 20

train_steps_per_shard = 11610 // batch_size // nshapes
valid_steps_per_shard = 3880 // batch_size // nshapes
test_steps_per_shard = 5170 // batch_size // nshapes

if not os.path.exists(base_filename):
    os.makedirs(base_filename)
    
train_base_filename = os.path.join(base_filename, "train")
valid_base_filename = os.path.join(base_filename, "valid")
test_base_filename = os.path.join(base_filename, "test")

train_tfrecord_filenames = csv_dataset_to_tfrecords(train_base_filename, train_set, nshapes, train_steps_per_shard)
valid_tfrecord_filenames = csv_dataset_to_tfrecords(valid_base_filename, valid_set, nshapes, valid_steps_per_shard)
test_tfrecord_filenames = csv_dataset_to_tfrecords(test_base_filename, test_set, nshapes, test_steps_per_shard)

In [9]:
pprint.pprint(train_tfrecord_filenames)
pprint.pprint(valid_tfrecord_filenames)
pprint.pprint(test_tfrecord_filenames)

['generate_tfrecord\\train_00000-of-00020',
 'generate_tfrecord\\train_00001-of-00020',
 'generate_tfrecord\\train_00002-of-00020',
 'generate_tfrecord\\train_00003-of-00020',
 'generate_tfrecord\\train_00004-of-00020',
 'generate_tfrecord\\train_00005-of-00020',
 'generate_tfrecord\\train_00006-of-00020',
 'generate_tfrecord\\train_00007-of-00020',
 'generate_tfrecord\\train_00008-of-00020',
 'generate_tfrecord\\train_00009-of-00020',
 'generate_tfrecord\\train_00010-of-00020',
 'generate_tfrecord\\train_00011-of-00020',
 'generate_tfrecord\\train_00012-of-00020',
 'generate_tfrecord\\train_00013-of-00020',
 'generate_tfrecord\\train_00014-of-00020',
 'generate_tfrecord\\train_00015-of-00020',
 'generate_tfrecord\\train_00016-of-00020',
 'generate_tfrecord\\train_00017-of-00020',
 'generate_tfrecord\\train_00018-of-00020',
 'generate_tfrecord\\train_00019-of-00020']
['generate_tfrecord\\valid_00000-of-00020',
 'generate_tfrecord\\valid_00001-of-00020',
 'generate_tfrecord\\valid_00002

In [12]:
# 7.读取tfrecoreds内容
except_feature = {
    "input_features": tf.io.FixedLenFeature([8], dtype=tf.float32),
    "label": tf.io.FixedLenFeature([1], dtype=tf.float32),
}

def parse_example(example):
    example = tf.io.parse_single_example(example, except_feature)
    features = example["input_features"]
    label = example["label"]
    
    return features, label

def tfrecords_to_dataset(filenames, batchsize):
    # a.读入文件名
    dataset = tf.data.Dataset.list_files(filenames)
    
    # b.读入文件中的数据  TFRecordDataset
    dataset = dataset.interleave(lambda file: tf.data.TFRecordDataset(file), cycle_length = 5)
    
    # c.对文件中的数据转换
    dataset = dataset.map(parse_example)
    
    # d.生成batch_size的文件
    dataset = dataset.batch(batchsize)
    
    return dataset

In [13]:
tfrecords_train_dataset = tfrecords_to_dataset(train_tfrecord_filenames, 32)
tfrecords_valid_dataset = tfrecords_to_dataset(valid_tfrecord_filenames, 32)
tfrecords_test_dataset = tfrecords_to_dataset(test_tfrecord_filenames, 32)

In [14]:
# 8.训练模型
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu', input_shape=[8]),
    keras.layers.Dense(1),
])

model.compile(loss="mean_squared_error", optimizer=tf.keras.optimizers.SGD(0.001))
callbacks = [keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2)]

batch_size= 32
history = model.fit(tfrecords_train_dataset.repeat(),   # 加上repeat
                    validation_data = tfrecords_valid_dataset,
                    steps_per_epoch = 11160 // batch_size,
                    validation_steps = 3870 // batch_size,
                    epochs = 100,
                    callbacks = callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100


In [15]:
# 9.评价模型
model.evaluate(tfrecords_test_dataset.repeat(), steps = 5160 // batch_size)



0.3894062340259552