In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import sklearn
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras
print(tf.__version__)
print(sys.version_info)
for module in np, pd ,sklearn, tf, keras:
    print(module.__name__,module.__version__)

2.0.0
sys.version_info(major=3, minor=7, micro=6, releaselevel='final', serial=0)
numpy 1.18.1
pandas 1.0.1
sklearn 0.22.2
tensorflow 2.0.0
tensorflow_core.keras 2.2.4-tf


In [2]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

# print(housing.DESCR)
print(housing.data.shape)
print(housing.target.shape)


(20640, 8)
(20640,)


In [3]:
from sklearn.model_selection import train_test_split

x_train_all,x_test,y_train_all,y_test=train_test_split(housing.data, housing.target,random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(x_train_all,y_train_all,random_state=11)

In [4]:
from sklearn.preprocessing import StandardScaler
transfer = StandardScaler()
x_train_scaled = transfer.fit_transform(x_train)
x_test_scaled = transfer.transform(x_test)
x_valid_scaled = transfer.transform(x_valid)

In [15]:
output_dir = 'data/generate_csv'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
def save_to_csv(output_dir, data , name_prefix, header=None,n_parts = 10):
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    for file_idx,row_indices in enumerate(
            np.array_split(np.arange(len(data)),n_parts)):
#         np.split():一旦不均等就会报错：
#         np.array_split:可以不均等划分
        enumerate
        part_csv = path_format.format(name_prefix,file_idx)
        filenames.append(part_csv)
        with open(part_csv,'wt',encoding='utf-8') as f:
            if header is not None:
                f.write(header+'\n')
            for row_index in row_indices:
                f.write(','.join(
                [repr(col) for col in data[row_index]]#repr返回对象的string形式。
                ))
    return filenames
        
    
train_data = np.c_[x_train_scaled, y_train]
test_data = np.c_[x_test_scaled, y_test]
valid_data = np.c_[x_valid_scaled, y_valid]
header_cols = housing.feature_names+["midianHouseValue"]
header_str = ",".join(header_cols)

train_filenames = save_to_csv(output_dir,train_data,'train',header_str,n_parts=20)
valid_filenames = save_to_csv(output_dir,valid_data,'valid',header_str,n_parts=10)
test_filenames = save_to_csv(output_dir,test_data,'test',header_str,n_parts=10)


# np.r_[] 是按列连接两个矩阵，就是把两矩阵上下相加，要求列数相等。
# np.c_[] 是按行连接两个矩阵，就是把两矩阵左右相加，要求行数相等。



In [7]:
#         np.split():必须均分，一旦不均等就会报错：
#         np.array_split:可以不均等划分
# np.r_[] 是按列连接两个矩阵，就是把两矩阵上下相加，要求列数相等。
# np.c_[] 是按行连接两个矩阵，就是把两矩阵左右相加，要求行数相等。

a = np.arange(10)
print(np.array_split(a,3))#把a分成3份
print(np.split(a,5))#把a平均分成5份

[array([0, 1, 2, 3]), array([4, 5, 6]), array([7, 8, 9])]
[array([0, 1]), array([2, 3]), array([4, 5]), array([6, 7]), array([8, 9])]


In [23]:
# 参数
# sequence -- 一个序列、迭代器或其他支持迭代对象。
# start -- 下标起始位置。
# 返回值
# 返回 enumerate(枚举) 对象。
a = enumerate(np.array_split(np.arange(40),6))
for x,y in a:
    print(x,y)
print('------------------')
a = enumerate(np.array_split(np.arange(40),6), start =1)
for x,y in a:
    print(x,y)
print(type(y))
for a in y:
    print(a)

0 [0 1 2 3 4 5 6]
1 [ 7  8  9 10 11 12 13]
2 [14 15 16 17 18 19 20]
3 [21 22 23 24 25 26 27]
4 [28 29 30 31 32 33]
5 [34 35 36 37 38 39]
------------------
1 [0 1 2 3 4 5 6]
2 [ 7  8  9 10 11 12 13]
3 [14 15 16 17 18 19 20]
4 [21 22 23 24 25 26 27]
5 [28 29 30 31 32 33]
6 [34 35 36 37 38 39]
<class 'numpy.ndarray'>
34
35
36
37
38
39


In [7]:
print(housing.feature_names)

['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']


In [18]:
import pprint
pprint.pprint(train_filenames)
pprint.pprint(valid_filenames)
pprint.pprint(test_filenames)

['data/generate_csv/train_00.csv',
 'data/generate_csv/train_01.csv',
 'data/generate_csv/train_02.csv',
 'data/generate_csv/train_03.csv',
 'data/generate_csv/train_04.csv',
 'data/generate_csv/train_05.csv',
 'data/generate_csv/train_06.csv',
 'data/generate_csv/train_07.csv',
 'data/generate_csv/train_08.csv',
 'data/generate_csv/train_09.csv',
 'data/generate_csv/train_10.csv',
 'data/generate_csv/train_11.csv',
 'data/generate_csv/train_12.csv',
 'data/generate_csv/train_13.csv',
 'data/generate_csv/train_14.csv',
 'data/generate_csv/train_15.csv',
 'data/generate_csv/train_16.csv',
 'data/generate_csv/train_17.csv',
 'data/generate_csv/train_18.csv',
 'data/generate_csv/train_19.csv']
['data/generate_csv/valid_00.csv',
 'data/generate_csv/valid_01.csv',
 'data/generate_csv/valid_02.csv',
 'data/generate_csv/valid_03.csv',
 'data/generate_csv/valid_04.csv',
 'data/generate_csv/valid_05.csv',
 'data/generate_csv/valid_06.csv',
 'data/generate_csv/valid_07.csv',
 'data/generate_csv/

In [20]:
# filename -> dataset
# read file -> dataset -> datasets -> merge
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)

tf.Tensor(b'data/generate_csv/train_09.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_03.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_17.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_02.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_18.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_00.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_08.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_04.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_10.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_14.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_16.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_13.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_05.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/train_15.csv', shape=(), dtype=string)
tf.Tensor(b'data/generate_csv/trai

In [23]:
n_readers = 5
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1),#跳一行，省略header
    cycle_length= n_readers
)
for line in dataset.take(15):
    print(line.numpy)


<bound method _EagerTensorBase.numpy of <tf.Tensor: id=191, shape=(), dtype=string, numpy=b'-1.1199749330438333,-1.329843308393715,0.1419004518620726,0.4658136987980791,-0.10301777467500105,-0.10744184416176107,-0.7950524078397521,1.5304716763409,0.66-0.9490938885377456,0.6726626072973063,0.28370554761513944,0.10655529643465292,-0.6546477749692311,-0.0623949278698749,0.21273656121863005,0.0024704978154519064,0.6073.8743126570888804,-0.8492418886278699,1.2254810098923188,-0.023587924660354292,0.10202890306594632,0.03335714649304235,-1.2289615472954436,1.1709419872760878,5.000010.9983212703673051,0.9129633171802288,0.27784564068779194,-0.38538378612280755,-0.8887276106204011,-0.045322746804599315,-0.6830758557221533,0.5767193067384124,3.824-0.5076378692883107,1.7139656834566372,-0.12830896125699554,0.09230174287983997,0.7552749560459556,0.045815626761734035,0.9732439776839799,-1.4206678547327694,2.464-0.6844334120607457,0.43236189741438374,-0.9175211885652003,-0.1760767385762911,0.563837

In [25]:
# rf.io.decode_csv(str, record_defaults)
sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0,dtype=tf.int32)] *5
parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parsed_fields)

[<tf.Tensor: id=209, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=210, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=211, shape=(), dtype=int32, numpy=3>, <tf.Tensor: id=212, shape=(), dtype=int32, numpy=4>, <tf.Tensor: id=213, shape=(), dtype=int32, numpy=5>]


In [26]:
# rf.io.decode_csv(str, record_defaults)
sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0,dtype=tf.int32),
                           0,
                           np.nan,
                           'hello',
                           tf.constant([])
                  ]
                            
parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parsed_fields)

[<tf.Tensor: id=220, shape=(), dtype=int32, numpy=1>, <tf.Tensor: id=221, shape=(), dtype=int32, numpy=2>, <tf.Tensor: id=222, shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: id=223, shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: id=224, shape=(), dtype=float32, numpy=5.0>]


In [31]:
try:
    parsed_field = tf.io.decode_csv(',,,', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Expect 5 fields but have 4 in record 0 [Op:DecodeCSV]


In [32]:
try:
    parsed_field = tf.io.decode_csv('1,2,3,4,5,5,6', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]
