In [2]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

# Record the interpreter and library versions that produced this notebook's
# outputs, so results can be compared across environments.
print(tf.__version__, sys.version_info, sep='\n')
for module in (mpl, np, pd, sklearn, tf, keras):
    print(f'{module.__name__} {module.__version__}')

  return f(*args, **kwds)


2.3.1
sys.version_info(major=3, minor=6, micro=12, releaselevel='final', serial=0)
matplotlib 3.3.2
numpy 1.18.5
pandas 1.1.3
sklearn 0.21.2
tensorflow 2.3.1
tensorflow.keras 2.4.0


In [3]:
from sklearn.datasets import fetch_california_housing

# Load the California housing dataset. Later cells read its `data`,
# `target` and `feature_names` attributes.
housing = fetch_california_housing()

In [5]:
from sklearn.model_selection import train_test_split

# Two-stage split: first hold out the test set, then carve the validation
# set out of the remaining rows. Both stages use the same fixed seed.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data,
    housing.target,
    random_state=7,
)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all,
    y_train_all,
    random_state=7,
)

# Show (features, targets) shapes for train, validation and test, in that order.
for split_x, split_y in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test)):
    print(split_x.shape, split_y.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [8]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split ONLY, then apply those same training
# statistics to the validation and test splits. Calling fit_transform on each
# split would standardise every split with its own mean/std, which puts the
# three splits on different scales and leaks evaluation-set statistics into
# preprocessing.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled = scaler.transform(x_test)

In [15]:
# Directory that receives the generated CSV shard files.
output_dir = 'generate_csv'
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(output_dir, exist_ok=True)
    
def _format_csv_field(value):
    """Return the text written to the CSV for a single cell value.

    NumPy numeric/bool scalars are formatted with ``str()``: under
    NumPy >= 2 their ``repr()`` is e.g. ``np.float64(1.5)``, which would
    corrupt the file, while ``str()`` yields the plain ``1.5`` in every
    version. All other values keep their ``repr()``.
    """
    if isinstance(value, (np.number, np.bool_)):
        return str(value)
    return repr(value)


def save_2_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    """Write the rows of ``data`` into ``n_parts`` CSV shard files.

    The row indices ``0 .. len(data) - 1`` are split with ``np.array_split``
    into ``n_parts`` consecutive groups; group ``i`` is written to
    ``<output_dir>/<name_prefix>__<i as two digits>.csv``.

    Args:
        output_dir: Directory to write into; it must already exist.
        data: Indexable collection of rows (e.g. a 2-D array). Each row is an
            iterable of column values.
        name_prefix: Leading part of every shard file name.
        header: Optional header line (without trailing newline) written at
            the top of every shard. Skipped when ``None`` or empty.
        n_parts: Number of shard files to produce.

    Returns:
        The list of shard file paths, in shard order.
    """
    path_format = os.path.join(output_dir, '{}__{:02d}.csv')
    filenames = []

    index_groups = np.array_split(np.arange(len(data)), n_parts)
    for file_idx, row_indices in enumerate(index_groups):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, 'wt', encoding='utf-8') as f:
            if header:
                f.write(header + '\n')
            for row_idx in row_indices:
                f.write(','.join(_format_csv_field(col) for col in data[row_idx]))
                f.write('\n')

    return filenames
    

# Append each target vector as the last column of its scaled feature matrix.
train_data, valid_data, test_data = (
    np.c_[scaled_x, target_y]
    for scaled_x, target_y in (
        (x_train_scaled, y_train),
        (x_valid_scaled, y_valid),
        (x_test_scaled, y_test),
    )
)

# One header line shared by every shard: the feature names, then the target.
header_cols = housing.feature_names + ['MidianHouseValue']
header_str = ','.join(header_cols)

# Write each split to its own set of shard files and keep the paths.
train_filenames = save_2_csv(
    output_dir, train_data, 'train', header=header_str, n_parts=20)
valid_filenames = save_2_csv(
    output_dir, valid_data, 'valid', header=header_str, n_parts=10)
test_filenames = save_2_csv(
    output_dir, test_data, 'test', header=header_str, n_parts=10)

In [17]:
import pprint

# Pretty-print the shard paths of each split under a one-line title.
for title, shard_paths in (
    ('train filenames', train_filenames),
    ('valid filenames', valid_filenames),
    ('test filenames', test_filenames),
):
    print(title)
    pprint.pprint(shard_paths)

train filenames
['generate_csv/train__00.csv',
 'generate_csv/train__01.csv',
 'generate_csv/train__02.csv',
 'generate_csv/train__03.csv',
 'generate_csv/train__04.csv',
 'generate_csv/train__05.csv',
 'generate_csv/train__06.csv',
 'generate_csv/train__07.csv',
 'generate_csv/train__08.csv',
 'generate_csv/train__09.csv',
 'generate_csv/train__10.csv',
 'generate_csv/train__11.csv',
 'generate_csv/train__12.csv',
 'generate_csv/train__13.csv',
 'generate_csv/train__14.csv',
 'generate_csv/train__15.csv',
 'generate_csv/train__16.csv',
 'generate_csv/train__17.csv',
 'generate_csv/train__18.csv',
 'generate_csv/train__19.csv']
valid filenames
['generate_csv/valid__00.csv',
 'generate_csv/valid__01.csv',
 'generate_csv/valid__02.csv',
 'generate_csv/valid__03.csv',
 'generate_csv/valid__04.csv',
 'generate_csv/valid__05.csv',
 'generate_csv/valid__06.csv',
 'generate_csv/valid__07.csv',
 'generate_csv/valid__08.csv',
 'generate_csv/valid__09.csv']
test filenames
['generate_csv/test__00

In [18]:
# Input pipeline plan:
#   1. Turn the list of CSV file names into a dataset of file names.
#   2. Read each file into its own dataset of lines, then merge those
#      per-file datasets into a single dataset.

# NOTE(review): list_files appears to shuffle the names by default, so the
# printed order differs from `train_filenames` — confirm in the tf.data docs
# and pass shuffle=False if a fixed order is needed.
filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)

tf.Tensor(b'generate_csv/train__00.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__19.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__13.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__10.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__05.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__18.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/

In [20]:
# Value passed as `cycle_length` to interleave below.
n_readers = 5


def _lines_without_header(filename):
    """Return a dataset of the text lines of `filename`, minus the first line."""
    return tf.data.TextLineDataset(filename).skip(1)


# Map every file name to its line dataset and interleave the results into
# one dataset of CSV lines.
dataset = filename_dataset.interleave(
    _lines_without_header,
    cycle_length=n_readers,
)

# Peek at the first 15 raw lines.
for line in dataset.take(15):
    print(line.numpy())

b'-1.10414274616804,0.026095981445111813,-0.4697282482241662,-0.04522296329387594,1.278182703423561,0.15140218773160447,-0.1958179121763065,0.3402905329813301,0.455'
b'1.155819426536407,-0.13344817051789326,0.6085721468637671,0.04220941749296647,1.167006966211348,0.04454635296762321,-0.7388499983026352,0.9087808157853589,2.148'
b'0.24184624203240349,1.6215375010751625,0.009664811030936506,-0.05126299611752959,0.0030306872168157756,-0.12848071177796017,-0.7575752426518185,0.5547210782495127,5.00001'
b'-1.0163746567322829,1.86085372901967,-0.8985310852575602,-0.053064742407918725,-0.8097313310391347,0.2399721267795869,-0.7669378648264085,0.6544562155835542,0.388'
b'-0.5153188550167574,1.6215375010751625,-0.5674383914596058,-0.09192162409389779,-0.6101203483172071,-0.1362930022629114,1.0353669037825188,-1.335259774230543,2.113'
b'-1.4627275021629542,-0.691852702388411,-1.5381215558019357,-0.10629815553305554,-1.1045154827381842,-0.17304110653435698,0.8013013494177228,-1.1507497701625737,1