In [1]:
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import sklearn
import pandas as pd
import os
import sys
import time
import tensorflow as tf

from tensorflow import keras

print(tf.__version__)
print(sys.version_info)
for module in mpl, np, pd, sklearn, tf, keras:
    print(module.__name__, module.__version__)

2.5.0
sys.version_info(major=3, minor=6, micro=9, releaselevel='final', serial=0)
matplotlib 3.3.3
numpy 1.19.4
pandas 1.1.4
sklearn 0.23.2
tensorflow 2.5.0
tensorflow.keras 2.5.0


In [2]:
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

In [3]:
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state = 7)
print(x_train.shape, y_train.shape)
print(x_valid.shape, y_valid.shape)
print(x_test.shape, y_test.shape)

(11610, 8) (11610,)
(3870, 8) (3870,)
(5160, 8) (5160,)


In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.fit_transform(x_valid)
x_test_scaled = scaler.fit_transform(x_test)

In [5]:
output_dir = 'generate_csv'
if not os.path.exists(output_dir):
    os.mkdir(output_dir)
    
def save_2_csv(output_dir, data, name_prefix, header=None, n_parts=10):
    path_format = os.path.join(output_dir, '{}__{:02d}.csv')
    filenames = []
    
    for file_idx, row_indices in enumerate(
        np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        with open(part_csv, 'wt', encoding='utf-8') as f:
            if header:
                f.write(header+'\n')
            for row_indice in row_indices:
                f.write(','.join([repr(col) for col in data[row_indice]]))
                f.write('\n')
    
    return filenames
    

train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]
header_cols = housing.feature_names + ['MidianHouseValue']
header_str = ','.join(header_cols)

train_filenames = save_2_csv(output_dir, train_data, 'train', header_str, n_parts=20)
valid_filenames = save_2_csv(output_dir, valid_data, 'valid', header_str, n_parts=10)
test_filenames = save_2_csv(output_dir, test_data, 'test', header_str, n_parts=10)

In [6]:
import pprint
print('train filenames')
pprint.pprint(train_filenames)
print('valid filenames')
pprint.pprint(valid_filenames)
print('test filenames')
pprint.pprint(test_filenames)

train filenames
['generate_csv/train__00.csv',
 'generate_csv/train__01.csv',
 'generate_csv/train__02.csv',
 'generate_csv/train__03.csv',
 'generate_csv/train__04.csv',
 'generate_csv/train__05.csv',
 'generate_csv/train__06.csv',
 'generate_csv/train__07.csv',
 'generate_csv/train__08.csv',
 'generate_csv/train__09.csv',
 'generate_csv/train__10.csv',
 'generate_csv/train__11.csv',
 'generate_csv/train__12.csv',
 'generate_csv/train__13.csv',
 'generate_csv/train__14.csv',
 'generate_csv/train__15.csv',
 'generate_csv/train__16.csv',
 'generate_csv/train__17.csv',
 'generate_csv/train__18.csv',
 'generate_csv/train__19.csv']
valid filenames
['generate_csv/valid__00.csv',
 'generate_csv/valid__01.csv',
 'generate_csv/valid__02.csv',
 'generate_csv/valid__03.csv',
 'generate_csv/valid__04.csv',
 'generate_csv/valid__05.csv',
 'generate_csv/valid__06.csv',
 'generate_csv/valid__07.csv',
 'generate_csv/valid__08.csv',
 'generate_csv/valid__09.csv']
test filenames
['generate_csv/test__00

In [7]:
# 1.filename -> dataset
# 2.read file -> dataset -> datasets -> merge
# 3.parse csv

filename_dataset = tf.data.Dataset.list_files(train_filenames)
for filename in filename_dataset:
    print(filename)

tf.Tensor(b'generate_csv/train__03.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__12.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__11.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__17.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__00.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__14.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__16.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__09.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__01.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__06.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__15.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__02.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__07.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__04.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/train__08.csv', shape=(), dtype=string)
tf.Tensor(b'generate_csv/

In [8]:
n_readers = 5
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filename).skip(1),
    cycle_length = n_readers
)
for line in dataset.take(15):
    print(line.numpy())

b'-0.8326912290523859,-0.21322024649939578,0.19590484389231774,0.28256634342161385,0.30623724325012375,-0.06698661284884411,-0.1770926678271232,-0.5473521892916272,2.47'
b'0.5765711979456258,0.5845005133156296,-0.04645721372063711,-0.24013502085107374,-0.5646393649122108,-0.020557273966137928,-0.6545863987313087,0.5646945919829217,2.171'
b'-1.616784570539642,-0.4525364744439034,-0.029148908629641944,0.08771590095596313,-0.3178629179184352,0.023730595404004955,1.0072790372587457,-1.3452332879639521,1.125'
b'-0.34681893647975504,-0.6120806264069084,-0.2993192081256999,0.04803324388614384,0.04851167062181197,-0.08686994126075745,-0.8184322867866667,0.7641648666509977,1.807'
b'-0.45521966900167576,-0.2929923224808983,-0.3987316967655819,0.048342385355644925,0.8612736888777626,-0.03406098870043033,-0.6873555763423803,0.5996018900498323,2.042'
b'2.514875755358087,-0.6120806264069084,0.9274771622665903,-0.18491981184094955,-0.09130024132688004,0.06473156953538023,-0.8605640865723282,0.7691516

In [9]:
# tf.io.decode_csv(str, record_defaults)

sample_str = '1,2,3,4,5'
record_deafults = [tf.constant(0, dtype=tf.int32)] * 5
parsed_fields = tf.io.decode_csv(sample_str, record_deafults)
print(parsed_fields)

[<tf.Tensor: shape=(), dtype=int32, numpy=1>, <tf.Tensor: shape=(), dtype=int32, numpy=2>, <tf.Tensor: shape=(), dtype=int32, numpy=3>, <tf.Tensor: shape=(), dtype=int32, numpy=4>, <tf.Tensor: shape=(), dtype=int32, numpy=5>]


In [10]:
# tf.io.decode_csv(str, record_defaults)

sample_str = '1,2,3,4,5'
record_defaults = [tf.constant(0, dtype=tf.int32),
                  0,
                  np.nan,
                  'hello',
                  tf.constant([])]
parsed_fields = tf.io.decode_csv(sample_str, record_defaults)
print(parsed_fields)

[<tf.Tensor: shape=(), dtype=int32, numpy=1>, <tf.Tensor: shape=(), dtype=int32, numpy=2>, <tf.Tensor: shape=(), dtype=float32, numpy=3.0>, <tf.Tensor: shape=(), dtype=string, numpy=b'4'>, <tf.Tensor: shape=(), dtype=float32, numpy=5.0>]


In [11]:
try:
    parsed_fields = tf.io.decode_csv(',,,,', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Field 4 is required but missing in record 0! [Op:DecodeCSV]


In [12]:
try:
    parsed_fields = tf.io.decode_csv('1,2,3,4,5,6,7', record_defaults)
except tf.errors.InvalidArgumentError as ex:
    print(ex)

Expect 5 fields but have 7 in record 0 [Op:DecodeCSV]


In [13]:
def parse_csv_line(line, n_fields=9):
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(parsed_fields[0:-1])
    y = tf.stack(parsed_fields[-1:])
    return x, y

parse_csv_line(b'-0.33025991779778285,0.9035888172416396,-0.12848270939697026,-0.028909780108422635,-0.5393721519094352,0.005267396738674212,-0.7856631091755951,0.6345091881167501,1.043',
              n_fields=9)

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([-0.33025992,  0.90358883, -0.12848271, -0.02890978, -0.53937215,
         0.0052674 , -0.7856631 ,  0.6345092 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([1.043], dtype=float32)>)

In [14]:
# 1.filename -> dataset
# 2.read file -> dataset -> datasets -> merge
# 3.parse csv


def csv_reader_dataset(filenames, n_readers=5, batch_size=32, n_parse_threads=5, shuffle_buffer_size=10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat()
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers
    )
    dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(parse_csv_line, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

train_set = csv_reader_dataset(train_filenames, batch_size=3)
for x_batch, y_batch in train_set.take(2):
    print('x:')
    pprint.pprint(x_batch)
    print('y:')
    pprint.pprint(y_batch)

x:
<tf.Tensor: shape=(3, 8), dtype=float32, numpy=
array([[-1.2510048 ,  1.3024492 , -1.3111452 ,  0.00781415, -0.21847855,
         0.08010878, -0.74821264,  0.65944296],
       [-0.72751766,  1.142905  , -0.4262565 , -0.07966067,  0.06619872,
        -0.10571232, -0.7013995 ,  0.61456215],
       [-0.7766657 , -0.13344817, -0.06437794, -0.14419238, -0.64970565,
        -0.01160897,  0.9042902 , -0.5473522 ]], dtype=float32)>
y:
<tf.Tensor: shape=(3, 1), dtype=float32, numpy=
array([[1.875],
       [2.743],
       [0.818]], dtype=float32)>
x:
<tf.Tensor: shape=(3, 8), dtype=float32, numpy=
array([[-0.83269125, -0.21322025,  0.19590485,  0.28256634,  0.30623725,
        -0.06698661, -0.17709267, -0.5473522 ],
       [-1.0163747 ,  1.8608537 , -0.8985311 , -0.05306474, -0.8097313 ,
         0.23997213, -0.76693785,  0.6544562 ],
       [ 1.4685363 ,  1.8608537 ,  0.05410383, -0.4459113 , -0.6581281 ,
         0.03778752,  0.78725743, -1.1357895 ]], dtype=float32)>
y:
<tf.Tensor: shape=(

In [15]:
batch_size = 32
train_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
valid_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
test_set = csv_reader_dataset(train_filenames, batch_size=batch_size)


In [19]:
model = keras.models.Sequential([
    keras.layers.Dense(30, activation='relu',input_shape=[8]),
    keras.layers.Dense(1),
]) 
model.compile(loss='mean_squared_error', optimizer = 'sgd')
callbacks = [keras.callbacks.EarlyStopping(patience=5, min_delta=1e-2)]

history = model.fit(train_set, 
                    validation_data = valid_set, 
                    steps_per_epoch = 11160 // batch_size,
                    validation_steps = 3870 // batch_size,
                    epochs=100, 
                    callbacks=callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100


In [20]:
model.evaluate(test_set, steps=5160 // batch_size)



0.3488520383834839