### Dataset CSV API

In [1]:
import tensorflow as tf
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from tensorflow import keras
from tensorflow.python.keras.callbacks import History

In [14]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

x_train_all, x_test, y_train_all, y_test = train_test_split(housing.data, housing.target, random_state = 7)
x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, random_state = 11)

print(x_valid.shape, y_valid.shape)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)

(3870, 8) (3870,)
(11610, 8) (11610,)
(5160, 8) (5160,)


In [17]:
from sklearn.preprocessing import StandardScaler

# perform normalization
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled = scaler.transform(x_valid)
x_test_scaled  = scaler.transform(x_test)

### Generate csv files by using numpy lib

In [8]:
import os
output_dir = "generated_csv"

if not os.path.exists(output_dir):
    os.mkdir(output_dir)

In [24]:
def save_to_csv(output_dir, data, name_prefix, 
                header=None, n_parts=10):
    
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    for file_idx, row_indices in enumerate(np.array_split(np.arange(len(data)), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)
        
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            for row_index in row_indices:
                f.write(",".join(
                    [repr(col) for col in data[row_index]]
                ))
                f.write('\n')
    return filenames

In [25]:
# merge two dataset
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]

header_cols = housing.feature_names + ["MidianHouseValue"]
header_str = ",".join(header_cols)

train_filenames = save_to_csv(output_dir, 
                              data = train_data, 
                              name_prefix = "train", 
                              header = header_str,
                              n_parts = 20)

valid_filenames = save_to_csv(output_dir, 
                              data = valid_data, 
                              name_prefix = "valid", 
                              header = header_str,
                              n_parts = 10)

test_filenames = save_to_csv(output_dir, 
                              data = test_data, 
                              name_prefix = "test", 
                              header = header_str,
                              n_parts = 10)

In [28]:
import pprint
pprint.pprint(train_filenames)
pprint.pprint(test_filenames)
pprint.pprint(valid_filenames)

['generated_csv/train_00.csv',
 'generated_csv/train_01.csv',
 'generated_csv/train_02.csv',
 'generated_csv/train_03.csv',
 'generated_csv/train_04.csv',
 'generated_csv/train_05.csv',
 'generated_csv/train_06.csv',
 'generated_csv/train_07.csv',
 'generated_csv/train_08.csv',
 'generated_csv/train_09.csv',
 'generated_csv/train_10.csv',
 'generated_csv/train_11.csv',
 'generated_csv/train_12.csv',
 'generated_csv/train_13.csv',
 'generated_csv/train_14.csv',
 'generated_csv/train_15.csv',
 'generated_csv/train_16.csv',
 'generated_csv/train_17.csv',
 'generated_csv/train_18.csv',
 'generated_csv/train_19.csv']
['generated_csv/test_00.csv',
 'generated_csv/test_01.csv',
 'generated_csv/test_02.csv',
 'generated_csv/test_03.csv',
 'generated_csv/test_04.csv',
 'generated_csv/test_05.csv',
 'generated_csv/test_06.csv',
 'generated_csv/test_07.csv',
 'generated_csv/test_08.csv',
 'generated_csv/test_09.csv']
['generated_csv/valid_00.csv',
 'generated_csv/valid_01.csv',
 'generated_csv/va

### Read csv files with tensorflow API

In [None]:
# 1. read filename to dataset
# 2. read file -> dataset -> datasets -> merge
# 3. parse csv

In [31]:
# 1. read filename to dataset

filename_dataset = tf.data.Dataset.list_files(train_filenames)
for name in filename_dataset:
    print(name)

tf.Tensor(b'generated_csv/train_06.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_12.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_00.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_01.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_11.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_04.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_10.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_15.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_03.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_16.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_14.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_19.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_18.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_17.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv/train_02.csv', shape=(), dtype=string)
tf.Tensor(b'generated_csv

In [32]:
# 2. read file -> dataset -> datasets -> merge

In [34]:
n_readers = 5
#.skip(1) -> remove header
dataset = filename_dataset.interleave(
    lambda filename: tf.data.TextLineDataset(filenames=filename).skip(1),
    cycle_length = n_readers,
)

for line in dataset.take(15):
    print(line.numpy())

b'-0.32652634129448693,0.43236189741438374,-0.09345459539684739,-0.08402991822890092,0.8460035745154013,-0.0266316482653991,-0.5617679242614233,0.1422875991184281,2.431'
b'0.04971034572063198,-0.8492418886278699,-0.06214699417830008,0.17878747064657746,-0.8025354230744277,0.0005066066922077538,0.6466457006743215,-1.1060793768010604,2.286'
b'0.801544314532886,0.27216142415910205,-0.11624392696666119,-0.2023115137272354,-0.5430515742518128,-0.021039615516440048,-0.5897620622908205,-0.08241845654707416,3.226'
b'0.8115083791797953,-0.04823952235146133,0.5187339067174729,-0.029386394873127775,-0.034064024638222286,-0.05081594842905086,-0.7157356834231196,0.9162751241885168,2.147'
b'2.51504373119231,1.0731637904355105,0.5574401201546321,-0.17273513019187772,-0.612912610473286,-0.01909156503651574,-0.5710993036045546,-0.027490309606616956,5.00001'
b'2.2754266257529974,-1.249743071766074,1.0294788075585177,-0.17124431895714504,-0.45413752815175606,0.10527151658164971,-0.9023632702857819,0.9012

In [38]:
# 3. parse csv
sample_str = '1, 2, 3, 4, 5'
record_defaults = [tf.constant(0, dtype=tf.int32)] * 5
parsed_fields = tf.io.decode_csv(sample_str, record_defaults)

### Use tf.data together with Keras

In [47]:
def parse_csv_line(line, n_fields = 9):
    defs = [tf.constant(np.nan)] * n_fields
    parsed_fields = tf.io.decode_csv(line, defs)
    x = tf.stack(parsed_fields[0: -1])  # train
    y = tf.stack(parsed_fields[-1:])  # label
    return x, y

### build data preprocessng pipeline

In [49]:
def csv_reader_dataset(filenames, n_readers = 5, batch_size=32,
                       n_parse_threads = 5, shuffle_buffer_size = 10000):
    dataset = tf.data.Dataset.list_files(filenames)
    dataset = dataset.repeat() # without number means, it should repeat unlimited times
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length = n_readers        
    )
    dataset.shuffle(shuffle_buffer_size)
    # map is pretty like interleave but without joint multipole sets to one set
    dataset = dataset.map(parse_csv_line, 
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

In [51]:
train_set = csv_reader_dataset(train_filenames, batch_size = 3)

for x_batch, y_batch in train_set.take(2):
    print("x:")
    pprint.pprint(x_batch)
    print("y:")
    pprint.pprint(y_batch)

x:
<tf.Tensor: id=499, shape=(3, 8), dtype=float32, numpy=
array([[ 0.4240821 ,  0.91296333, -0.04437482, -0.15297213, -0.24727628,
        -0.10539167,  0.86126745, -1.335779  ],
       [ 0.63636464, -1.0895426 ,  0.09260903, -0.20538124,  1.2025671 ,
        -0.03630123, -0.6784102 ,  0.18223535],
       [ 0.8015443 ,  0.27216142, -0.11624393, -0.20231152, -0.5430516 ,
        -0.02103962, -0.5897621 , -0.08241846]], dtype=float32)>
y:
<tf.Tensor: id=500, shape=(3, 1), dtype=float32, numpy=
array([[3.955],
       [2.429],
       [3.226]], dtype=float32)>
x:
<tf.Tensor: id=503, shape=(3, 8), dtype=float32, numpy=
array([[-1.1157656 ,  0.99306357, -0.334192  , -0.06535219, -0.32893205,
         0.04343066, -0.12785879,  0.30707204],
       [-1.119975  , -1.3298433 ,  0.14190045,  0.4658137 , -0.10301778,
        -0.10744184, -0.7950524 ,  1.5304717 ],
       [ 0.4369235 , -1.9706452 , -0.16642106,  0.05486205, -0.8379196 ,
        -0.1323988 , -0.99567705,  0.94124246]], dtype=float32)

In [52]:
batch_size = 32

train_set = csv_reader_dataset(train_filenames, batch_size=batch_size)
test_set = csv_reader_dataset(test_filenames, batch_size=batch_size)
valid_set = csv_reader_dataset(valid_filenames, batch_size=batch_size)

In [55]:
model = keras.models.Sequential()
model.add(keras.layers.Dense(30, activation = 'relu', input_shape=[8]))
model.add(keras.layers.Dense(1))

model.summary()
# mean_squared_error make model as regression
model.compile(loss = "mean_squared_error", optimizer = "sgd", metrics = ["accuracy"])
callbacks = [
    keras.callbacks.EarlyStopping(patience = 5, min_delta = 1e-3)
]

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 30)                270       
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 31        
Total params: 301
Trainable params: 301
Non-trainable params: 0
_________________________________________________________________


In [58]:
history = model.fit(train_set, 
                    validation_data = valid_set,
                    steps_per_epoch = 11160 // batch_size,
                    validation_steps = 3870 // batch_size,
                    epochs = 100, 
                    callbacks = callbacks)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100


In [59]:
model.evaluate(test_set, steps = 5160//batch_size)



[0.34761891962948793, 0.0032996894]