### Dataset CSV API

In [None]:
import tensorflow as tf
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from tensorflow import keras
from tensorflow.python.keras.callbacks import History

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
# Load the California housing regression dataset.
housing = fetch_california_housing()

# Two-stage split with fixed seeds: first hold out a test set, then carve a
# validation set out of what remains.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    housing.data, housing.target, random_state=7)
x_train, x_valid, y_train, y_valid = train_test_split(
    x_train_all, y_train_all, random_state=11)

# Print (features, targets) shapes in the order: validation, train, test.
for features, targets in ((x_valid, y_valid), (x_train, y_train), (x_test, y_test)):
    print(features.shape, targets.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to validation and test.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_valid_scaled, x_test_scaled = (
    scaler.transform(split) for split in (x_valid, x_test)
)

### Generate csv files by using numpy lib

In [None]:
import os
# Directory that receives the generated CSV shards.
output_dir = "generated_csv"

# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs(output_dir, exist_ok=True)

In [None]:
def _csv_field(value):
    """Render one cell as CSV text, unwrapping NumPy scalars first.

    NumPy >= 2 makes ``repr(np.float64(1.5))`` return ``'np.float64(1.5)'``,
    which is not a parseable CSV field; converting to the equivalent Python
    scalar keeps the round-trippable plain form (``'1.5'``).
    """
    if isinstance(value, np.generic):
        value = value.item()
    return repr(value)


def save_to_csv(output_dir, data, name_prefix,
                header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files and return their paths.

    Args:
        output_dir: Existing directory the files are written into.
        data: Indexable collection of rows; each row is an iterable of cells.
        name_prefix: File name prefix; files are named
            ``<name_prefix>_<two-digit index>.csv``.
        header: Optional header line (without trailing newline) written as the
            first line of every file.
        n_parts: Number of files to produce. Rows are distributed with
            ``np.array_split``, so part sizes differ by at most one row.

    Returns:
        List of the written file paths, in part order.
    """
    path_format = os.path.join(output_dir, "{}_{:02d}.csv")
    filenames = []
    row_chunks = np.array_split(np.arange(len(data)), n_parts)
    for file_idx, row_indices in enumerate(row_chunks):
        part_csv = path_format.format(name_prefix, file_idx)
        filenames.append(part_csv)

        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header + "\n")
            # One writelines call per file instead of two writes per row.
            f.writelines(
                ",".join(_csv_field(col) for col in data[row_index]) + "\n"
                for row_index in row_indices
            )
    return filenames

In [None]:
# Append the target as the last column of each scaled feature matrix.
train_data = np.c_[x_train_scaled, y_train]
valid_data = np.c_[x_valid_scaled, y_valid]
test_data = np.c_[x_test_scaled, y_test]

# Header row: the dataset's feature names followed by the target column name.
header_cols = [*housing.feature_names, "MidianHouseValue"]
header_str = ",".join(header_cols)

# Write each split as a set of CSV shards and keep the shard paths.
train_filenames = save_to_csv(output_dir, train_data, "train",
                              header=header_str, n_parts=20)
valid_filenames = save_to_csv(output_dir, valid_data, "valid",
                              header=header_str, n_parts=10)
test_filenames = save_to_csv(output_dir, test_data, "test",
                             header=header_str, n_parts=10)

In [None]:
import pprint
# Show the generated shard paths: train, then test, then validation.
for shard_paths in (train_filenames, test_filenames, valid_filenames):
    pprint.pprint(shard_paths)

### Read csv files with tensorflow API

In [None]:
# 1. read filename to dataset
# 2. read file -> dataset -> datasets -> merge
# 3. parse csv

In [None]:
# Step 1: turn the list of shard paths into a dataset of filename tensors.
filename_dataset = tf.data.Dataset.list_files(train_filenames)

# Print every element to inspect what the dataset yields.
for name in filename_dataset:
    print(name)

In [None]:
# 2. read file -> dataset -> datasets -> merge

In [None]:
n_readers = 5

# Step 2: open each shard as a dataset of text lines and interleave lines from
# `n_readers` shards at a time. skip(1) drops the header row of every shard.
dataset = filename_dataset.interleave(
    map_func=lambda filename: tf.data.TextLineDataset(filenames=filename).skip(1),
    cycle_length=n_readers,
)

# Peek at the first 15 raw CSV lines.
for line in dataset.take(15):
    print(line.numpy())

In [None]:
# Step 3: parse one CSV line. `record_defaults` supplies one default per
# column; here every one of the five columns defaults to an int32 zero.
sample_str = '1, 2, 3, 4, 5'
record_defaults = [tf.constant(0, dtype=tf.int32) for _ in range(5)]
parsed_fields = tf.io.decode_csv(sample_str, record_defaults=record_defaults)

### Use tf.data together with Keras

In [None]:
def parse_csv_line(line, n_fields = 9):
    """Parse one CSV line into a ``(features, label)`` pair of tensors.

    Args:
        line: A CSV-formatted string (tensor) with ``n_fields`` columns.
        n_fields: Total number of columns, the label being the last one.

    Returns:
        ``(x, y)`` where ``x`` stacks every column except the last and ``y``
        stacks only the last column.
    """
    # One NaN default per column, passed to decode_csv as record_defaults.
    column_defaults = [tf.constant(np.nan)] * n_fields
    columns = tf.io.decode_csv(line, column_defaults)
    features = tf.stack(columns[:-1])
    label = tf.stack(columns[-1:])
    return features, label

### Build the data preprocessing pipeline

In [None]:
def csv_reader_dataset(filenames, n_readers = 5, batch_size=32,
                       n_parse_threads = 5, shuffle_buffer_size = 10000):
    """Build an endlessly repeating, shuffled, batched dataset from CSV shards.

    Args:
        filenames: Paths (or a glob pattern) of CSV files whose first line is
            a header.
        n_readers: Number of files read concurrently by ``interleave``.
        batch_size: Number of parsed rows per emitted batch.
        n_parse_threads: Parallelism of the CSV parsing ``map`` step.
        shuffle_buffer_size: Size of the buffer used to shuffle rows.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, label)`` batches as
        produced by ``parse_csv_line``. It never ends, so consumers must
        bound it (e.g. ``take`` or ``steps_per_epoch``).
    """
    dataset = tf.data.Dataset.list_files(filenames)
    # repeat() with no count repeats indefinitely.
    dataset = dataset.repeat()
    # Turn every file into a dataset of its lines (minus the header) and merge
    # lines from `n_readers` files at a time.
    dataset = dataset.interleave(
        lambda filename: tf.data.TextLineDataset(filename).skip(1),
        cycle_length=n_readers,
    )
    # Dataset transformations return a new dataset rather than mutating in
    # place; the result must be re-assigned or the shuffle is silently lost.
    dataset = dataset.shuffle(shuffle_buffer_size)
    # map works element by element; unlike interleave it does not flatten
    # several datasets into one.
    dataset = dataset.map(parse_csv_line,
                          num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset

In [None]:
# Small batches make the preview readable.
train_set = csv_reader_dataset(train_filenames, batch_size=3)

# Show the features and labels of the first two batches.
for x_batch, y_batch in train_set.take(2):
    for label, batch in (("x:", x_batch), ("y:", y_batch)):
        print(label)
        pprint.pprint(batch)

In [None]:
batch_size = 32

# Build the three input pipelines with identical settings.
train_set, test_set, valid_set = (
    csv_reader_dataset(shard_paths, batch_size=batch_size)
    for shard_paths in (train_filenames, test_filenames, valid_filenames)
)

In [None]:
# Small fully connected regressor: 8 input features -> 30 ReLU units -> 1 output.
model = keras.models.Sequential()
model.add(keras.layers.Dense(30, activation='relu', input_shape=[8]))
model.add(keras.layers.Dense(1))

model.summary()
# Mean squared error makes this a regression model. Accuracy is meaningless
# for continuous targets, so mean absolute error is tracked instead.
model.compile(loss="mean_squared_error", optimizer="sgd", metrics=["mae"])

# TensorBoard logs and the model checkpoint share this directory.
logdir = './graph_def_and_weights'
# exist_ok=True keeps the cell safe to re-run.
os.makedirs(logdir, exist_ok=True)

output_model_file = os.path.join(logdir,
                                 "example_model.h5")

callbacks = [
    keras.callbacks.TensorBoard(logdir),
    # Save the full model (not just weights), keeping only the best one seen.
    keras.callbacks.ModelCheckpoint(output_model_file,
                                    save_best_only=True,
                                    save_weights_only=False),
    keras.callbacks.EarlyStopping(patience=5, min_delta=1e-3),
]

In [None]:
# The input datasets repeat forever, so Keras needs explicit step counts.
# They are derived from the actual split sizes instead of hard-coded numbers:
# the previous training constant (11160) did not match the training split and
# looks like a transposition of its real size.
history = model.fit(train_set,
                    validation_data=valid_set,
                    steps_per_epoch=len(x_train) // batch_size,
                    validation_steps=len(x_valid) // batch_size,
                    epochs=10,
                    callbacks=callbacks)

In [None]:
# test_set repeats forever; bound the evaluation to one pass over the test
# split, sized from the data rather than a hard-coded row count.
model.evaluate(test_set, steps=len(x_test) // batch_size)

In [None]:
# Drop the in-memory model; the next cell reloads it from the checkpoint file.
del model

In [None]:
# Restore the model from the file written by the ModelCheckpoint callback.
loaded_model = keras.models.load_model(output_model_file)

### convert tflite model in common way

In [None]:
# Convert the reloaded Keras model to a TFLite model with default settings.
keras_to_tflite_converter = tf.lite.TFLiteConverter.from_keras_model(loaded_model)
# convert() returns the serialized model, which is written to disk as bytes below.
keras_tflite = keras_to_tflite_converter.convert()

In [None]:
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs('./tflite_models', exist_ok=True)
# Persist the serialized TFLite model.
with open('./tflite_models/keras_tflite', 'wb') as f:
    f.write(keras_tflite)

In [None]:
# Read the serialized TFLite model back from disk. Despite its name, the
# variable holds the bytes of the Keras-converted model saved above.
with open('./tflite_models/keras_tflite', 'rb') as f:
    concrete_func_tflite = f.read()
    
# Build an interpreter from the in-memory model and allocate its tensors
# before any input is set.
interpreter = tf.lite.Interpreter(model_content = concrete_func_tflite)
interpreter.allocate_tensors()

In [None]:
# Fetch the interpreter's input and output tensor descriptions.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Print inputs first, then outputs.
for tensor_details in (input_details, output_details):
    print(tensor_details)

In [None]:
# Interpreter.set_tensor is documented to take a NumPy array, so build the
# (1, 8) float32 input with NumPy rather than relying on implicit conversion
# of a TensorFlow tensor. Row 1 of valid_data holds 8 scaled features followed
# by the target; only the features are fed in.
input_data = np.asarray(valid_data[1, :8], dtype=np.float32).reshape(1, 8)
interpreter.set_tensor(input_details[0]['index'], input_data)
interpreter.invoke()

In [None]:
# Read the first output tensor after invoke() and print it.
output_results = interpreter.get_tensor(output_details[0]['index'])
print(output_results)

### convert model as quantized tflite model
* need to convert to a quantized concrete function
* set optimization for model
* model inference...

In [None]:
# Convert again, this time with optimization enabled.
keras_to_tflite_converter = tf.lite.TFLiteConverter.from_keras_model(loaded_model)
# Optimize.OPTIMIZE_FOR_SIZE is deprecated and documented as equivalent to
# Optimize.DEFAULT, so use the supported name.
keras_to_tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
keras_tflite = keras_to_tflite_converter.convert()

In [None]:
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of `if not exists: mkdir`.
os.makedirs('./tflite_models', exist_ok=True)
# Persist the serialized, optimized TFLite model.
with open('./tflite_models/quantized_keras_tflite', 'wb') as f:
    f.write(keras_tflite)

In [None]:
# Read the optimized TFLite model back from disk. Despite its name, the
# variable holds the bytes of the Keras-converted model saved above.
with open('./tflite_models/quantized_keras_tflite', 'rb') as f:
    concrete_func_tflite = f.read()
    
# Build an interpreter from the in-memory model and allocate its tensors
# before any input is set.
interpreter = tf.lite.Interpreter(
    model_content = concrete_func_tflite)
interpreter.allocate_tensors()

In [None]:
# Fetch the interpreter's input and output tensor descriptions.
input_details = interpreter.get_input_details()
output_details = interpreter.get_output_details()

# Print inputs first, then outputs.
for tensor_details in (input_details, output_details):
    print(tensor_details)

In [None]:
# Interpreter.set_tensor is documented to take a NumPy array, so build the
# (1, 8) float32 input with NumPy rather than relying on implicit conversion
# of a TensorFlow tensor. Row 1 of valid_data holds 8 scaled features followed
# by the target; only the features are fed in.
input_data = np.asarray(valid_data[1, :8], dtype=np.float32).reshape(1, 8)
interpreter.set_tensor(input_details[0]['index'], input_data)
interpreter.invoke()

In [None]:
# Read the first output tensor after invoke() and print it.
output_results = interpreter.get_tensor(output_details[0]['index'])
print(output_results)