<a href="https://colab.research.google.com/github/ybii22/Machine-Learning/blob/main/week%208%20-%2016/Chapter_13_%E2%80%93_Loading_and_Preprocessing_Data_with_TensorFlow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# The Data API

In [None]:
import tensorflow as tf

# Build a dataset whose elements are the slices of X along its first axis
# (here: ten scalar int32 tensors, as the element_spec below shows).
X = tf.range(10) # any data tensor
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [None]:
# A dataset is directly iterable; each item is one scalar tensor.
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


# Chaining Transformations

In [None]:
# repeat(3) chains three passes over the source items, then batch(7) groups
# consecutive items; batches therefore straddle pass boundaries and the last
# batch is shorter (30 items do not divide evenly by 7).
# Note: this rebinds `dataset`, so re-running the cell repeats/batches again.
dataset = dataset.repeat(3).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [None]:
# The dataset is still batched at this point, so the lambda receives a whole batch.
dataset = dataset.map(lambda x: x * 2) # first batch becomes [0,2,4,6,8,10,12]

In [None]:
# Split the batches back into individual elements. Dataset.unbatch() is the
# supported replacement for the deprecated tf.data.experimental.unbatch().
dataset = dataset.unbatch()

Instructions for updating:
Use `tf.data.Dataset.unbatch()`.


In [None]:
# take(3) limits iteration to the first three (now unbatched, doubled) elements.
for item in dataset.take(3):
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


# Shuffling the Data

In [None]:
# Shuffle with a 5-element buffer and a fixed seed, then batch by 7.
dataset = tf.data.Dataset.range(10).repeat(3) # 0 to 9, three times
dataset = dataset.shuffle(buffer_size=5, seed=42).batch(7)
for item in dataset:
    print(item)

tf.Tensor([0 2 3 6 7 9 4], shape=(7,), dtype=int64)
tf.Tensor([5 0 1 1 8 6 5], shape=(7,), dtype=int64)
tf.Tensor([4 8 7 1 2 3 0], shape=(7,), dtype=int64)
tf.Tensor([5 4 2 7 8 9 9], shape=(7,), dtype=int64)
tf.Tensor([3 6], shape=(2,), dtype=int64)


### Interleaving lines from multiple files

In [None]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

# Two successive splits: first hold out a test set, then carve a validation set
# out of the remaining training data. The target is reshaped into a column so
# it can later be concatenated with the features.
housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

In [None]:
import numpy as np
from pathlib import Path

def save_to_csv_files(data, name_prefix, header=None, n_parts=10):
    """Write `data` row by row into `n_parts` CSV files under datasets/housing.

    Row indices are divided with np.array_split, so the parts are contiguous
    and as evenly sized as possible. Files are named
    ``my_<name_prefix>_<NN>.csv``; the directory is created if needed.

    Args:
        data: indexable collection of rows; each row is iterated and every
            value is written with str().
        name_prefix: label embedded in each file name.
        header: optional header line written first in every file.
        n_parts: number of files to produce.

    Returns:
        The list of written file paths (as strings), in part order.
    """
    out_dir = Path() / "datasets" / "housing"
    out_dir.mkdir(parents=True, exist_ok=True)

    written = []
    index_groups = np.array_split(np.arange(len(data)), n_parts)
    for part_no, group in enumerate(index_groups):
        target = out_dir / "my_{}_{:02d}.csv".format(name_prefix, part_no)
        written.append(str(target))
        with open(target, "w") as out:
            if header is not None:
                out.write(header)
                out.write("\n")
            out.writelines(",".join(map(str, data[i])) + "\n" for i in group)
    return written

# Append the target as the last column of each split, then write every split to
# several CSV files that all share the same header line.
train_data = np.c_[X_train, y_train]
valid_data = np.c_[X_valid, y_valid]
test_data = np.c_[X_test, y_test]
header_cols = housing.feature_names + ["MedianHouseValue"]
header = ",".join(header_cols)

train_filepaths = save_to_csv_files(train_data, "train", header, n_parts=20)
valid_filepaths = save_to_csv_files(valid_data, "valid", header, n_parts=10)
test_filepaths = save_to_csv_files(test_data, "test", header, n_parts=10)

In [None]:
# Show the 20 training shard paths returned by save_to_csv_files.
train_filepaths

['datasets/housing/my_train_00.csv',
 'datasets/housing/my_train_01.csv',
 'datasets/housing/my_train_02.csv',
 'datasets/housing/my_train_03.csv',
 'datasets/housing/my_train_04.csv',
 'datasets/housing/my_train_05.csv',
 'datasets/housing/my_train_06.csv',
 'datasets/housing/my_train_07.csv',
 'datasets/housing/my_train_08.csv',
 'datasets/housing/my_train_09.csv',
 'datasets/housing/my_train_10.csv',
 'datasets/housing/my_train_11.csv',
 'datasets/housing/my_train_12.csv',
 'datasets/housing/my_train_13.csv',
 'datasets/housing/my_train_14.csv',
 'datasets/housing/my_train_15.csv',
 'datasets/housing/my_train_16.csv',
 'datasets/housing/my_train_17.csv',
 'datasets/housing/my_train_18.csv',
 'datasets/housing/my_train_19.csv']

# Preprocessing the Data

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit on the training features only; the learned statistics are reused below.
scaler = StandardScaler()
scaler.fit(X_train)

In [None]:
# Per-feature statistics from the fitted scaler, used by preprocess() below.
X_mean, X_std = scaler.mean_, scaler.scale_  # extra code
# Number of feature columns per CSV row; the target is one extra trailing column.
n_inputs = 8

def parse_csv_line(line):
    """Decode one CSV line into a (features, target) pair of 1-D tensors.

    The first `n_inputs` columns default to 0. when empty; the last column
    has an empty float32 default.
    """
    feature_defaults = [0. for _ in range(n_inputs)]
    target_default = tf.constant([], dtype=tf.float32)
    columns = tf.io.decode_csv(
        line, record_defaults=feature_defaults + [target_default])
    feature_columns, target_columns = columns[:-1], columns[-1:]
    return tf.stack(feature_columns), tf.stack(target_columns)

def preprocess(line):
    """Parse one CSV line and standardize its features with X_mean / X_std.

    Returns the (standardized features, target) pair; the target is untouched.
    """
    features, target = parse_csv_line(line)
    standardized = (features - X_mean) / X_std
    return standardized, target

In [None]:
# Sanity check on a single raw line: 8 feature values followed by the target.
preprocess(b'4.2083,44.0,5.3232,0.9171,846.0,2.3370,37.47,-122.2,2.782')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 0.16579159,  1.216324  , -0.05204564, -0.39215982, -0.5277444 ,
        -0.2633488 ,  0.8543046 , -1.3072058 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.782], dtype=float32)>)

# Putting Everything Together

In [None]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                        n_read_threads=None, shuffle_buffer_size=10000,
                        n_parse_threads=5, batch_size=32):
    """Build a batched, shuffled dataset of preprocessed rows from CSV files.

    Pipeline: list files -> interleave lines from `n_readers` files at a time
    (skipping each file's header line) -> preprocess() each line -> shuffle
    -> repeat -> batch -> prefetch(1).

    Args:
        filepaths: file paths or pattern accepted by Dataset.list_files.
        repeat: value passed to Dataset.repeat.
        n_readers: interleave cycle length.
        n_read_threads: num_parallel_calls for the interleave step.
        shuffle_buffer_size: shuffle buffer size.
        n_parse_threads: num_parallel_calls for the preprocess map.
        batch_size: number of rows per batch.
    """
    def read_data_lines(filepath):
        # skip(1) drops the header line of each CSV file
        return tf.data.TextLineDataset(filepath).skip(1)

    file_ds = tf.data.Dataset.list_files(filepaths)
    line_ds = file_ds.interleave(
        read_data_lines,
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    example_ds = line_ds.map(preprocess, num_parallel_calls=n_parse_threads)
    shuffled_ds = example_ds.shuffle(shuffle_buffer_size)
    repeated_ds = shuffled_ds.repeat(repeat)
    batched_ds = repeated_ds.batch(batch_size)
    return batched_ds.prefetch(1)

In [None]:
# Peek at two small batches to check the shapes produced by the pipeline.
example_set = csv_reader_dataset(train_filepaths, batch_size=3)
for X_batch, y_batch in example_set.take(2):
    print("X =", X_batch)
    print("y =", y_batch)
    print()

X = tf.Tensor(
[[-0.6295897   0.34613404 -0.3875008  -0.33163127 -0.87088525 -0.26279458
   1.3930546  -0.8674123 ]
 [-0.1970167   0.1879177  -0.20000464 -0.15976463 -0.33062097  0.37057492
  -0.73852515  0.75683004]
 [-0.5118485  -0.60316414  0.40889394  0.08682663 -0.1636136   0.04239108
   0.25465012  0.23207329]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[1.483]
 [1.993]
 [1.142]], shape=(3, 1), dtype=float32)

X = tf.Tensor(
[[-0.2974214   1.8491895  -0.06415064 -0.0634404   0.4213685  -0.19656195
   0.9901622  -1.442145  ]
 [-0.8500925   0.42524222 -0.7109054   0.08409034 -0.566074   -0.55816936
  -1.3569175   1.2116159 ]
 [ 0.4069305   1.216324    0.12785536 -0.1086849  -0.3762514  -0.0852778
  -0.77131855  0.5919061 ]], shape=(3, 8), dtype=float32)
y = tf.Tensor(
[[3.5  ]
 [2.   ]
 [3.443]], shape=(3, 1), dtype=float32)



# Using the Dataset with tf.keras

In [None]:
# One pipeline per split, all with the default batch size and shuffle settings.
train_set = csv_reader_dataset(train_filepaths)
valid_set = csv_reader_dataset(valid_filepaths)
test_set = csv_reader_dataset(test_filepaths)

In [None]:
# Small regression MLP fed directly by the tf.data pipelines.
# The input shape is declared with an explicit Input layer instead of passing
# `input_shape=` to the first Dense layer, which recent Keras versions warn about.
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=X_train.shape[1:]),
    tf.keras.layers.Dense(30, activation="relu", kernel_initializer="he_normal"),
    tf.keras.layers.Dense(1),
])
model.compile(loss="mse", optimizer="sgd")
model.fit(train_set, validation_data=valid_set, epochs=5)

Epoch 1/5


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


    362/Unknown [1m3s[0m 5ms/step - loss: 1.3420



[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step - loss: 1.3396 - val_loss: 0.4977
Epoch 2/5




[1m357/363[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - loss: 0.4825



[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 9ms/step - loss: 0.4823 - val_loss: 1.8989
Epoch 3/5




[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.4334 - val_loss: 10.1109
Epoch 4/5
[1m351/363[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 1ms/step - loss: 1.6940



[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 1.6646 - val_loss: 13.2680
Epoch 5/5
[1m360/363[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 1ms/step - loss: 0.5859



[1m363/363[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.5858 - val_loss: 2.0828




<keras.src.callbacks.history.History at 0x7a22d3b47150>

In [None]:
# Evaluate on the test pipeline, then predict on its first three batches.
test_mse = model.evaluate(test_set)
new_set = test_set.take(3)
y_pred = model.predict(new_set)

[1m162/162[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 0.4237




[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step


In [None]:
# Manual training loop over the tf.data pipeline (continues training `model`).
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
loss_fn = tf.keras.losses.MeanSquaredError()

n_epochs = 5
for epoch in range(n_epochs):
    for X_batch, y_batch in train_set:
        # extra code – perform one Gradient Descent step
        #              as explained in Chapter 12
        # "\r" rewrites the same output line, so only the latest epoch is shown
        print("\rEpoch {}/{}".format(epoch + 1, n_epochs), end="")
        with tf.GradientTape() as tape:
            y_pred = model(X_batch)
            main_loss = tf.reduce_mean(loss_fn(y_batch, y_pred))
            # add any extra losses registered on the model (model.losses)
            loss = tf.add_n([main_loss] + model.losses)
        gradients = tape.gradient(loss, model.trainable_variables)
        optimizer.apply_gradients(zip(gradients, model.trainable_variables))

Epoch 5/5

In [None]:
def train_one_epoch(model, optimizer, loss_fn, train_set):
    """Run one pass over `train_set`, taking one optimizer step per batch.

    The loss for each batch is the mean of `loss_fn` plus any extra losses
    registered on the model (`model.losses`).
    """
    for batch_inputs, batch_targets in train_set:
        with tf.GradientTape() as tape:
            predictions = model(batch_inputs)
            data_loss = tf.reduce_mean(loss_fn(batch_targets, predictions))
            total_loss = tf.add_n([data_loss, *model.losses])
        grads = tape.gradient(total_loss, model.trainable_variables)
        optimizer.apply_gradients(zip(grads, model.trainable_variables))

# Same training as the manual loop, with the per-epoch work moved into a function.
# Relies on `n_epochs` defined in the previous training cell.
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
loss_fn = tf.keras.losses.MeanSquaredError()
for epoch in range(n_epochs):
    print("\rEpoch {}/{}".format(epoch + 1, n_epochs), end="")
    train_one_epoch(model, optimizer, loss_fn, train_set)

Epoch 5/5

# The TFRecord Format

In [None]:
# Each write() call stores one record; records here are plain byte strings.
with tf.io.TFRecordWriter("my_data.tfrecord") as f:
    f.write(b"This is the first record")
    f.write(b"And this is the second record")

In [None]:
# Read the records back; each item is a scalar string tensor holding the raw bytes.
filepaths = ["my_data.tfrecord"]
dataset = tf.data.TFRecordDataset(filepaths)
for item in dataset:
    print(item)

tf.Tensor(b'This is the first record', shape=(), dtype=string)
tf.Tensor(b'And this is the second record', shape=(), dtype=string)


# Compressed TFRecord Files

In [None]:
# Write a GZIP-compressed TFRecord file.
options = tf.io.TFRecordOptions(compression_type="GZIP")
with tf.io.TFRecordWriter("my_compressed.tfrecord", options) as f:
    f.write(b"Compress, compress, compress!")

In [None]:
# The reader must be given the same compression type that the writer used.
dataset = tf.data.TFRecordDataset(["my_compressed.tfrecord"],
compression_type="GZIP")

# A Brief Introduction to Protocol Buffers

In [None]:
%%writefile person.proto
// Minimal example schema: one person with a name, a numeric id and
// any number of email addresses.
syntax = "proto3";
message Person {
    string name = 1;
    int32 id = 2;
    repeated string email = 3;  // repeated: zero or more values
}

Overwriting person.proto


In [None]:
import os
import sys

# Detect Colab by checking whether its module is already loaded.
IS_COLAB = "google.colab" in sys.modules

if IS_COLAB:
    # Replace the distribution's protoc with the v25.1 release binary,
    # unpacked under /root/.local and added to PATH.
    !apt remove -y protobuf-compiler
    PB_REL="https://github.com/protocolbuffers/protobuf/releases"
    !curl -LO {PB_REL}/download/v25.1/protoc-25.1-linux-x86_64.zip
    !unzip protoc-25.1-linux-x86_64.zip -d /root/.local
    # Note: appends on every run, so re-running the cell duplicates the PATH entry.
    os.environ["PATH"] += ":/root/.local/bin"

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following package was automatically installed and is no longer required:
  libprotoc23
Use 'apt autoremove' to remove it.
The following packages will be REMOVED:
  protobuf-compiler
0 upgraded, 0 newly installed, 1 to remove and 35 not upgraded.
After this operation, 113 kB disk space will be freed.
(Reading database ... 126319 files and directories currently installed.)
Removing protobuf-compiler (3.12.4-1ubuntu7.22.04.2) ...
Processing triggers for man-db (2.10.2-1) ...
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 3032k  100 3032k    0     0  3645k      0 --:--:-- --:--:-- --:--:-- 3645k
Archive:  protoc-25.1-linux-x86_64.zip
  inflating: /root/.local/bin/protoc  
  inflating: /root/.local/in

In [None]:
# Generate the Python access classes (person_pb2.py) and a descriptor set (person.desc).
!protoc person.proto --python_out=. --descriptor_set_out=person.desc --include_imports


In [None]:
# Confirm that the generated files sit next to person.proto.
%ls person*

person.desc  person_pb2.py  person.proto


In [None]:
from person_pb2 import Person # import the generated access class

# Build a message with keyword arguments; printing shows its text representation.
person = Person(name="Al", id=123, email=["a@b.com"]) # create a Person
print(person) # display the Person

name: "Al"
id: 123
email: "a@b.com"



In [None]:
# Fields are read as ordinary attributes.
person.name

'Al'

In [None]:
# Scalar fields are assignable; repeated fields can be indexed like a list.
person.name = "Alice" # modify a field
person.email[0]

'a@b.com'

In [None]:
# Note: append mutates `person`, so re-running this cell adds the address again.
person.email.append("c@d.com") # add an email address
s = person.SerializeToString() # serialize the object to a byte string
s

b'\n\x05Alice\x10{\x1a\x07a@b.com\x1a\x07c@d.com'

In [None]:
# Round trip: parse the serialized bytes into a fresh message.
# The displayed value (27) equals the length of `s`.
person2 = Person()
person2.ParseFromString(s)

27

In [None]:
# The parsed copy compares equal to the original message.
person == person2

True

# TensorFlow Protobufs

In [None]:
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

# The same contact expressed as a tf.train.Example: a dict of named features,
# each wrapping a list of bytes or int64 values.
person_example = Example(
    features=Features(
        feature={
            "name": Feature(bytes_list=BytesList(value=[b"Alice"])),
            "id": Feature(int64_list=Int64List(value=[123])),
            "emails": Feature(bytes_list=BytesList(value=[b"a@b.com",
                                                          b"c@d.com"]))
        }))

In [None]:
# Write the same serialized Example five times, so the file holds five records.
with tf.io.TFRecordWriter("my_contacts.tfrecord") as f:
    for _ in range(5):
        f.write(person_example.SerializeToString())

# Loading and Parsing Examples

In [None]:
# Parsing schema: scalar name/id with defaults, variable-length list of emails.
feature_description = {
    "name": tf.io.FixedLenFeature([], tf.string, default_value=""),
    "id": tf.io.FixedLenFeature([], tf.int64, default_value=0),
    "emails": tf.io.VarLenFeature(tf.string),
}
# Parse records one at a time; `parsed_example` is overwritten on each iteration,
# so after the loop it holds the last record only.
for serialized_example in tf.data.TFRecordDataset(["my_contacts.tfrecord"]):
    parsed_example = tf.io.parse_single_example(serialized_example,
                                                feature_description)

In [None]:
# The variable-length feature is parsed as a sparse tensor; convert it to dense.
tf.sparse.to_dense(parsed_example["emails"], default_value=b"")

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.com', b'c@d.com'], dtype=object)>

In [None]:
# Alternatively, read the sparse tensor's values directly.
parsed_example["emails"].values

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b'a@b.com', b'c@d.com'], dtype=object)>

In [None]:
# Batch the serialized records first, then parse a whole batch in one call.
dataset = tf.data.TFRecordDataset(["my_contacts.tfrecord"]).batch(10)
for serialized_examples in dataset:
    parsed_examples = tf.io.parse_example(serialized_examples,
                                          feature_description)

# Handling Lists of Lists Using the SequenceExample Protobuf

In [None]:
from tensorflow.train import FeatureList, FeatureLists, SequenceExample

# Context features describe the whole document (author, title, publication date).
context = Features(feature={
    "author_id": Feature(int64_list=Int64List(value=[123])),
    "title": Feature(bytes_list=BytesList(value=[b"A", b"desert", b"place", b"."])),
    "pub_date": Feature(int64_list=Int64List(value=[1623, 12, 25]))
})

# Lists of lists: each inner list holds the tokens of one sentence / one comment.
content = [["When", "shall", "we", "three", "meet", "again", "?"],
           ["In", "thunder", ",", "lightning", ",", "or", "in", "rain", "?"]]
comments = [["When", "the", "hurlyburly", "'s", "done", "."],
            ["When", "the", "battle", "'s", "lost", "and", "won", "."]]

def words_to_feature(words):
    """Wrap a list of str tokens in a bytes-list Feature (UTF-8 encoded)."""
    encoded_words = [token.encode("utf-8") for token in words]
    return Feature(bytes_list=BytesList(value=encoded_words))

# One Feature per sentence / per comment, grouped into two named FeatureLists.
content_features = [words_to_feature(sentence) for sentence in content]
comments_features = [words_to_feature(comment) for comment in comments]

sequence_example = SequenceExample(
    context=context,
    feature_lists=FeatureLists(feature_list={
        "content": FeatureList(feature=content_features),
        "comments": FeatureList(feature=comments_features)
    }))

In [None]:
# Display the text representation of the protobuf.
sequence_example

context {
  feature {
    key: "title"
    value {
      bytes_list {
        value: "A"
        value: "desert"
        value: "place"
        value: "."
      }
    }
  }
  feature {
    key: "pub_date"
    value {
      int64_list {
        value: 1623
        value: 12
        value: 25
      }
    }
  }
  feature {
    key: "author_id"
    value {
      int64_list {
        value: 123
      }
    }
  }
}
feature_lists {
  feature_list {
    key: "content"
    value {
      feature {
        bytes_list {
          value: "When"
          value: "shall"
          value: "we"
          value: "three"
          value: "meet"
          value: "again"
          value: "?"
        }
      }
      feature {
        bytes_list {
          value: "In"
          value: "thunder"
          value: ","
          value: "lightning"
          value: ","
          value: "or"
          value: "in"
          value: "rain"
          value: "?"
        }
      }
    }
  }
  feature_list {
    key: "c

# Preprocessing the Input Features

In [79]:
class Standardization(tf.keras.layers.Layer):
    """Layer that standardizes its inputs with statistics learned by adapt().

    adapt() must be called before the layer is used: call() reads the
    `means_` and `stds_` attributes that adapt() sets.
    """

    def adapt(self, data_sample):
        """Compute per-column mean and standard deviation over axis 0."""
        reduce_kwargs = dict(axis=0, keepdims=True)
        self.means_ = np.mean(data_sample, **reduce_kwargs)
        self.stds_ = np.std(data_sample, **reduce_kwargs)

    def call(self, inputs):
        """Return (inputs - mean) / (std + epsilon)."""
        centered = inputs - self.means_
        # epsilon keeps the division finite when a column's std is zero
        scale = self.stds_ + tf.keras.backend.epsilon()
        return centered / scale

# Encoding Categorical Features Using One-Hot Vectors

In [81]:
# Lookup table from category string to its position in `vocab`, with two
# extra buckets for categories that are not in the vocabulary.
vocab = ["<1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(len(vocab), dtype=tf.int64)
table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

In [82]:
# "DESERT" is not in `vocab`, so it maps to an out-of-vocabulary index (5 below).
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [83]:
# The depth must cover both the vocabulary and the out-of-vocabulary buckets.
cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)
cat_one_hot

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

# Encoding Categorical Features Using Embeddings

In [84]:
# One trainable row per category index (vocabulary + OOV buckets).
# The initial values are random and unseeded, so they differ between runs.
embedding_dim = 2
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)

In [85]:
# Display the randomly initialized embedding matrix.
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.74961483, 0.332101  ],
       [0.5544692 , 0.17776561],
       [0.554966  , 0.14195704],
       [0.8732903 , 0.95374715],
       [0.97579014, 0.40961385],
       [0.6553699 , 0.8525717 ],
       [0.8483422 , 0.11787355]], dtype=float32)>

In [86]:
# Same lookup as in the one-hot section, repeated here for the embedding example.
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1])>

In [87]:
# Select the rows of the embedding matrix at the given indices.
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.8732903 , 0.95374715],
       [0.6553699 , 0.8525717 ],
       [0.5544692 , 0.17776561],
       [0.5544692 , 0.17776561]], dtype=float32)>

In [89]:
# Keras equivalent: the layer owns the (trainable) embedding matrix itself.
embedding = tf.keras.layers.Embedding(input_dim=len(vocab) + num_oov_buckets,
output_dim=embedding_dim)
embedding(cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[ 0.01165948, -0.02557279],
       [ 0.0157169 , -0.00373257],
       [ 0.03906156, -0.03115538],
       [ 0.03906156, -0.03115538]], dtype=float32)>

In [92]:
import tensorflow as tf

# Example: map string categories to integer indices.
# NOTE: this cell rebinds `vocab`, `table`, `categories`, `cat_indices` and `model`,
# which earlier cells defined with different contents.
vocab = ["A", "B", "C", "D", "E", "F"]
table = tf.keras.layers.StringLookup(vocabulary=vocab)

# Numeric and categorical inputs
regular_inputs = tf.keras.layers.Input(shape=[8])
categories = tf.keras.layers.Input(shape=(), dtype=tf.string)

# Look up the index of each category
cat_indices = tf.keras.layers.Lambda(
    lambda cats: table(cats),
    output_shape=lambda s: s,   # output shape is the same as the input shape
    dtype=tf.int64
)(categories)

# Embedding layer
# NOTE(review): presumably the +1 accounts for StringLookup's out-of-vocabulary index - confirm
cat_embed = tf.keras.layers.Embedding(input_dim=len(vocab)+1, output_dim=2)(cat_indices)

# Concatenate the numeric inputs with the embeddings
encoded_inputs = tf.keras.layers.Concatenate()([regular_inputs, cat_embed])
outputs = tf.keras.layers.Dense(1)(encoded_inputs)

# Build the model
model = tf.keras.models.Model(inputs=[regular_inputs, categories], outputs=[outputs])


# Keras Preprocessing Layers

In [98]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Normalization, Discretization

# Tiny two-column sample used to fit the normalization statistics.
data_sample = tf.constant([[1.0, 2.0], [3.0, 4.0]])

normalization = Normalization()
normalization.adapt(data_sample)

discretization = Discretization(bin_boundaries=[2.5])

# Preprocessing pipeline: normalize both columns, keep column 0 as is and
# replace column 1 by its bucket index.
# Both pieces are kept 2-D (x[:, :1] and x[:, 1:2]) and the integer bucket
# indices are cast to the input dtype so they can be joined with tf.concat;
# stacking a rank-1 float slice with a rank-2 integer tensor fails when the
# layer is called.
# NOTE(review): the bin boundary (2.5) is applied to the *normalized* values -
# confirm that this is the intended scale.
pipeline = keras.Sequential([
    normalization,
    keras.layers.Lambda(lambda x: tf.concat(
        [x[:, :1], tf.cast(discretization(x[:, 1:2]), x.dtype)], axis=1))
])


# TF Transform

In [100]:
!pip install -q tensorflow-transform

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m89.7/89.7 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.0/152.0 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.5/43.5 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m173.5/173.5 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m451.5/451.5 kB[0m [31m16.8 MB/s[0m eta

In [101]:
import tensorflow_transform as tft

def preprocess(inputs): # inputs = a batch of input features
    """TF Transform preprocessing function.

    Reads the "housing_median_age" and "ocean_proximity" entries of `inputs`
    and returns a dict with the z-score-scaled age and the vocabulary id of
    the ocean proximity.

    NOTE(review): this reuses the name `preprocess`, which the CSV pipeline
    (csv_reader_dataset) looks up globally. Once this cell has run, later
    calls to csv_reader_dataset pick up this function instead of the
    CSV-line parser.
    """
    age_column, proximity_column = (inputs["housing_median_age"],
                                    inputs["ocean_proximity"])
    outputs = {}
    outputs["standardized_median_age"] = tft.scale_to_z_score(age_column)
    outputs["ocean_proximity_id"] = tft.compute_and_apply_vocabulary(
        proximity_column)
    return outputs

# The TensorFlow Datasets (TFDS) Project

In [102]:
import tensorflow_datasets as tfds
# tfds.load returns a dict of splits here; each item is a dict with "image" and "label".
dataset = tfds.load(name="mnist")
mnist_train, mnist_test = dataset["train"], dataset["test"]



Downloading and preparing dataset Unknown size (download: Unknown size, generated: Unknown size, total: Unknown size) to /root/tensorflow_datasets/mnist/3.0.1...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

Generating splits...:   0%|          | 0/2 [00:00<?, ? splits/s]

Generating train examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/mnist/incomplete.78GNSQ_3.0.1/mnist-train.tfrecord*...:   0%|          | 0…

Generating test examples...: 0 examples [00:00, ? examples/s]

Shuffling /root/tensorflow_datasets/mnist/incomplete.78GNSQ_3.0.1/mnist-test.tfrecord*...:   0%|          | 0/…

Dataset mnist downloaded and prepared to /root/tensorflow_datasets/mnist/3.0.1. Subsequent calls will reuse this data.


In [104]:
# NOTE(review): this rebinds `mnist_train` to the shuffled + batched pipeline, so
# re-running this cell (or batching `mnist_train` again in a later cell) stacks
# another batch dimension on top.
mnist_train = mnist_train.shuffle(10000).batch(32).prefetch(1)
# Items are dicts; the loop walks the whole dataset and keeps only the last batch.
for item in mnist_train:
    images = item["image"]
    labels = item["label"]

In [105]:
# Start again from the raw training split: `mnist_train` was already shuffled
# and batched by the previous cell, so chaining .batch(32) onto it would yield
# batches of batches.
mnist_train = dataset["train"].shuffle(10000).batch(32)
# Keras expects (features, labels) tuples rather than feature dicts.
mnist_train = mnist_train.map(lambda items: (items["image"], items["label"]))
mnist_train = mnist_train.prefetch(1)

In [107]:
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras

# Load MNIST already batched, as (image, label) tuples
dataset = tfds.load(name="mnist", batch_size=32, as_supervised=True)
mnist_train = dataset["train"].prefetch(1)

# Build the Sequential model.
# The input shape is declared with an explicit Input layer instead of passing
# `input_shape=` to the first layer, which recent Keras versions warn about.
model = keras.models.Sequential([
    keras.layers.Input(shape=(28, 28, 1)),
    keras.layers.Rescaling(1./255),  # scale pixel values by 1/255
    keras.layers.Flatten(),  # flatten each 28x28 image into 784 values
    keras.layers.Dense(128, activation="relu"),
    keras.layers.Dense(10, activation="softmax")  # MNIST has 10 classes
])

# Compile the model
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer="sgd",
    metrics=["accuracy"]
)

# Train the model
model.fit(mnist_train, epochs=5)


Epoch 1/5


  super().__init__(**kwargs)


[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 13ms/step - accuracy: 0.7359 - loss: 1.0152
Epoch 2/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 2ms/step - accuracy: 0.9040 - loss: 0.3473
Epoch 3/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - accuracy: 0.9190 - loss: 0.2905
Epoch 4/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9272 - loss: 0.2586
Epoch 5/5
[1m1875/1875[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 2ms/step - accuracy: 0.9335 - loss: 0.2354


<keras.src.callbacks.history.History at 0x7a22b874a110>

# Exercise Solutions

1. Why would you want to use the Data API?
> Karena tf.data API efisien, scalable, bisa menangani data besar, mendukung pipeline paralel dan prefetching.

2. What are the benefits of splitting a large dataset into multiple files?
> Lebih mudah dibaca paralel, mengurangi I/O bottleneck, memungkinkan distribusi training di beberapa worker.

3. During training, how can you tell that your input pipeline is the bottleneck?
What can you do to fix it?
> GPU/CPU idle, training lambat. Solusinya: gunakan .cache(), .prefetch(), paralel loading dengan .map(..., num_parallel_calls=...).

4. Can you save any binary data to a TFRecord file, or only serialized protocol
buffers?
> Bisa. TFRecord dapat menyimpan data biner apa pun asalkan dikonversi ke bytes; isinya tidak harus protocol buffer yang diserialisasi.

5. Why would you go through the hassle of converting all your data to the Example
protobuf format? Why not use your own protobuf definition?
> Karena Example sudah didukung penuh TensorFlow (tools, API, kompatibilitas). Custom protobuf butuh lebih banyak kerja tambahan.

6. When using TFRecords, when would you want to activate compression? Why
not do it systematically?
> Saat file besar dan I/O lambat. Tapi jangan selalu karena kompresi bisa memperlambat decoding dan tidak semua tool mendukungnya.

7. Data can be preprocessed directly when writing the data files, or within the
tf.data pipeline, or in preprocessing layers within your model, or using TF Transform.
Can you list a few pros and cons of each option?
> Saat menulis file: cepat saat training, tapi tidak fleksibel. Di pipeline tf.data: fleksibel dan efisien. Di preprocessing layer: end-to-end training, tapi bisa lambat. Dengan TF Transform: cocok untuk produksi (consistency antara training dan serving), tapi setup lebih rumit.

8. Name a few common techniques you can use to encode categorical features.
What about text?
> Categorical: one-hot, label encoding, embedding. Text: bag-of-words, TF-IDF, embeddings (Word2Vec, GloVe), tokenizer + embedding layer.

9. Load the Fashion MNIST dataset (introduced in Chapter 10); split it into a training
set, a validation set, and a test set; shuffle the training set; and save each
dataset to multiple TFRecord files. Each record should be a serialized Example
protobuf with two features: the serialized image (use tf.io.serialize_tensor() to serialize each image), and the label. Then use tf.data to create an efficient
dataset for each set. Finally, use a Keras model to train these datasets, including a
preprocessing layer to standardize each input feature. Try to make the input
pipeline as efficient as possible, using TensorBoard to visualize profiling data.

In [108]:
#  Load and split the dataset
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os

# Load Fashion MNIST
# NOTE: this rebinds X_train / X_valid / X_test (and the y_* arrays), which
# earlier cells bound to the California housing splits.
(X_train_full, y_train_full), (X_test, y_test) = keras.datasets.fashion_mnist.load_data()

# Split into training and validation sets (the last 5000 samples are held out)
X_train, X_valid = X_train_full[:-5000], X_train_full[-5000:]
y_train, y_valid = y_train_full[:-5000], y_train_full[-5000:]


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
[1m29515/29515[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
[1m26421880/26421880[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
[1m5148/5148[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1us/step
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz
[1m4422102/4422102[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step


In [109]:
# 2. Helper that serializes one sample into a tf.train.Example
def serialize_example(image, label):
    """Return the serialized Example protobuf for one (image, label) pair.

    The image is stored under "image" as the bytes produced by
    tf.io.serialize_tensor; the label is stored under "label" as an int64.
    """
    image_bytes = tf.io.serialize_tensor(image).numpy()
    features = tf.train.Features(feature={
        "image": tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[image_bytes])),
        "label": tf.train.Feature(
            int64_list=tf.train.Int64List(value=[label])),
    })
    return tf.train.Example(features=features).SerializeToString()


In [110]:
# Save to several TFRecord files
def save_to_tfrecord(images, labels, filename_prefix, n_shards=10):
    """Write (image, label) pairs into `n_shards` files under tfrecord_data/.

    Samples are split into contiguous shards of len(images) // n_shards
    items; the last shard also takes the remainder. Files are named
    ``<filename_prefix>_<NN>.tfrecord``.
    """
    os.makedirs("tfrecord_data", exist_ok=True)
    total = len(images)
    per_shard = total // n_shards
    last_shard = n_shards - 1

    for shard_no in range(n_shards):
        lo = shard_no * per_shard
        # the final shard runs to the end so no sample is dropped
        hi = total if shard_no >= last_shard else lo + per_shard
        shard_path = f"tfrecord_data/{filename_prefix}_{shard_no:02d}.tfrecord"
        with tf.io.TFRecordWriter(shard_path) as writer:
            for image, label in zip(images[lo:hi], labels[lo:hi]):
                writer.write(serialize_example(image, label))

In [111]:
# Write each split to its own set of shard files (default: 10 shards per split).
save_to_tfrecord(X_train, y_train, "train")
save_to_tfrecord(X_valid, y_valid, "valid")
save_to_tfrecord(X_test, y_test, "test")

In [121]:
#  TFRecord parsing function
def parse_example(example_proto):
    """Decode one serialized Example into a (image, label) pair.

    The image bytes are parsed back into a uint8 tensor, reshaped to 28x28,
    cast to float32 and divided by 255. The label is returned as parsed (int64).
    """
    schema = {
        "image": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.FixedLenFeature([], tf.int64)
    }
    fields = tf.io.parse_single_example(example_proto, schema)
    pixels = tf.io.parse_tensor(fields["image"], out_type=tf.uint8)
    pixels = tf.reshape(pixels, [28, 28])  # restore the 28x28 image shape
    scaled = tf.cast(pixels, tf.float32) / 255.0
    return scaled, fields["label"]


In [122]:
# NOTE(review): `train_set` is not built from the Fashion MNIST TFRecords here.
# Its only visible assignment is the housing CSV pipeline, which is why the
# printed shape is (32, 8) rather than a batch of 28x28 images. To inspect the
# TFRecord pipeline, first build a dataset with load_dataset() (defined in a
# later cell) and iterate over that instead.
for x, y in train_set.take(1):
    print("image shape:", x.shape)


image shape: (32, 8)


In [117]:
# 5. Load the TFRecord files as a tf.data.Dataset
def load_dataset(file_pattern, batch_size=32, shuffle_buffer=10000):
    """Build a shuffled, batched dataset of parsed examples from TFRecord files.

    Files matching `file_pattern` are read four at a time (interleaved), each
    record is decoded with parse_example, then the stream is shuffled, batched
    and prefetched.
    """
    def read_records(file):
        return tf.data.TFRecordDataset(file).map(parse_example)

    shard_files = tf.data.Dataset.list_files(file_pattern)
    examples = shard_files.interleave(
        read_records,
        cycle_length=4, num_parallel_calls=tf.data.AUTOTUNE
    )
    shuffled = examples.shuffle(shuffle_buffer)
    return shuffled.batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [118]:
# 6. Build and compile the model (no fit call is visible in this cell)

# Fit the standardization statistics before building the model: a Normalization
# layer that is never adapted has no statistics learned from the data.
# The sample is flattened and divided by 255 so that it matches what the layer
# receives inside the model (parse_example scales the pixels, Flatten comes first).
pixel_normalization = keras.layers.Normalization()
pixel_normalization.adapt(
    X_train.reshape(len(X_train), -1).astype("float32") / 255.0)

# The input shape is declared with an explicit Input layer instead of passing
# `input_shape=` to Flatten, which recent Keras versions warn about.
model = keras.models.Sequential([
    keras.layers.Input(shape=[28, 28]),
    keras.layers.Flatten(),
    pixel_normalization,  # preprocessing layer
    keras.layers.Dense(100, activation="relu"),
    keras.layers.Dense(10, activation="softmax")
])
model.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam",
              metrics=["accuracy"])


  super().__init__(**kwargs)
