In [None]:
import functools
import pathlib
from os import getcwd

import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.model_selection import train_test_split
from tensorflow import feature_column
from tensorflow.keras import layers

# From Numpy File (.npz)

In [None]:
# Fetch the MNIST dataset, which is distributed as a single NumPy archive (.npz)
DATA_URL = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz"
path = tf.keras.utils.get_file("mnist.npz", DATA_URL)

# Extract train, test sets. The arrays are read out inside the `with` block
# because the archive is closed as soon as the block exits.
with np.load(path) as data:
    train_examples = data["x_train"]
    train_labels = data["y_train"]
    test_examples = data["x_test"]
    test_labels = data["y_test"]

# Load them with tf.data: each element is one (example, label) pair
train_dataset = tf.data.Dataset.from_tensor_slices((train_examples, train_labels))
test_dataset = tf.data.Dataset.from_tensor_slices((test_examples, test_labels))

# Apply transformations like batch, shuffle to the dataset.
# Only the training split is shuffled; the test split keeps its order.
train_dataset = train_dataset.shuffle(100).batch(64)
test_dataset = test_dataset.batch(64)

# Per-example shape (everything after the leading sample axis). Reading it from
# the array avoids pulling a whole batch out of the pipeline just to inspect it.
input_shape = train_examples.shape[1:]

# Create a simple sequential model: flatten -> hidden Dense -> 10-way softmax.
# The original cell had `...` placeholders here, which are not valid layers or
# compile arguments; a minimal concrete configuration is filled in instead.
model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=input_shape),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10, activation='softmax')
])
# Labels are integer class ids and the model outputs probabilities, hence the
# sparse categorical cross-entropy loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

# Train the model
model.fit(train_dataset, epochs=10)



# From CSV

In [None]:
# Method 1: load the CSV with pandas and feed the raw value matrix to tf.data

csv_file = tf.keras.utils.get_file(
    "heart.csv",
    "https://storage.googleapis.com/applied-dl/heart.csv",
)

df = pd.read_csv(csv_file)
# Replace the `thal` column with integer category codes so the whole frame
# can be handed to the model as one numeric matrix.
df["thal"] = pd.Categorical(df["thal"])
df["thal"] = df.thal.cat.codes

# Split the label column off; `df` now holds features only
target = df.pop("target")

dataset = tf.data.Dataset.from_tensor_slices((df.values, target.values))
# A shuffle buffer as large as the frame shuffles across all rows
train_dataset = dataset.shuffle(len(df)).batch(32)

model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(10, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'])

# Fixed: the original called the non-existent `model.git(...)`
model.fit(train_dataset, epochs=15)

In [None]:
# Method 2: feed the model a dictionary of named feature columns
csv_file = tf.keras.utils.get_file(
    "heart.csv",
    "https://storage.googleapis.com/applied-dl/heart.csv",
)

df = pd.read_csv(csv_file)

# Encode `thal` as integer category codes (as method 1 does). Every feature is
# stacked into a single numeric tensor below, so no column may stay non-numeric.
df["thal"] = pd.Categorical(df["thal"]).codes

target = df.pop("target")

# Each element is ({column name: batch of values}, batch of labels)
dict_slices = tf.data.Dataset.from_tensor_slices((df.to_dict('list'), target.values)).batch(16)

# Preview the first batch. Distinct loop names keep the `target` Series intact,
# and the labels are printed once instead of being zipped against the feature
# columns (which paired the i-th column with the i-th row's label).
for batch_features, batch_labels in dict_slices.take(1).as_numpy_iterator():
    for feature, value in batch_features.items():
        print("{} = {}".format(feature, value))
    print("Label = {}".format(batch_labels))

# Constructing the inputs for all the dense features: one scalar input per column
inputs = {key: tf.keras.layers.Input(shape=(), name=key) for key in df.keys()}
# Stack the per-column scalars into one (batch, n_features) tensor
x = tf.stack(list(inputs.values()), axis=1)
x = tf.keras.layers.Dense(10, activation='relu')(x)

# The single output denoting the target's probability
output = tf.keras.layers.Dense(1, activation='sigmoid')(x)

model = tf.keras.Model(inputs=inputs, outputs=output)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

model.fit(dict_slices, epochs=15)

In [None]:
# Method 3: stream the CSV straight from disk with tf.data

# Download (or reuse the cached copy of) the Titanic train and eval splits
train_file_path = tf.keras.utils.get_file(
    fname="train.csv",
    origin="https://storage.googleapis.com/tf-datasets/titanic/train.csv",
)
test_file_path = tf.keras.utils.get_file(
    fname="Eval.csv",
    origin="https://storage.googleapis.com/tf-datasets/titanic/eval.csv",
)

# Extract the data
def get_dataset(file_path, **kwargs):
    """Build a batched tf.data dataset from a CSV file.

    Thin wrapper around ``tf.data.experimental.make_csv_dataset`` that applies
    this notebook's defaults (label column ``"survived"``, ``"?"`` treated as
    missing, a single pass over the file, malformed rows skipped).

    Args:
        file_path: Path of the CSV file to read.
        **kwargs: Extra options forwarded to ``make_csv_dataset`` (for example
            ``column_names``, ``select_columns`` or ``column_defaults``). A
            keyword that matches one of the defaults below overrides it.

    Returns:
        Whatever ``make_csv_dataset`` returns for the given options.
    """
    options = {
        "batch_size": 5,  # Artificially small to make examples easier to show
        "label_name": "survived",
        "na_value": "?",
        "num_epochs": 1,
        "ignore_errors": True,
    }
    # Merge instead of passing both sets of keywords directly, so overriding a
    # default does not raise a "multiple values for keyword argument" TypeError.
    options.update(kwargs)
    return tf.data.experimental.make_csv_dataset(file_path, **options)

# Build the unprocessed train and eval datasets, in that order
raw_train_data, raw_test_data = map(get_dataset, (train_file_path, test_file_path))

def show_batch(dataset):
    """Print every feature of the first element of ``dataset``.

    One line is printed per feature: the key left-aligned in a 20-character
    field, then the value's ``.numpy()`` result. The label of the element is
    read but not printed.

    Args:
        dataset: An object whose ``take(1)`` yields ``(features, label)`` pairs,
            where ``features`` is a mapping of key to a value with ``.numpy()``.
    """
    for features, _ in dataset.take(1):
        for name in features:
            print("{:20s}: {}".format(name, features[name].numpy()))
# Preview one batch from a freshly built training dataset
preview_dataset = get_dataset(train_file_path)
show_batch(preview_dataset)

In [None]:
# Getting data from named columns
## Method 1: name every column explicitly
CSV_COLUMNS = [
    "survived",
    "sex",
    "age",
    "n_siblings_spouses",
    "parch",
    "fare",
    "class",
    "deck",
    "embark_town",
    "alone"
]
temp_dataset = get_dataset(train_file_path, column_names=CSV_COLUMNS)

show_batch(temp_dataset)

## Method 2: keep only a subset of the columns
SELECT_COLUMNS = [
    "survived",
    "age",
    "n_siblings_spouses",
    "class",
    "deck",
    "alone"
]
temp_dataset = get_dataset(train_file_path, select_columns=SELECT_COLUMNS)
show_batch(temp_dataset)

## Method 3: specify the data types directly through per-column defaults
SELECT_COLUMNS = [
    "survived",
    "age",
    "n_siblings_spouses",  # fixed typo: was "n_sibilings_spouses"
    "parch",
    "fare"
]
# One default per selected column; an int default gives an integer column and
# a float default gives a float column.
DEFAULTS = [0, 0.0, 0.0, 0.0, 0.0]
temp_dataset = get_dataset(
    train_file_path,
    select_columns=SELECT_COLUMNS,
    column_defaults=DEFAULTS  # fixed typo: was `colomn_defaults`
)

def pack(features, labels):
    """Pack all feature columns into a single tensor.

    Args:
        features: Mapping of column name to that column's values.
        labels: Labels belonging to ``features``; passed through untouched.

    Returns:
        A ``(packed, labels)`` tuple, where ``packed`` stacks every value of
        ``features`` along a new last axis.
    """
    # Fixed: the original returned the undefined name `label`.
    return tf.stack(list(features.values()), axis=-1), labels

# Run `pack` over every (features, labels) element of the typed dataset
packed_dataset = temp_dataset.map(map_func=pack)
## end

# Packing numeric features

# Columns that get merged into the single "numeric" feature
NUMERIC_FEATURES = ["age", "n_siblings_spouses", "parch", "fare"]

class PackNumericFeatures(object):
    """Callable that merges selected feature columns into one ``"numeric"`` entry.

    Intended to be passed to ``Dataset.map``: each call receives a
    ``(features, labels)`` pair and returns the transformed pair.
    """

    def __init__(self, names):
        """Remember which feature keys to pack.

        Args:
            names: Iterable of keys to remove from the feature mapping and
                stack together, in this order.
        """
        self.names = names

    def __call__(self, features, labels):
        """Pack the configured columns of ``features``.

        Args:
            features: Mapping of feature key to value. It is not modified.
            labels: Labels belonging to ``features``; passed through untouched.

        Returns:
            A ``(features, labels)`` tuple where the returned mapping has the
            configured keys removed and a ``"numeric"`` key added, holding the
            float32-cast values stacked along a new last axis.

        Raises:
            KeyError: If one of the configured names is missing from ``features``.
        """
        # Work on a shallow copy so the caller's mapping is not mutated by pop().
        features = dict(features)
        numeric_features = [
            tf.cast(features.pop(name), tf.float32) for name in self.names
        ]
        features["numeric"] = tf.stack(numeric_features, axis=-1)
        return features, labels

# A single packer serves both splits; it only stores the column names
pack_numeric = PackNumericFeatures(NUMERIC_FEATURES)

packed_train_data = raw_train_data.map(pack_numeric)
packed_test_data = raw_test_data.map(pack_numeric)

# Preview the packed training features
show_batch(packed_train_data)

# Normalizing Numeric features

# Columns to normalize (same four columns that were packed above)
NUMERIC_FEATURES = ["age", "n_siblings_spouses", "parch", "fare"]


def normalize_numeric_data(data, mean, std):
    """Standardize ``data``: subtract ``mean``, then divide by ``std``.

    Args:
        data: Values to normalize.
        mean: Value(s) subtracted from ``data``.
        std: Value(s) the centered data is divided by.

    Returns:
        The result of ``(data - mean) / std``.
    """
    centered = data - mean
    return centered / std

# Summary statistics of the numeric training columns (one column per feature)
desc = pd.read_csv(train_file_path)[NUMERIC_FEATURES].describe()

# Per-feature mean and standard deviation, in NUMERIC_FEATURES order
MEAN = np.array(desc.loc['mean'])
STD = np.array(desc.loc['std'])

# Freeze the statistics so the normalizer takes only the data argument
normalizer = functools.partial(normalize_numeric_data, mean=MEAN, std=STD)

# Feature column that reads the packed 'numeric' entry and normalizes it
numeric_column = tf.feature_column.numeric_column(
    key='numeric',
    shape=[len(NUMERIC_FEATURES)],
    normalizer_fn=normalizer,
)

# Handle categorical features

# Vocabulary of each categorical column. Values outside a vocabulary are not
# matched by any entry of the list.
CATEGORIES = {
    "sex": ["male", "female"],
    "class": ["First", "Second", "Third"],
    "deck": ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J"],
    "embark_town": ["Cherbourg", "Southampton", "Queenstown"],  # fixed spelling
    "alone": ["y", "n"]
}

# One one-hot (indicator) column per categorical feature. The original built a
# column for "class" only and never used CATEGORIES.
categorical_columns = [
    tf.feature_column.indicator_column(
        tf.feature_column.categorical_column_with_vocabulary_list(
            key=feature,
            vocabulary_list=vocab
        )
    )
    for feature, vocab in CATEGORIES.items()
]

# Training Model

# DenseFeatures expects a list of feature columns. The original used `+` on two
# column objects, which does not produce such a list.
dense_features = tf.keras.layers.DenseFeatures(categorical_columns + [numeric_column])

model = tf.keras.Sequential([
    dense_features,
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model.fit(packed_train_data, epochs=20)

# From Image Data

In [None]:
# The original held the placeholder "insert URL here". The archive below is
# assumed from fname="flower_photos" (TensorFlow's flower photos example set);
# replace it if a different image archive is intended.
DATA_URL = "https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz"
data_root_orig = tf.keras.utils.get_file(
    origin=DATA_URL,
    fname="flower_photos",
    untar=True
)
data_root = pathlib.Path(data_root_orig)

# Load all the file paths in the directory: one sub-directory per class,
# image files directly inside it
all_image_paths = list(data_root.glob('*/*'))
all_image_paths = [str(path) for path in all_image_paths]

# Gather the list of labels and create a labelmap (directory name -> integer id)
label_names = sorted(item.name for item in data_root.glob('*/') if item.is_dir())
label_to_index = dict((name, index) for index, name in enumerate(label_names))

# Use the label map to fetch all categorical labels: an image's label is the
# name of its parent directory. Fixed: the original iterated over
# `all_image_labels`, the very name being defined on this line.
all_image_labels = [label_to_index[pathlib.Path(path).parent.name] for path in all_image_paths]

path_ds = tf.data.Dataset.from_tensor_slices(all_image_paths)
label_ds = tf.data.Dataset.from_tensor_slices(all_image_labels)

def preprocess_image(path):
    """Load the JPEG at ``path`` as a resized, rescaled image.

    Args:
        path: Location of the image file to read.

    Returns:
        The image decoded with 3 channels, resized to 192x192 and divided
        by 255.0.
    """
    raw_bytes = tf.io.read_file(path)
    decoded = tf.image.decode_jpeg(raw_bytes, channels=3)
    resized = tf.image.resize(decoded, [192, 192])
    return resized / 255.0  # normalize to [0,1] range

# Decode every path into an image tensor, then pair images with their labels
image_ds = path_ds.map(preprocess_image)
image_label_ds = tf.data.Dataset.zip((image_ds, label_ds))

BATCH_SIZE = 32
# repeat() makes the dataset endless, so steps_per_epoch must bound each epoch
ds = image_label_ds.shuffle(buffer_size=len(all_image_paths)).repeat().batch(BATCH_SIZE)

# Ceiling division in plain Python. Keras documents steps_per_epoch as
# "Integer or None"; the original passed a float32 scalar.
steps_per_epoch = -(-len(all_image_paths) // BATCH_SIZE)

# The original fitted whatever `model` was left over from an earlier cell,
# which was built for tabular features. Define a small image classifier for
# the 192x192 RGB inputs produced by preprocess_image.
model = tf.keras.Sequential([
    tf.keras.layers.Conv2D(16, 3, activation='relu', input_shape=(192, 192, 3)),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.Conv2D(32, 3, activation='relu'),
    tf.keras.layers.MaxPooling2D(),
    tf.keras.layers.GlobalAveragePooling2D(),
    tf.keras.layers.Dense(len(label_names), activation='softmax'),
])
# Labels are integer class ids, hence the sparse loss
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(ds, epochs=1, steps_per_epoch=steps_per_epoch)





Details on choosing an appropriate value for `steps_per_epoch`, quoted from the
[tf.keras.Model documentation](https://www.tensorflow.org/api_docs/python/tf/keras/Model?version=stable):

steps_per_epoch: 

Integer or None. 

Total number of steps (batches of samples) before declaring one epoch finished and starting the next epoch. 

When training with input tensors such as TensorFlow data tensors, the default None is equal to the number of samples in your dataset divided by the batch size, or 1 if that cannot be determined. 

If x is a tf.data dataset, and 'steps_per_epoch' is None, the epoch will run until the input dataset is exhausted.

# From Text Data

In [None]:
# Base URL of the text files, and the files to fetch from it
DIRECTORY_URL = "https://storage.googleapis.com/download.tensorflow.org/data/illiad/"
FILE_NAMES = ["cowper.txt", "derby.txt", "butler.txt"]


def labeler(example, index):
    """Attach a label to a text example.

    Args:
        example: The example to label; returned unchanged.
        index: Label value; cast to ``tf.int64``.

    Returns:
        An ``(example, label)`` tuple.
    """
    label = tf.cast(index, tf.int64)
    return example, label

# Build one labeled dataset per file; the label is the file's index in FILE_NAMES
labeled_data_sets = []
for i, file_name in enumerate(FILE_NAMES):
    # Fixed: the original passed the undefined name `name` as the cache file name
    file_path = tf.keras.utils.get_file(file_name, origin=DIRECTORY_URL + file_name)
    lines_dataset = tf.data.TextLineDataset(file_path)  # one element per line of text
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex, i))
    labeled_data_sets.append(labeled_dataset)

# Preparing the dataset

# Concatenate the per-file datasets. The result is named `all_labeled_data`
# because that is the name every later step reads; the original stored it in
# `dataset`, leaving `all_labeled_data` undefined.
all_labeled_data = labeled_data_sets[0]
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)

# Keep the shuffled order fixed across iterations. The data is iterated more
# than once (vocabulary pass, then take()/skip() for the test/train split);
# reshuffling each time would let the same line land in both splits.
all_labeled_data = all_labeled_data.shuffle(
    buffer_size=50000,
    reshuffle_each_iteration=False
)

for ex in all_labeled_data.take(5):
    print(ex[0].numpy(), ex[1].numpy())

# Text Encoding

# Newer tensorflow_datasets releases expose these helpers under
# `tfds.deprecated.text`; older ones under `tfds.features.text`.
text_api = tfds.deprecated.text if hasattr(tfds, "deprecated") else tfds.features.text

tokenizer = text_api.Tokenizer()

# Collect every distinct token seen in the corpus
vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)

# Encode an example

original_text = next(iter(all_labeled_data))[0].numpy()  # Take one labeled line as a sample
encoder = text_api.TokenTextEncoder(vocabulary_set)  # Text encoder with a fixed vocabulary set
encoded_text = encoder.encode(original_text)  # Encode the sample

# Encode all the examples

def encode(text_tensor, label):
    """Encode one text example with the module-level ``encoder``.

    Args:
        text_tensor: Object whose ``.numpy()`` gives the raw text to encode.
        label: Label of the example; returned unchanged.

    Returns:
        An ``(encoded_text, label)`` tuple.
    """
    raw_text = text_tensor.numpy()
    return encoder.encode(raw_text), label

def encode_map_fn(text, label):
    """Wrap ``encode`` so it can be used inside ``Dataset.map``.

    ``encode`` calls ``.numpy()``, so it is run through ``tf.py_function``.

    Args:
        text: Text tensor of one example.
        label: Label tensor of the same example.

    Returns:
        An ``(encoded_text, label)`` tuple of int64 tensors with their static
        shapes set: a 1-D sequence of unknown length and a scalar.
    """
    encoded_text, label = tf.py_function(
        encode, inp=[text, label], Tout=(tf.int64, tf.int64)
    )
    # py_function outputs carry no static shape information; restore it so
    # downstream batching and Keras layers can rely on the rank.
    encoded_text.set_shape([None])
    label.set_shape([])
    return encoded_text, label

# Encode every (text, label) pair of the corpus
all_encoded_data = all_labeled_data.map(encode_map_fn)

# Prepare the dataset

BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

# Shapes used when padding a batch: variable-length token sequence, scalar label
padded_shapes = ([-1], [])

# The first TAKE_SIZE examples form the test split ...
test_data = all_encoded_data.take(TAKE_SIZE).padded_batch(
    BATCH_SIZE, padded_shapes=padded_shapes
)

# ... and everything after them, shuffled, forms the training split
train_data = (
    all_encoded_data
    .skip(TAKE_SIZE)
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=padded_shapes)
)

# Training the model

model = tf.keras.Sequential([
    # Size the embedding table from the encoder rather than from the raw
    # vocabulary set: the encoder's id space also covers the padding id and
    # the out-of-vocabulary id, so len(vocabulary_set) is too small and would
    # leave the largest ids without a row in the table.
    tf.keras.layers.Embedding(encoder.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    # Two hidden Dense layers, listed directly instead of through a nested
    # Sequential built from a comprehension
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    # One output per source file (three translations)
    tf.keras.layers.Dense(3, activation='softmax')
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.fit(train_data, epochs=3)