## Setup

In [1]:
import tensorflow as tf
from tfx.components import CsvExampleGen, StatisticsGen, SchemaGen, ExampleValidator, Transform, Trainer, Tuner
from tfx.proto import example_gen_pb2
from tfx.orchestration.experimental.interactive.interactive_context import InteractiveContext
import os

## Set Variables

In [None]:
import pandas as pd

# Load the sentiment dataset CSV from its hf:// URL into a DataFrame.
df = pd.read_csv("hf://datasets/dwisaji/indonesia-telecomunication-sentiment-dataset/dataset4.csv")


# Path of the 'data' folder, relative to the current working directory
save_dir = "data"
os.makedirs(save_dir, exist_ok=True)  # Create the folder if it does not exist yet

# Save the CSV file inside the 'data' folder
save_path = os.path.join(save_dir, "data.csv")
df.to_csv(save_path, index=False)

print(f"File disimpan di {save_path}")

In [3]:
# Names used to build the pipeline-specific paths below
PIPELINE_NAME = "indo-tele-pipeline"
SCHEMA_PIPELINE_NAME = "indo-tele-tfdv-schema"

# Directory where the artifacts produced by the pipeline are stored
PIPELINE_ROOT = os.path.join('pipelines', PIPELINE_NAME)

# Path to a SQLite DB file to use as an MLMD storage
METADATA_PATH = os.path.join('metadata', PIPELINE_NAME, 'metadata.db')

# Output directory where created models from the pipeline will be exported
SERVING_MODEL_DIR = os.path.join('serving_model', PIPELINE_NAME)

# Folder holding the input CSV (the same "data" folder the download cell writes to)
DATA_ROOT = "data"

# Folder that receives the module files written by the %%writefile cells
os.makedirs("module", exist_ok=True)

In [None]:
# Create the interactive TFX context, rooted at PIPELINE_ROOT, that runs every component below.
interactive_context = InteractiveContext(pipeline_root = PIPELINE_ROOT)

## Data Ingestion

In [5]:
# Split the ingested examples into 'train' and 'eval' using 8 and 2 hash
# buckets respectively (an 8:2 bucket ratio).
output = example_gen_pb2.Output(
    split_config = example_gen_pb2.SplitConfig(splits=[
        example_gen_pb2.SplitConfig.Split(name='train', hash_buckets=8),
        example_gen_pb2.SplitConfig.Split(name='eval', hash_buckets=2)
    ])
)

# Ingest the CSV data found under DATA_ROOT with the split configuration above.
example_gen = CsvExampleGen(input_base=DATA_ROOT, output_config=output)

In [None]:
# Execute the ingestion component inside the interactive context.
interactive_context.run(example_gen)

## Data Validation
### 1. Summary Statistics

In [None]:
# Compute statistics over the examples produced by example_gen.
statistics_gen = StatisticsGen(
    examples = example_gen.outputs['examples']
)

interactive_context.run(statistics_gen)

In [None]:
# Display the generated statistics artifact.
interactive_context.show(statistics_gen.outputs["statistics"])

### 2. Data Schema

In [None]:
# Generate a schema from the statistics computed by statistics_gen.
schema_gen = SchemaGen(
    statistics = statistics_gen.outputs["statistics"]
)
interactive_context.run(schema_gen)

In [None]:
# Display the generated schema artifact.
interactive_context.show(schema_gen.outputs['schema'])

### 3. Identifikasi Anomali

In [None]:
# Validate the statistics against the schema; the result is exposed as the
# component's "anomalies" output.
example_validator = ExampleValidator(
    statistics = statistics_gen.outputs["statistics"],
    schema = schema_gen.outputs["schema"]
)
interactive_context.run(example_validator)

In [None]:
# Display the anomalies artifact produced by example_validator.
interactive_context.show(example_validator.outputs["anomalies"])

## Data Preprocessing

In [13]:
# Path of the module file that the following %%writefile cell creates and
# that the Transform component loads.
TRANSFORM_MODULE_FILE = 'module/data_transform.py'

In [None]:
%%writefile {TRANSFORM_MODULE_FILE}

import tensorflow as tf
import tensorflow_transform as tft

LABEL_KEY = 'label'
FEATURE_KEY = 'text'


def transformed_name(key):
    """Return the name a feature carries after transformation.

    Args:
        key: original feature name.

    Returns:
        ``key`` with the ``_xf`` suffix appended.
    """
    suffix = "_xf"
    return key + suffix

def preprocessing_fn(inputs):
    """Turn the raw features into the transformed features used downstream.

    Args:
        inputs: mapping from raw feature name to tensor; the FEATURE_KEY and
            LABEL_KEY entries are read.

    Returns:
        dict with the lower-cased text stored under
        ``transformed_name(FEATURE_KEY)`` and the label cast to int64 stored
        under ``transformed_name(LABEL_KEY)``.
    """
    lowered_text = tf.strings.lower(inputs[FEATURE_KEY])
    label_as_int64 = tf.cast(inputs[LABEL_KEY], tf.int64)

    return {
        transformed_name(FEATURE_KEY): lowered_text,
        transformed_name(LABEL_KEY): label_as_int64,
    }

In [None]:
# Run the Transform component on the ingested examples, using the generated
# schema and the preprocessing code stored in TRANSFORM_MODULE_FILE.
transform = Transform(
    examples = example_gen.outputs['examples'],
    schema = schema_gen.outputs['schema'],
    module_file = TRANSFORM_MODULE_FILE,
)

interactive_context.run(transform)

## Pengembangan Model

### Tuner

In [16]:
# Path of the module file that the following %%writefile cell creates and
# that the Tuner component loads.
TUNER_MODULE_FILE = "module/tuner.py"

In [None]:
%%writefile {TUNER_MODULE_FILE}

from typing import Any, Dict, NamedTuple, Text
import keras_tuner as kt
import tensorflow as tf
import tensorflow_transform as tft
from keras_tuner.engine import base_tuner
from tensorflow.keras import layers
from tfx.components.trainer.fn_args_utils import FnArgs
from tensorflow.keras.layers import LeakyReLU

# Feature names before the "_xf" suffix is added by transformed_name()
LABEL_KEY = "label"
FEATURE_KEY = "text"
# Input dimension of the Embedding layer built in model_builder
VOCAB_SIZE = 10000
# Used as Hyperband max_epochs, as num_epochs for input_fn, and as the extra
# repeat count applied inside input_fn
epochs = 5

# Return type of tuner_fn: a tuner together with a dict of fit keyword arguments
TunerFnResult = NamedTuple(
    "TunerFnResult", [("tuner", base_tuner.BaseTuner), ("fit_kwargs", Dict[Text, Any])]
)

def transformed_name(key):
    """Return the name a feature carries after transformation.

    Args:
        key: original feature name.

    Returns:
        ``key`` with the ``_xf`` suffix appended.
    """
    suffix = "_xf"
    return key + suffix

def gzip_reader_fn(filenames):
    """Open GZIP-compressed TFRecord files as a dataset.

    Args:
        filenames: file name(s) forwarded to ``tf.data.TFRecordDataset``.

    Returns:
        A ``tf.data.TFRecordDataset`` created with ``compression_type='GZIP'``.
    """
    records = tf.data.TFRecordDataset(filenames, compression_type='GZIP')
    return records

def input_fn(
    file_pattern,
    tf_transform_output,
    num_epochs,
    batch_size=64) -> tf.data.Dataset:
    """Build a batched (features, label) dataset from transformed examples.

    Args:
        file_pattern: file(s) or pattern(s) of the GZIP TFRecord files to read.
        tf_transform_output: object whose ``transformed_feature_spec()``
            describes the transformed features.
        num_epochs: value forwarded as ``num_epochs`` to
            ``make_batched_features_dataset``.
        batch_size: number of examples per batch.

    Returns:
        The batched dataset, with the transformed label split off as the label.
        Note that the result is additionally repeated ``epochs`` times (the
        module-level constant), so the data is seen ``num_epochs * epochs``
        times in total when ``num_epochs`` is an integer.
    """
    # Work on a copy of the post-transform feature spec.
    feature_spec = tf_transform_output.transformed_feature_spec().copy()
    label_name = transformed_name(LABEL_KEY)

    batched = tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        features=feature_spec,
        reader=gzip_reader_fn,
        num_epochs=num_epochs,
        label_key=label_name,
    )

    # Second level of repetition, driven by the module-level `epochs`.
    return batched.repeat(epochs)
 

def model_builder(hp, vectorizer):
    """Build and compile the text classifier for one tuning trial.

    Args:
        hp: hyperparameter container; the ``num_layer``, ``embed_dim``,
            ``fc_layer`` and ``lr`` hyperparameters are declared on it.
        vectorizer: layer applied to the string input before the embedding.

    Returns:
        A compiled ``tf.keras.Model`` with a 3-unit softmax output.
    """
    # Search space explored by the tuner.
    hidden_layer_count = hp.Int("num_layer", min_value=1, max_value=5, step=1)
    embedding_width = hp.Int("embed_dim", min_value=16, max_value=128, step=32)
    hidden_units = hp.Int("fc_layer", min_value=32, max_value=128, step=16)
    learning_rate = hp.Choice("lr", values=[1e-2, 1e-3, 1e-4])

    # The input is named after the transformed text feature so that the
    # feature dict produced by input_fn can be fed to the model directly.
    text_input = tf.keras.Input(
        shape=(1,),
        name=transformed_name(FEATURE_KEY),
        dtype=tf.string,
    )

    hidden = vectorizer(text_input)
    hidden = layers.Embedding(VOCAB_SIZE, embedding_width, name="embedding")(hidden)
    hidden = layers.GlobalAveragePooling1D()(hidden)
    hidden = layers.Dropout(0.2)(hidden)

    # Stack of identical fully connected layers.
    for _layer_index in range(hidden_layer_count):
        hidden = layers.Dense(hidden_units, activation='relu')(hidden)

    hidden = layers.Dropout(0.2)(hidden)
    class_probabilities = layers.Dense(3, activation='softmax')(hidden)

    model = tf.keras.Model(inputs=text_input, outputs=class_probabilities)
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    model.summary()
    return model

def tuner_fn(fn_args: FnArgs) -> TunerFnResult:
    """Create the Hyperband tuner and the fit arguments for the search.

    Args:
        fn_args: arguments supplied by the TFX Tuner component. This function
            reads ``transform_graph_path``, ``train_files``, ``eval_files``,
            ``working_dir``, ``train_steps`` and ``eval_steps``.

    Returns:
        A ``TunerFnResult`` holding the tuner and the keyword arguments
        (callbacks, datasets and step counts) to fit with.
    """
    # Build the training and validation datasets from the preprocessed examples.
    # The complete file-pattern lists are passed, as run_fn in the trainer
    # module does, instead of only their first entry.
    tf_transform_output = tft.TFTransformOutput(fn_args.transform_graph_path)
    train_set = input_fn(fn_args.train_files, tf_transform_output, num_epochs=epochs)
    val_set = input_fn(fn_args.eval_files, tf_transform_output, num_epochs=epochs)

    # The vocabulary size must match the Embedding input dimension used in
    # model_builder (VOCAB_SIZE) and the vectorizer the trainer module builds;
    # otherwise the hyperparameters are tuned for a different model than the
    # one that is finally trained.
    vectorize_layer = layers.TextVectorization(
        standardize="lower_and_strip_punctuation",
        max_tokens=VOCAB_SIZE,
        output_mode='int',
        output_sequence_length=100,
    )

    # Learn the vocabulary from the transformed text feature only (labels dropped).
    vectorize_layer.adapt(train_set.map(lambda x, _: x[transformed_name(FEATURE_KEY)]))

    # Early-stopping callback.
    # NOTE(review): patience (15) is larger than max_epochs (epochs = 5), so as
    # configured this callback cannot stop a trial early; lower it if early
    # stopping is actually wanted.
    stop_early = tf.keras.callbacks.EarlyStopping(
        monitor='val_sparse_categorical_accuracy',
        mode='max',
        verbose=1,
        patience=15,
    )

    # Hyperparameter search strategy: Hyperband, maximising validation accuracy.
    tuner = kt.Hyperband(
        lambda hp: model_builder(hp, vectorize_layer),
        objective='val_sparse_categorical_accuracy',
        max_epochs=epochs,
        factor=3,
        directory=fn_args.working_dir,
        project_name='kt_hyperband',
    )

    return TunerFnResult(
        tuner=tuner,
        fit_kwargs={
            "callbacks": [stop_early],
            'x': train_set,
            'validation_data': val_set,
            'steps_per_epoch': fn_args.train_steps,
            'validation_steps': fn_args.eval_steps,
        },
    )


In [None]:
from tfx.components import Tuner
from tfx.proto import trainer_pb2

# Run hyperparameter tuning with the tuner_fn defined in TUNER_MODULE_FILE on
# the transformed examples.
# tuner_fn uses fn_args.train_steps / fn_args.eval_steps as steps_per_epoch /
# validation_steps; presumably those come from the num_steps values set here
# (TODO confirm).
tuner = Tuner(
    module_file = TUNER_MODULE_FILE,
    examples = transform.outputs['transformed_examples'],
    transform_graph = transform.outputs['transform_graph'],
    schema = schema_gen.outputs['schema'],
    train_args = trainer_pb2.TrainArgs(splits=['train'], num_steps=500),
    eval_args = trainer_pb2.EvalArgs(splits=['eval'], num_steps=100)
)

interactive_context.run(tuner)

### Trainer

In [19]:
# Path of the module file that the following %%writefile cell creates and
# that the Trainer component loads.
TRAINER_MODULE_FILE = "module/indo_tele_sentiment.py"

In [None]:
%%writefile {TRAINER_MODULE_FILE}

import os
import tensorflow as tf
import tensorflow_hub as hub
# import tensorflow_text as text
import tensorflow_transform as tft
from tensorflow.keras import layers
from tfx.components.trainer.fn_args_utils import FnArgs

LABEL_KEY = "label"
FEATURE_KEY = "text"


def transformed_name(key):
    """Return the name a feature carries after transformation.

    Args:
        key: original feature name.

    Returns:
        ``key`` with the ``_xf`` suffix appended.
    """
    suffix = "_xf"
    return key + suffix

def gzip_reader_fn(filenames):
    """Open GZIP-compressed TFRecord files as a dataset.

    Args:
        filenames: file name(s) forwarded to ``tf.data.TFRecordDataset``.

    Returns:
        A ``tf.data.TFRecordDataset`` created with ``compression_type='GZIP'``.
    """
    records = tf.data.TFRecordDataset(filenames, compression_type='GZIP')
    return records

def input_fn(
    file_pattern,
    tf_transform_output,
    num_epochs,
    batch_size=64) -> tf.data.Dataset:
    """Build a batched (features, label) dataset from transformed examples.

    Args:
        file_pattern: file(s) or pattern(s) of the GZIP TFRecord files to read.
        tf_transform_output: object whose ``transformed_feature_spec()``
            describes the transformed features.
        num_epochs: value forwarded as ``num_epochs`` to
            ``make_batched_features_dataset``.
        batch_size: number of examples per batch.

    Returns:
        The batched dataset, with the transformed label split off as the label.
    """
    # Work on a copy of the post-transform feature spec.
    feature_spec = tf_transform_output.transformed_feature_spec().copy()
    label_name = transformed_name(LABEL_KEY)

    # Create batches of data.
    return tf.data.experimental.make_batched_features_dataset(
        file_pattern=file_pattern,
        batch_size=batch_size,
        features=feature_spec,
        reader=gzip_reader_fn,
        num_epochs=num_epochs,
        label_key=label_name,
    )
 
# Vocab size and number of words in a sequence (used for the TextVectorization
# layer in run_fn; VOCAB_SIZE is also the Embedding input dimension).
VOCAB_SIZE = 10000
SEQUENCE_LENGTH = 100
# NOTE(review): model_builder takes the embedding width from
# hp.get("embed_dim"), not from this constant — confirm it is still needed.
embed_dim = 16
# Module-level epoch count.
epochs = 25

def model_builder(hp, vectorizer):
    """Build and compile the text classifier from tuned hyperparameters.

    Args:
        hp: mapping queried with ``.get`` for ``embed_dim``, ``num_layer``,
            ``fc_layer`` and ``lr``.
        vectorizer: layer applied to the string input before the embedding.

    Returns:
        A compiled ``tf.keras.Model`` with a 3-unit softmax output.
    """
    # NOTE(review): the model_builder in the tuner module applies an extra
    # Dropout(0.2) right after the pooling layer; confirm whether the
    # difference between the two architectures is intended.
    embedding_width = hp.get("embed_dim")
    hidden_layer_count = hp.get("num_layer")
    hidden_units = hp.get("fc_layer")
    learning_rate = hp.get("lr")

    # The input is named after the transformed text feature so that the
    # feature dict produced by input_fn can be fed to the model directly.
    text_input = tf.keras.Input(
        shape=(1,),
        name=transformed_name(FEATURE_KEY),
        dtype=tf.string,
    )

    hidden = vectorizer(text_input)
    hidden = layers.Embedding(VOCAB_SIZE, embedding_width, name="embedding")(hidden)
    hidden = layers.GlobalAveragePooling1D()(hidden)

    # Stack of identical fully connected layers.
    for _layer_index in range(hidden_layer_count):
        hidden = layers.Dense(hidden_units, activation='relu')(hidden)

    hidden = layers.Dropout(0.2)(hidden)
    class_probabilities = layers.Dense(3, activation='softmax')(hidden)

    model = tf.keras.Model(inputs=text_input, outputs=class_probabilities)
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()],
    )

    model.summary()
    return model


def _get_serve_tf_examples_fn(model, tf_transform_output):
    """Create the function that scores serialized tf.Example records.

    Args:
        model: Keras model to serve; the transform layer is attached to it as
            ``model.tft_layer``.
        tf_transform_output: object providing ``transform_features_layer()``
            and ``raw_feature_spec()``.

    Returns:
        A ``tf.function`` that parses the serialized examples, applies the
        transform layer and returns the model output.
    """
    model.tft_layer = tf_transform_output.transform_features_layer()

    @tf.function
    def serve_tf_examples_fn(serialized_tf_examples):
        # The label is removed from the spec so that it is not parsed from
        # the incoming examples.
        raw_spec = tf_transform_output.raw_feature_spec()
        raw_spec.pop(LABEL_KEY)

        raw_features = tf.io.parse_example(serialized_tf_examples, raw_spec)

        # Apply the same preprocessing as in training, then predict.
        return model(model.tft_layer(raw_features))

    return serve_tf_examples_fn

def run_fn(fn_args: FnArgs) -> None:
    """Train the sentiment classifier and export it with a serving signature.

    Reads the tuned hyperparameters from ``fn_args.hyperparameters["values"]``,
    adapts a TextVectorization layer on the training text, fits the model and
    saves it to ``fn_args.serving_model_dir``.

    Args:
        fn_args: arguments supplied by the TFX Trainer component. This function
            reads ``serving_model_dir``, ``hyperparameters``,
            ``transform_graph_path``, ``train_files``, ``eval_files``,
            ``train_steps`` and ``eval_steps``.
    """
    log_dir = os.path.join(os.path.dirname(fn_args.serving_model_dir), 'logs')

    tensorboard_callback = tf.keras.callbacks.TensorBoard(
        log_dir=log_dir, update_freq='batch'
    )

    es = tf.keras.callbacks.EarlyStopping(
        monitor='val_sparse_categorical_accuracy',
        mode='max',
        verbose=1,
        patience=10,
    )
    # NOTE(review): model.save() at the end of this function writes to the same
    # directory, so the checkpointed model is replaced by the model as it
    # stands after fit() — confirm this is intended.
    mc = tf.keras.callbacks.ModelCheckpoint(
        fn_args.serving_model_dir,
        monitor='val_sparse_categorical_accuracy',
        mode='max',
        verbose=1,
        save_best_only=True,
    )
    hp = fn_args.hyperparameters.get("values")

    # "tuner/epochs" is recorded by KerasTuner's Hyperband. When it is missing
    # (e.g. hyperparameters produced by another tuner) fall back to the
    # module-level `epochs`: a None here would make input_fn build an endless
    # dataset and adapt() below would never return.
    num_epochs = hp.get("tuner/epochs") or epochs

    # Use the step counts configured on the component when they are set;
    # otherwise keep the previous fixed value of 100.
    train_steps = fn_args.train_steps or 100
    eval_steps = fn_args.eval_steps or 100

    # Load the transform output
    tf_transform_output = tft.TFTransformOutput(fn_args.transform_graph_path)

    # Create batches of data
    train_set = input_fn(fn_args.train_files, tf_transform_output, num_epochs)
    val_set = input_fn(fn_args.eval_files, tf_transform_output, num_epochs)

    vectorize_layer = layers.TextVectorization(
        standardize="lower_and_strip_punctuation",
        max_tokens=VOCAB_SIZE,
        output_mode='int',
        output_sequence_length=SEQUENCE_LENGTH,
    )
    # Learn the vocabulary from the transformed text feature only (labels
    # dropped); train_set is finite here, so adapt() terminates.
    vectorize_layer.adapt(train_set.map(lambda x, _: x[transformed_name(FEATURE_KEY)]))

    # Build Model
    model = model_builder(hp, vectorize_layer)

    # Train the model. The datasets are repeated indefinitely, so the length
    # of an epoch is set by the step counts rather than by the data size.
    model.fit(
        x=train_set.repeat(),
        validation_data=val_set.repeat(),
        callbacks=[tensorboard_callback, es, mc],
        steps_per_epoch=train_steps,
        validation_steps=eval_steps,
        epochs=num_epochs,
    )

    # The serving signature accepts a batch of serialized tf.Example strings.
    signatures = {
        'serving_default': _get_serve_tf_examples_fn(model, tf_transform_output).get_concrete_function(
            tf.TensorSpec(
                shape=[None],
                dtype=tf.string,
                name='examples',
            )
        )
    }

    model.save(fn_args.serving_model_dir, save_format='tf', signatures=signatures)


In [None]:
from tfx.proto import trainer_pb2

# Train the model with the run_fn defined in TRAINER_MODULE_FILE, on the
# transformed examples and with the best hyperparameters found by the tuner.
# No num_steps is set on train_args / eval_args here.
trainer = Trainer(
    module_file = os.path.abspath(TRAINER_MODULE_FILE),
    examples = transform.outputs['transformed_examples'],
    transform_graph = transform.outputs['transform_graph'],
    schema = schema_gen.outputs['schema'],
    hyperparameters= tuner.outputs['best_hyperparameters'],
    train_args = trainer_pb2.TrainArgs(splits=['train']),
    eval_args = trainer_pb2.EvalArgs(splits=['eval'])
)
interactive_context.run(trainer)

## Analisis dan Validasi Model 

In [None]:
# Melakukan analisis dan validasi dengan menyediakan baseline model menggunakan Resolver

from tfx.dsl.components.common.resolver import Resolver
from tfx.dsl.input_resolution.strategies.latest_blessed_model_strategy import LatestBlessedModelStrategy
from tfx.types import Channel
from tfx.types.standard_artifacts import Model, ModelBlessing

# Resolve a previously blessed model (LatestBlessedModelStrategy); its 'model'
# output is passed to the Evaluator as the baseline model.
model_resolver = Resolver(
    strategy_class = LatestBlessedModelStrategy,
    model = Channel(type=Model),
    model_blessing = Channel(type=ModelBlessing)
).with_id('Latest_blessed_model_resolver')

interactive_context.run(model_resolver)

In [20]:
# Konfigurasi untuk mengevaluasi model

import tensorflow_model_analysis as tfma

# Evaluation configuration: which label to compare against, how to slice the
# data, and which metrics (with thresholds) to compute.
# NOTE(review): the trainer module builds a 3-unit softmax classifier while the
# metrics below are binary-classification metrics; confirm they are computed
# as intended for this label.
eval_config = tfma.EvalConfig(
    model_specs = [tfma.ModelSpec(label_key='label')],
    # A single SlicingSpec with no slicing keys configured.
    slicing_specs = [tfma.SlicingSpec()],
    metrics_specs = [
        tfma.MetricsSpec(metrics=[
            tfma.MetricConfig(class_name='ExampleCount'),
            tfma.MetricConfig(class_name='AUC'),
            # All four confusion-matrix counts. TruePositives used to be
            # listed twice, leaving TrueNegatives out.
            tfma.MetricConfig(class_name='FalsePositives'),
            tfma.MetricConfig(class_name='TruePositives'),
            tfma.MetricConfig(class_name='FalseNegatives'),
            tfma.MetricConfig(class_name='TrueNegatives'),
            # Accuracy carries the thresholds: a lower bound of 0.5 on the
            # value itself, and a higher-is-better change threshold of 0.0001
            # (absolute).
            tfma.MetricConfig(class_name='BinaryAccuracy',
                              threshold = tfma.MetricThreshold(
                                  value_threshold = tfma.GenericValueThreshold(
                                      lower_bound = {'value':0.5}
                                  ),
                                  change_threshold = tfma.GenericChangeThreshold(
                                      direction = tfma.MetricDirection.HIGHER_IS_BETTER,
                                      absolute={'value': 0.0001}
                                  )
                              ))
        ])
    ]
)

In [None]:
from tfx.components import Evaluator
# Evaluate the trained model on the ingested examples against the baseline
# model resolved above, using eval_config.
evaluator = Evaluator(
    examples = example_gen.outputs['examples'],
    model = trainer.outputs['model'],
    baseline_model = model_resolver.outputs['model'],
    eval_config = eval_config
)

interactive_context.run(evaluator)

In [None]:
# Visualise the evaluation result: load it from the URI of the evaluator's
# first 'evaluation' artifact, then render the slicing metrics and the
# fairness indicator widget.
eval_result = evaluator.outputs['evaluation'].get()[0].uri
tfma_result = tfma.load_eval_result(eval_result)
tfma.view.render_slicing_metrics(tfma_result)
tfma.addons.fairness.view.widget_view.render_fairness_indicator(
    tfma_result
)