In [None]:
import os
import pandas as pd
import tensorflow as tf
from absl import logging
from modules import components, pipeline
from modules.utils import merge_dataset
from tfx.orchestration.beam.beam_dag_runner import BeamDagRunner

In [None]:
# Raw input CSVs handed to the project helper `merge_dataset`.
# NOTE(review): presumably the merged result lands under "data/merge"
# (the DATA_ROOT used by the pipeline) — confirm in modules/utils.
movies_csv = "data/movies/movies.csv"
ratings_csv = "data/ratings/ratings.csv"
merge_dataset(movies_csv, ratings_csv)

In [None]:
# --- Pipeline identity -------------------------------------------------
PIPELINE_NAME = "cbf_pipeline"

# --- Inputs: data location and the user-module files -------------------
DATA_ROOT = "data/merge"
TRANSFORM_MODULE_FILE = "modules/cbf_transform.py"
TUNER_MODULE_FILE = "modules/cbf_tuner.py"
TRAINER_MODULE_FILE = "modules/cbf_trainer.py"

# --- Outputs: everything is written below OUTPUT_BASE ------------------
OUTPUT_BASE = "outputs"

# Per-pipeline working directory, and the SQLite metadata file inside it.
pipeline_root = os.path.join(OUTPUT_BASE, PIPELINE_NAME)
metadata_path = os.path.join(OUTPUT_BASE, PIPELINE_NAME, "metadata.sqlite")

# Base directory for exported serving models.
serving_model_dir = os.path.join(OUTPUT_BASE, "serving_model")


In [None]:
# Training settings forwarded to the component factory.
TRAIN_STEPS = 1000
EVAL_STEPS = 500
EPOCHS = 5

# Bind the results to fresh names rather than `components` / `pipeline`.
# Reusing those names would overwrite the imported `modules.components` and
# `modules.pipeline` modules, so a second execution of this cell would call
# `.init_components` / `.init_pipeline` on the previous results instead of
# on the modules and most likely fail.
pipeline_components = components.init_components(
    data_dir=DATA_ROOT,
    transform_module=TRANSFORM_MODULE_FILE,
    tuner_module=TUNER_MODULE_FILE,
    trainer_module=TRAINER_MODULE_FILE,
    train_steps=TRAIN_STEPS,
    eval_steps=EVAL_STEPS,
    # The model is exported to a "cbf_model" subfolder of the serving dir.
    serving_model_dir=os.path.join(serving_model_dir, "cbf_model"),
    epochs=EPOCHS,
)

cbf_pipeline = pipeline.init_pipeline(
    pipeline_root=pipeline_root,
    pipeline_name=PIPELINE_NAME,
    metadata_path=metadata_path,
    components=pipeline_components,
)

# Hand the assembled pipeline object to the Beam runner for execution.
BeamDagRunner().run(cbf_pipeline)