Added lightgbm integration #544

Merged
11 commits merged on Apr 27, 2022
1 change: 1 addition & 0 deletions docs/book/features/integrations.md
Expand Up @@ -41,6 +41,7 @@ These are the third-party integrations that ZenML currently supports:
| Hugging Face | ✅ | Materializer | Use Hugging Face tokenizers, datasets and models. | [huggingface](https://github.com/zenml-io/zenml/tree/main/examples/huggingface) |
| KServe | ⛏ | Inference | Looking for community implementors. | |
| Kubeflow | ✅ | Orchestrator | Either full Kubeflow or Kubeflow Pipelines. Works for local environments currently. | [kubeflow](https://github.com/zenml-io/zenml/tree/main/examples/kubeflow) |
| LightGBM | ✅ | Training | Support for `Booster` and `Dataset` materialization. | [lightgbm](https://github.com/zenml-io/zenml/tree/main/examples/lightgbm) |
| MLflow Tracking | ✅ | Experiment Tracking | Tracks your pipelines and your training runs. | [mlflow](https://github.com/zenml-io/zenml/tree/main/examples/mlflow_tracking) |
| MLflow Deployment | ✅ | Deployment | Deploys models with the MLflow scoring server. | [mlflow](https://github.com/zenml-io/zenml/tree/main/examples/mlflow_deployment) |
| NeuralProphet | ✅ | Training | Enables materializing NeuralProphet models. | [neural_prophet](https://github.com/zenml-io/zenml/tree/main/examples/neural_prophet) |
Expand Down
57 changes: 57 additions & 0 deletions examples/lightgbm/README.md
@@ -0,0 +1,57 @@
# Gradient Boosting with LightGBM and ZenML

[LightGBM](https://lightgbm.readthedocs.io/en/latest/) is a gradient boosting framework that uses tree-based learning algorithms. It is designed to be distributed and efficient with the following advantages:

- Faster training speed and higher efficiency.
- Lower memory usage.
- Better accuracy.
- Support for parallel, distributed, and GPU learning.
- Capable of handling large-scale data.

This example showcases how to train a `lightgbm.Booster` model in a ZenML pipeline. The ZenML LightGBM integration includes a custom materializer that saves and loads the trained `lightgbm.Booster` model to and from the artifact store, as well as a materializer for the `lightgbm.Dataset` data object.
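
As a minimal sketch of what the integration enables (this snippet is illustrative only and not part of the example; the step name and hyperparameters are placeholders), a step can simply return a `lgb.Booster` and ZenML persists it to the artifact store via the integration's materializer:

```python
import lightgbm as lgb
import numpy as np

from zenml.steps import step


@step
def train_booster() -> lgb.Booster:
    """Trains a tiny booster on random data; the returned model is saved
    and loaded by the LightGBM Booster materializer."""
    data = lgb.Dataset(np.random.rand(100, 4), label=np.random.rand(100))
    return lgb.train({"objective": "regression"}, data, num_boost_round=5)
```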

The data used in this example is the LightGBM quickstart data, as used in the [simple Python example of the LightGBM repository](https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py).

## Run it locally

### Pre-requisites
In order to run this example, you need to install and initialize ZenML:

```shell
# install CLI
pip install zenml

# install ZenML integrations
zenml integration install lightgbm -f

# pull example
zenml example pull lightgbm
cd zenml_examples/lightgbm

# initialize
zenml init
```

### Run the project
Now we're ready. Execute:

```shell
python run.py
```

### Clean up
In order to clean up, delete the remaining ZenML references.

```shell
rm -rf zenml_examples
```

## SuperQuick `lightgbm` run

If you're really in a hurry and just want to see this example pipeline run,
without fiddling around with the individual installation and configuration
steps, just run the following:

```shell
zenml example run lightgbm
```
13 changes: 13 additions & 0 deletions examples/lightgbm/__init__.py
@@ -0,0 +1,13 @@
# Copyright (c) ZenML GmbH 2022. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
118 changes: 118 additions & 0 deletions examples/lightgbm/run.py
@@ -0,0 +1,118 @@
# Copyright (c) ZenML GmbH 2022. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
import tempfile

import lightgbm as lgb
import numpy as np
import pandas as pd
import requests

from zenml.integrations.constants import LIGHTGBM
from zenml.logger import get_logger
from zenml.pipelines import pipeline
from zenml.steps import BaseStepConfig, Output, step

logger = get_logger(__name__)

TRAIN_SET_RAW = "https://raw.githubusercontent.com/microsoft/LightGBM/master/examples/regression/regression.train"
TEST_SET_RAW = "https://raw.githubusercontent.com/microsoft/LightGBM/master/examples/regression/regression.test"


class LightGBMConfig(BaseStepConfig):
boosting_type: str = "gbdt"
objective: str = "regression"
num_leaves: int = 31
learning_rate: float = 0.05
feature_fraction: float = 0.9
bagging_fraction: float = 0.8
bagging_freq: int = 5
verbose: int = 0


@step
def data_loader() -> Output(mat_train=lgb.Dataset, mat_test=lgb.Dataset):
"""Retrieves the data from the demo directory of the LightGBM repo."""
# Write data to temporary files to load it with `lgb.Dataset`.
with tempfile.NamedTemporaryFile(
mode="w", delete=False, suffix=".html", encoding="utf-8"
) as f:
f.write(requests.get(TRAIN_SET_RAW).text)
df_train = pd.read_csv(f.name, header=None, sep="\t")

with tempfile.NamedTemporaryFile(
mode="w", delete=False, suffix=".html", encoding="utf-8"
) as f:
f.write(requests.get(TEST_SET_RAW).text)
df_test = pd.read_csv(f.name, header=None, sep="\t")

# Parse data
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# create dataset for lightgbm
mat_train = lgb.Dataset(X_train, y_train)
mat_test = lgb.Dataset(X_test, y_test, reference=mat_train)
return mat_train, mat_test


@step
def trainer(
config: LightGBMConfig, mat_train: lgb.Dataset, mat_test: lgb.Dataset
) -> lgb.Booster:
"""Trains a LightGBM model on the data."""
params = {
"boosting_type": config.boosting_type,
"objective": config.objective,
"num_leaves": config.num_leaves,
"learning_rate": config.learning_rate,
"feature_fraction": config.feature_fraction,
"bagging_fraction": config.bagging_fraction,
"bagging_freq": config.bagging_freq,
"verbose": config.verbose,
}
gbm = lgb.train(
params,
mat_train,
num_boost_round=20,
valid_sets=mat_test,
callbacks=[lgb.early_stopping(stopping_rounds=5)],
)
return gbm


@step
def predictor(model: lgb.Booster, mat: lgb.Dataset) -> np.ndarray:
"""Makes predictions on a trained LightGBM booster model."""
return model.predict(np.random.rand(7, 28))


@pipeline(enable_cache=False, required_integrations=[LIGHTGBM])
def lgbm_pipeline(
data_loader,
trainer,
predictor,
):
"""Links all the steps together in a pipeline"""
mat_train, mat_test = data_loader()
model = trainer(mat_train, mat_test)
predictor(model, mat_train)


if __name__ == "__main__":

pipeline = lgbm_pipeline(
data_loader=data_loader(), trainer=trainer(), predictor=predictor()
)
pipeline.run()
11 changes: 11 additions & 0 deletions examples/lightgbm/setup.sh
@@ -0,0 +1,11 @@
#!/usr/bin/env bash

set -Eeo pipefail

pre_run () {
zenml integration install lightgbm
}

pre_run_forced () {
zenml integration install lightgbm -f
}
1 change: 1 addition & 0 deletions pyproject.toml
Expand Up @@ -229,6 +229,7 @@ module = [
"google.*",
"neuralprophet.*",
"wandb.*",
"lightgbm.*",
]
ignore_missing_imports = true

Expand Down
1 change: 1 addition & 0 deletions src/zenml/integrations/__init__.py
Expand Up @@ -29,6 +29,7 @@
from zenml.integrations.graphviz import GraphvizIntegration # noqa
from zenml.integrations.huggingface import HuggingfaceIntegration # noqa
from zenml.integrations.kubeflow import KubeflowIntegration # noqa
from zenml.integrations.lightgbm import LightGBMIntegration # noqa
from zenml.integrations.mlflow import MlflowIntegration # noqa
from zenml.integrations.neural_prophet import NeuralProphetIntegration # noqa
from zenml.integrations.plotly import PlotlyIntegration # noqa
Expand Down
1 change: 1 addition & 0 deletions src/zenml/integrations/constants.py
Expand Up @@ -39,3 +39,4 @@
NEURAL_PROPHET = "neural_prophet"
HUGGINGFACE = "huggingface"
XGBOOST = "xgboost"
LIGHTGBM = "lightgbm"
30 changes: 30 additions & 0 deletions src/zenml/integrations/lightgbm/__init__.py
@@ -0,0 +1,30 @@
# Copyright (c) ZenML GmbH 2021. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
from zenml.integrations.constants import LIGHTGBM
from zenml.integrations.integration import Integration


class LightGBMIntegration(Integration):
"""Definition of lightgbm integration for ZenML."""

NAME = LIGHTGBM
REQUIREMENTS = ["lightgbm>=1.0.0"]

@classmethod
def activate(cls) -> None:
"""Activates the integration."""
from zenml.integrations.lightgbm import materializers # noqa


LightGBMIntegration.check_installation()
19 changes: 19 additions & 0 deletions src/zenml/integrations/lightgbm/materializers/__init__.py
@@ -0,0 +1,19 @@
# Copyright (c) ZenML GmbH 2021. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
from zenml.integrations.lightgbm.materializers.lightgbm_booster_materializer import ( # noqa
LightGBMBoosterMaterializer,
)
from zenml.integrations.lightgbm.materializers.lightgbm_dataset_materializer import ( # noqa
LightGBMDatasetMaterializer,
)
67 changes: 67 additions & 0 deletions src/zenml/integrations/lightgbm/materializers/lightgbm_booster_materializer.py
@@ -0,0 +1,67 @@
# Copyright (c) ZenML GmbH 2021. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at:
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.

import os
import tempfile
from typing import Any, Type

import lightgbm as lgb

from zenml.artifacts import ModelArtifact
from zenml.io import fileio
from zenml.materializers.base_materializer import BaseMaterializer

DEFAULT_FILENAME = "model.txt"


class LightGBMBoosterMaterializer(BaseMaterializer):
"""Materializer to read data to and from lightgbm.Booster."""

ASSOCIATED_TYPES = (lgb.Booster,)
ASSOCIATED_ARTIFACT_TYPES = (ModelArtifact,)

def handle_input(self, data_type: Type[Any]) -> lgb.Booster:
"""Reads a lightgbm Booster model from a serialized JSON file."""
super().handle_input(data_type)
filepath = os.path.join(self.artifact.uri, DEFAULT_FILENAME)

# Create a temporary folder
temp_dir = tempfile.mkdtemp(prefix="zenml-temp-")
temp_file = os.path.join(str(temp_dir), DEFAULT_FILENAME)

# Copy from artifact store to temporary file
fileio.copy(filepath, temp_file)
booster = lgb.Booster(model_file=temp_file)

# Cleanup and return
fileio.rmtree(temp_dir)
return booster

def handle_return(self, booster: lgb.Booster) -> None:
"""Creates a JSON serialization for a lightgbm Booster model.

Args:
booster: A lightgbm Booster model.
"""
super().handle_return(booster)

filepath = os.path.join(self.artifact.uri, DEFAULT_FILENAME)

        # Write the booster to a temporary file
with tempfile.NamedTemporaryFile(
mode="w", suffix=".txt", delete=True
) as f:
booster.save_model(f.name)
# Copy it into artifact store
fileio.copy(f.name, filepath)