In [1]:
!pip install kfp[kubernetes]

Collecting protobuf<5,>=4.21.1 (from kfp[kubernetes])
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting kfp-kubernetes<2 (from kfp[kubernetes])
  Downloading kfp-kubernetes-1.5.0.tar.gz (16 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
Building wheels for collected packages: kfp-kubernetes
[33m  DEPRECATION: Building 'kfp-kubernetes' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'kfp-kubernetes'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
  Building wheel for kfp-kubernetes (setup.py) ... [?done
[?25h  Created wheel for kfp-kubernetes

In [2]:
import kfp 

from kfp.dsl import component, pipeline
from kfp import kubernetes

In [3]:
# This is the same component as the one below, but with some of the logic moved into the spark8t library
# @component(
#     base_image="docker.io/bikalpadhakalcanonical/charmed-spark:5",
#     packages_to_install=["pyspark==3.4.2"]
# )
# def spark_test_component() -> None:
#     import logging
#     from operator import add
#     from spark8t.session import SparkSession
         
#     def count_vowels(text: str) -> int:
#       count = 0
#       for char in text:
#         if char.lower() in "aeiou":
#           count += 1
#       return count

#     lines = """Canonical's Charmed Data Platform solution for Apache Spark runs Spark jobs on your Kubernetes cluster.
#     You can get started right away with MicroK8s - the mightiest tiny Kubernetes distro around! 
#     The spark-client snap simplifies the setup process to run Spark jobs against your Kubernetes cluster. 
#     Spark on Kubernetes is a complex environment with many moving parts.
#     Sometimes, small mistakes can take a lot of time to debug and figure out.
#     """
    
#     with SparkSession(app_name="CountVowels", namespace="admin", username="spark") as spark:
#         n = spark.sparkContext.parallelize(lines.splitlines(), 2).map(count_vowels).reduce(add)
#         logging.warning(f"The number of vowels in the string is {n}")


@component(
    base_image="ghcr.io/canonical/charmed-spark:3.5-22.04_edge",
)
def spark_test_component() -> None:
    """Count the vowels of a short sample text with a Spark job on Kubernetes.

    The Spark properties are looked up in the spark8t service-account registry
    for the account named by the ``SPARK_NAMESPACE`` and
    ``SPARK_SERVICE_ACCOUNT`` environment variables. The resulting count is
    logged at WARNING level.

    Raises:
        KeyError: If either environment variable is not set.
        RuntimeError: If the registry lookup yields no service account.
    """
    # Imports stay local to the function body so the component is
    # self-contained when it is executed inside the base image.
    import logging
    import os
    import pyspark
    import socket
    from lightkube import Client
    from operator import add
    from spark8t.services import K8sServiceAccountRegistry
    from spark8t.services import LightKube as LightKubeInterface

    def count_vowels(text: str) -> int:
        """Return how many characters of ``text`` are (case-insensitive) vowels."""
        return sum(1 for char in text if char.lower() in "aeiou")

    lines = """Canonical's Charmed Data Platform solution for Apache Spark runs Spark jobs on your Kubernetes cluster.
    You can get started right away with MicroK8s - the mightiest tiny Kubernetes distro around! 
    The spark-client snap simplifies the setup process to run Spark jobs against your Kubernetes cluster. 
    Spark on Kubernetes is a complex environment with many moving parts.
    Sometimes, small mistakes can take a lot of time to debug and figure out.
    """

    app_name = "CountVowels"
    SPARK_SERVICE_ACCOUNT = os.environ["SPARK_SERVICE_ACCOUNT"]
    SPARK_NAMESPACE = os.environ["SPARK_NAMESPACE"]

    # Address of this pod and of the Kubernetes API server it talks to.
    pod_ip = socket.gethostbyname(socket.gethostname())
    k8s_master = Client().config.cluster.server
    interface = LightKubeInterface(None, None)
    registry = K8sServiceAccountRegistry(interface)

    account_id = f"{SPARK_NAMESPACE}:{SPARK_SERVICE_ACCOUNT}"
    account = registry.get(account_id)
    if account is None:
        # Fail with a readable message instead of an AttributeError on None.
        raise RuntimeError(
            f"Spark service account '{account_id}' was not found in the registry"
        )

    # Registry properties plus the driver host, set to this pod's own address.
    spark_properties = account.configurations.props | {
        "spark.driver.host": pod_ip,
    }

    builder = (
        pyspark.sql.SparkSession.builder
        .appName(app_name)
        .master(f"k8s://{k8s_master}")
    )
    for conf, val in spark_properties.items():
        builder = builder.config(conf, val)
    session = builder.getOrCreate()

    try:
        n = (
            session.sparkContext
            .parallelize(lines.splitlines(), 2)
            .map(count_vowels)
            .reduce(add)
        )
        logging.warning(f"The number of vowels in the string is {n}")
    finally:
        # Always stop the session, whether the job succeeded or raised.
        session.stop()


In [4]:
@pipeline(name="spark-test-pipeline")
def spark_pipeline():
    # Single-step pipeline: run the Spark test component and label its pod.
    spark_task = spark_test_component()

    # Attach the access-spark-pipeline=true label to the step's pod.
    # NOTE(review): presumably something outside this notebook selects pods by
    # this label to grant them Spark access - confirm.
    kubernetes.add_pod_label(
        spark_task,
        label_key="access-spark-pipeline",
        label_value="true",
    )

    # Disabled, kept for reference: Istio sidecar port exclusions for the pod.
    # kubernetes.add_pod_annotation(
    #     spark_task,
    #     annotation_key='traffic.sidecar.istio.io/excludeInboundPorts',
    #     annotation_value='37371,6060',
    # )
    # kubernetes.add_pod_annotation(
    #     spark_task,
    #     annotation_key='traffic.sidecar.istio.io/excludeOutboundPorts',
    #     annotation_value='37371,6060',
    # )

In [5]:
# Client for the Kubeflow Pipelines API, created with default settings.
client = kfp.Client()

# Write the compiled pipeline definition to a YAML file.
kfp.compiler.Compiler().compile(
    pipeline_func=spark_pipeline,
    package_path="spark_test_pipeline.yaml",
)

# Submit a run of the pipeline with no arguments and with caching disabled.
run = client.create_run_from_pipeline_func(
    pipeline_func=spark_pipeline,
    arguments={},
    enable_caching=False,
)

