In [None]:
import mlrun
import os

# Load the MLRun project whose context directory is the notebook's working directory.
project = mlrun.load_project(context="./")

# Required credentials:
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY and GOOGLE_APPLICATION_CREDENTIALS must be available both as Jupyter environment variables and as project secrets (Kubernetes provider).

In [None]:
#mlrun: start-code
from pyspark.sql import SparkSession
import mlrun
import os

def spark_func(context: mlrun.MLClientCtx):
    """Convert ``transactions_cut.csv`` under the run's artifact path to Parquet.

    Gets or creates a Spark session configured for the S3A filesystem, reads
    ``transactions_cut.csv`` from ``context.artifact_path``, shows its first
    rows, and writes the data to ``transactions_cut.parquet`` under the same
    artifact path.

    :param context: MLRun execution context. Supplies ``artifact_path`` and the
                    ``AWS_ACCESS_KEY_ID`` / ``AWS_SECRET_ACCESS_KEY`` secrets.
    """
    builder = (
        SparkSession.builder
        .appName("Python Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("com.amazonaws.services.s3.enableV4", True)
        .config("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
    )

    # Pass explicit S3A credentials only when the secret is actually available,
    # so a missing secret is never handed to Spark as a bogus key value.
    s3a_credentials = (
        ("fs.s3a.access.key", "AWS_ACCESS_KEY_ID"),
        ("fs.s3a.secret.key", "AWS_SECRET_ACCESS_KEY"),
    )
    for option_name, secret_name in s3a_credentials:
        secret_value = context.get_secret(secret_name)
        if secret_value:
            builder = builder.config(option_name, secret_value)

    spark = builder.getOrCreate()

    # Join with exactly one "/" so the result is correct whether or not the
    # artifact path ends with a trailing slash.
    base_path = context.artifact_path.rstrip("/")
    csv_path = base_path + "/" + "transactions_cut.csv"
    parquet_path = base_path + "/" + "transactions_cut.parquet"

    # NOTE(review): the CSV is read without a "header" option, so the first
    # line is treated as data -- confirm this is intended.
    sparkDF = spark.read.format("csv").load(csv_path)
    sparkDF.show()

    # NOTE(review): the "header" option is kept from the original code;
    # presumably it was meant for the CSV read rather than the Parquet write.
    sparkDF.write.option("header", "true").parquet(parquet_path)
    
#mlrun: end-code

In [None]:
# Wrap the notebook code as an MLRun Spark job whose entry point is `spark_func`.
# NOTE(review): the image is hard-coded to one specific cluster registry --
# adjust it when running in another environment.
sj = mlrun.code_to_function(
    name='spark_func',
    kind='spark',
    image='docker-registry.default-tenant.app.jmglmvqnganv.iguazio-cd1.com/mlrun/func-test-notebooks-dani-spark-func:latest',
    handler='spark_func',
)

# Spark driver resources (gpu_type and gpus=<number_of_gpus> are supported too).
sj.with_driver_limits(cpu="1300m")
sj.with_driver_requests(cpu=1, mem="512m")

# Spark executor resources (gpu_type and gpus=<number_of_gpus> are supported too).
sj.with_executor_limits(cpu="1400m")
sj.with_executor_requests(cpu=1, mem="512m")

# Adds fuse, daemon and Iguazio's jars support.
sj.with_igz_spark()

# Register the function in the project, then work with the project's handle.
project.set_function(name='spark_func', func=sj)
spark_fn = project.get_function('spark_func')
spark_fn.apply(mlrun.platforms.mount_s3())

# Expose the cloud credentials to the run from Kubernetes project secrets.
task = mlrun.new_task().with_secrets(
    "kubernetes",
    ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "GOOGLE_APPLICATION_CREDENTIALS"],
)

# The job reads and writes through the S3A filesystem, so rewrite an "s3://"
# artifact path to "s3a://". Any other scheme (including an existing "s3a://")
# is passed through unchanged instead of being corrupted by blind slicing.
spark_artifact_path = project.artifact_path
if spark_artifact_path.startswith('s3://'):
    spark_artifact_path = 's3a://' + spark_artifact_path[len('s3://'):]

spark_fn.run(task, artifact_path=spark_artifact_path)