In [None]:
import mlrun
import os

# Initialize the MLRun project object
project = mlrun.get_or_create_project('remote-artifacts', user_project=True, context='./')

# Required credentials (must already be present in the environment):
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, GOOGLE_APPLICATION_CREDENTIALS
# S3_BUCKET is optional and falls back to a default bucket name (see below).
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', None)
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', None)
GOOGLE_APPLICATION_CREDENTIALS = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', None)

secrets = {'AWS_ACCESS_KEY_ID': AWS_ACCESS_KEY_ID,
           'AWS_SECRET_ACCESS_KEY': AWS_SECRET_ACCESS_KEY,
           'GOOGLE_APPLICATION_CREDENTIALS': GOOGLE_APPLICATION_CREDENTIALS}

# Validate BEFORE writing to os.environ: assigning None to an os.environ key
# raises TypeError, which would hide the helpful message below.
missing_credentials = [name for name, value in secrets.items() if value is None]
assert not missing_credentials, f"please provide credentials: {', '.join(missing_credentials)}"

# Setting as env variables
os.environ.update(secrets)

# Store the same credentials as project secrets using the 'kubernetes' provider.
project.set_secrets(secrets=secrets, provider='kubernetes')

S3_BUCKET = os.environ.get('S3_BUCKET', 'testbucket-igz-temp')

assert project.artifact_path is not None, 'please run 01_mlrun_job.ipynb'

In [None]:
#mlrun: start-code
from pyspark.sql import SparkSession
import mlrun
import os
import pandas as pd

from packaging.version import Version
# pandas 2.0 removed DataFrame.iteritems. Restore it as an alias of
# DataFrame.items whenever it is missing, instead of comparing version
# strings (a strict "> 2.0.0" comparison skips pandas 2.0.0 itself).
# NOTE(review): presumably needed by spark.createDataFrame(<pandas df>) on
# older PySpark releases - confirm against the PySpark version in the image.
if not hasattr(pd.DataFrame, "iteritems"):
    pd.DataFrame.iteritems = pd.DataFrame.items


def get_dataitem(context: mlrun.MLClientCtx,
                 key: str):
    """Find an artifact of the current run by its key and open it as a data item.

    :param context: MLRun run context whose ``artifacts`` are searched.
    :param key:     value compared against each artifact's ``metadata['key']``.

    :returns: the result of ``mlrun.get_dataitem`` for the first matching artifact.
              For artifacts of kind ``'model'`` the URL is ``spec['target_path']``
              concatenated with ``spec['model_file']``; for every other kind it is
              ``spec['target_path']``. When nothing matches, 'Artifact not found'
              is logged and ``None`` is returned implicitly.
    """
    for entry in context.artifacts:
        kind = entry['kind']
        if entry['metadata'].get('key', None) != key:
            continue

        spec = entry['spec']
        location = spec['target_path']
        if kind == 'model':
            # Plain string concatenation, no separator is inserted.
            location = location + spec['model_file']
        return mlrun.get_dataitem(location)

    context.logger.info('Artifact not found')

def spark_func(context: mlrun.MLClientCtx,
               artifact_target_path: str):
    """Log an artifact, read it back through ``get_dataitem`` and round-trip it via Spark.

    Steps:
      1. Create (or reuse) a Spark session configured for the S3A file system.
      2. Log ``artifact_target_path`` as the artifact ``'spark_s3_artifact'``.
      3. Fetch that artifact with ``get_dataitem`` and load it as a pandas DataFrame.
      4. Convert it to a Spark DataFrame, show it and write it as CSV to
         ``artifact_target_path``.

    :param context:              MLRun run context (artifact logging, secrets, logger).
    :param artifact_target_path: path logged as the artifact and used as the CSV output path.

    :raises RuntimeError: if the artifact that was just logged cannot be found in the context.
    """
    builder = (
        SparkSession.builder
        .appName("Python Spark SQL basic example")
        .config("spark.some.config.option", "some-value")
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
        .config("com.amazonaws.services.s3.enableV4", True)
        .config("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")
    )

    # Hadoop options only reach the Hadoop configuration when they carry the
    # "spark.hadoop." prefix, so the S3A credentials are set with it.
    # Credentials are only applied when the secret is actually available, so a
    # missing secret does not override other credential sources with a bogus value.
    s3a_credentials = {
        "spark.hadoop.fs.s3a.access.key": context.get_secret('AWS_ACCESS_KEY_ID'),
        "spark.hadoop.fs.s3a.secret.key": context.get_secret('AWS_SECRET_ACCESS_KEY'),
    }
    for option, value in s3a_credentials.items():
        if value is not None:
            builder = builder.config(option, value)

    spark = builder.getOrCreate()

    try:
        context.log_artifact('spark_s3_artifact', local_path=artifact_target_path)
        context.logger.info('artifact logged')

        context.logger.info('getting logged artifact with custom function')
        dataitem = get_dataitem(context, 'spark_s3_artifact')
        if dataitem is None:
            # get_dataitem only logs and returns None when nothing matches;
            # fail with an explicit message instead of an AttributeError on None.
            raise RuntimeError("artifact 'spark_s3_artifact' was not found in the run context")
        df = dataitem.as_df()

        sparkDF = spark.createDataFrame(df)
        sparkDF.show()
        sparkDF.write.format('csv').save(artifact_target_path)
    finally:
        # Always release the Spark session, even when a step above fails.
        spark.stop()
    
#mlrun: end-code

In [None]:
# For Spark operator
# Deploy the default image of the Spark 3 runtime.
# NOTE(review): presumably this is the image referenced later in this notebook
# as '.spark-job-default-image' - confirm.
from mlrun.runtimes import Spark3Runtime
Spark3Runtime.deploy_default_image()

In [None]:
# Build a 'spark' kind MLRun function from this notebook's code, with
# spark_func as the handler and the default Spark job image.
sj = mlrun.code_to_function(name='spark_func',
                            kind='spark',
                            image='.spark-job-default-image',
                            handler='spark_func')

# set spark driver config (gpu_type & gpus=<number_of_gpus>  supported too)
# The CPU limit must not be lower than the CPU request (Kubernetes rejects a
# container whose request exceeds its limit), so the limit sits above cpu=1.
sj.with_driver_limits(cpu="1300m")
sj.with_driver_requests(cpu=1, mem="512m")

# set spark executor config (gpu_type & gpus=<number_of_gpus> are supported too)
# Same rule as for the driver: limit >= request.
sj.with_executor_limits(cpu="1400m")
sj.with_executor_requests(cpu=1, mem="512m")

# adds fuse, daemon & iguazio's jars support
sj.with_igz_spark()

# Register the function on the project, then apply the S3 mount modifier to
# the function object returned by the project.
project.set_function(name='spark_func', func=sj)
project.get_function('spark_func').apply(mlrun.platforms.mount_s3())

# Task template that requests the credentials from the 'kubernetes' secret provider.
task = mlrun.new_task().with_secrets(
    "kubernetes",
    ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "GOOGLE_APPLICATION_CREDENTIALS"],
)

In [None]:
# Display the target path of the artifact that is passed to the Spark job as
# its 'artifact_target_path' input.
project.get_artifact('log-transactions-log-transactions_iris_dataset-s3').target_path

In [None]:
# Run the registered function in the local process (local=True), feeding it
# the target path of the previously logged artifact.
project.get_function('spark_func').run(
    task,
    inputs={
        'artifact_target_path': project.get_artifact(
            'log-transactions-log-transactions_iris_dataset-s3'
        ).target_path,
    },
    local=True,
)

## Cleanup

In [None]:
import boto3
from urllib.parse import urlparse


# Split the project's artifact path once: the network location is used as the
# bucket name and the path (without its leading "/") as the object key prefix.
artifact_url = urlparse(project.artifact_path)
bucket_name = artifact_url.netloc
key_prefix = artifact_url.path[1:]

# Safety guard: an empty prefix matches every object in the bucket, so never
# issue the delete unless both a bucket and a non-empty prefix were derived.
if not bucket_name or not key_prefix:
    raise ValueError(
        f"refusing to clean up: cannot derive an S3 bucket and key prefix "
        f"from artifact path {project.artifact_path!r}"
    )

# Delete the objects under the project's artifact prefix, then the project
# itself (cascade strategy).
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)
bucket.objects.filter(Prefix=key_prefix).delete()
mlrun.get_run_db().delete_project(name=project.name, deletion_strategy='cascade')

In [None]:
import shutil
# Remove everything in the current working directory except notebooks
# (names ending with 'ipynb') and the '.test' entry.
for f in os.listdir():
    if f.endswith('ipynb') or f == '.test':
        continue

    if os.path.isfile(f) or os.path.islink(f):
        # Symlinks (including broken ones and links to directories) are
        # unlinked rather than followed; shutil.rmtree refuses symlinks.
        os.remove(f)
    elif os.path.isdir(f):
        shutil.rmtree(f)
    else:
        # Raising a plain string is itself a TypeError; raise a real exception
        # and name the offending entry.
        raise RuntimeError(f"A file that is not a notebook wasn't deleted: {f}")