In [1]:
import mlrun
import os

# Initialize the MLRun project object
project = mlrun.get_or_create_project('remote-artifacts', user_project=True, context='./')

# Required credentials (read from the notebook's environment):
#   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, GOOGLE_APPLICATION_CREDENTIALS
# Optional: S3_BUCKET (falls back to 'testbucket-igz' below).
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', None)
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', None)
GOOGLE_APPLICATION_CREDENTIALS = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', None)

secrets = {'AWS_ACCESS_KEY_ID': AWS_ACCESS_KEY_ID,
           'AWS_SECRET_ACCESS_KEY': AWS_SECRET_ACCESS_KEY,
           'GOOGLE_APPLICATION_CREDENTIALS': GOOGLE_APPLICATION_CREDENTIALS}

# Fail fast before anything is stored: an unset OR empty variable is unusable,
# and the message names exactly which ones need to be provided.
missing_credentials = [name for name, value in secrets.items() if not value]
assert not missing_credentials, "please provide credentials: " + ", ".join(missing_credentials)

# Store the credentials as project secrets using the 'kubernetes' provider.
project.set_secrets(secrets=secrets, provider='kubernetes')

S3_BUCKET = os.environ.get('S3_BUCKET', 'testbucket-igz')

# Build the S3 URL explicitly; this is a URL, not a filesystem path.
project.artifact_path = f's3://{S3_BUCKET}/remote-artifacts/'

> 2023-01-11 08:50:44,538 [info] loaded project remote-artifacts from MLRun DB


In [2]:
#mlrun: start-code
from pyspark.sql import SparkSession
import mlrun
import os
import pandas as pd

def get_dataitem(context: mlrun.MLClientCtx,
                 key: str):
    """Return a data item for the artifact logged under ``key`` in this run.

    Scans ``context.artifacts`` for the first entry whose ``metadata.key``
    equals ``key`` and opens its location with ``mlrun.get_dataitem``.

    :param context: MLRun run context whose ``artifacts`` list is searched.
    :param key:     artifact key to look for.
    :return: the object returned by ``mlrun.get_dataitem`` for the matching
             artifact, or ``None`` (after logging 'Artifact not found') when
             no artifact has that key.
    """
    for artifact in context.artifacts:
        if artifact['metadata'].get('key', None) != key:
            continue

        target_path = artifact['spec']['target_path']
        if artifact.get('kind') == 'model':
            # For models the data lives in `model_file` under `target_path`.
            # Insert the separator when the target path lacks a trailing '/',
            # otherwise plain concatenation would fuse the two names.
            if not target_path.endswith('/'):
                target_path += '/'
            target_path += artifact['spec']['model_file']

        # Datasets and every other artifact kind are addressed by target_path as-is.
        return mlrun.get_dataitem(target_path)

    context.logger.info('Artifact not found')
    return None

def spark_func(context: mlrun.MLClientCtx,
               artifact_target_path: str):
    """Log a file as an artifact, read it back, and round-trip it through Spark.

    Steps, in order:
      1. Get or create a SparkSession configured with the S3A access/secret
         keys taken from the run secrets.
      2. Log ``artifact_target_path`` as the artifact 'spark_s3_artifact'.
      3. Fetch that artifact via ``get_dataitem`` and load it with ``as_df()``.
      4. Convert it to a Spark DataFrame, show it, and save it as CSV.

    :param context:              MLRun run context (secrets, logging, artifacts).
    :param artifact_target_path: path of the file to log; also used as the
                                 CSV output location.
    :raises RuntimeError: if the artifact logged in step 2 cannot be found
                          in the run context afterwards.
    """
    spark = SparkSession \
        .builder \
        .appName("Python Spark SQL basic example") \
        .config("spark.some.config.option", "some-value") \
        .config("fs.s3a.access.key", context.get_secret('AWS_ACCESS_KEY_ID'))\
        .config("fs.s3a.secret.key", context.get_secret('AWS_SECRET_ACCESS_KEY'))\
        .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")\
        .config("com.amazonaws.services.s3.enableV4", True)\
        .config("spark.driver.extraJavaOptions", "-Dcom.amazonaws.services.s3.enableV4=true")\
        .getOrCreate()

    context.log_artifact('spark_s3_artifact', local_path=artifact_target_path)
    context.logger.info('artifact logged')

    # get_dataitem returns None when the key is absent; fail with a clear
    # message instead of an AttributeError on None.as_df().
    dataitem = get_dataitem(context, 'spark_s3_artifact')
    if dataitem is None:
        raise RuntimeError("artifact 'spark_s3_artifact' was not found in the run context")

    df = dataitem.as_df()
    context.logger.info('getting logged artifact with custom function')
    sparkDF = spark.createDataFrame(df)
    sparkDF.show()
    # NOTE(review): the CSV is saved to the same path the artifact was logged
    # from — confirm that writing over the input location is intended.
    sparkDF.write.format('csv').save(artifact_target_path)
    
#mlrun: end-code

In [4]:
# For Spark operator
# NOTE(review): presumably this builds/deploys the default Spark image that the
# function definition below refers to as '.spark-job-default-image' — confirm
# against the MLRun Spark runtime documentation.
from mlrun.runtimes import Spark3Runtime
Spark3Runtime.deploy_default_image()

In [5]:
# Turn the code between the `#mlrun: start-code` / `#mlrun: end-code` markers
# into an MLRun function of kind 'spark' with `spark_func` as its handler.
sj = mlrun.code_to_function(name='spark_func',
                            kind='spark',
                            image='.spark-job-default-image',
                            handler='spark_func')

# set spark driver config (gpu_type & gpus=<number_of_gpus>  supported too)
# The CPU limit must not be lower than the CPU request (1 core = 1000m);
# a limit of "500m" with a request of 1 is an invalid resource spec.
sj.with_driver_limits(cpu="1300m")
sj.with_driver_requests(cpu=1, mem="512m")

# set spark executor config (gpu_type & gpus=<number_of_gpus> are supported too)
# Same rule as for the driver: limit >= request.
sj.with_executor_limits(cpu="1400m")
sj.with_executor_requests(cpu=1, mem="512m")

# adds fuse, daemon & iguazio's jars support
sj.with_igz_spark()

# Register the function on the project, then apply the S3 mount modifier to it.
project.set_function(name='spark_func', func=sj)
project.get_function('spark_func').apply(mlrun.platforms.mount_s3())

# Task template that requests the project secrets (kubernetes provider) for the run.
task = mlrun.new_task().with_secrets("kubernetes", ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "GOOGLE_APPLICATION_CREDENTIALS"])



In [6]:
# Show the storage location of the dataset artifact used as input to the Spark run.
# NOTE(review): this artifact is not logged anywhere in this notebook — presumably
# it is produced by an earlier run/notebook in the same project; confirm it exists.
project.get_artifact('log-transactions-log_transactions_iris_dataset-s3').target_path

's3://testbucket-igz/remote-artifacts/log-transactions-log_transactions/0/iris_dataset-s3.csv'

In [7]:
# Execute the registered Spark handler with local=True, passing the stored
# location of the iris dataset artifact as the `artifact_target_path` input.
spark_fn = project.get_function('spark_func')
iris_artifact = project.get_artifact('log-transactions-log_transactions_iris_dataset-s3')
run_inputs = {'artifact_target_path': iris_artifact.target_path}
spark_fn.run(task, inputs=run_inputs, local=True)

> 2023-01-11 08:51:02,937 [info] starting run spark-func-spark_func uid=1f88dd1cf5144ff6897fd6f3dd405a52 DB=http://mlrun-api:8080
> 2023-01-11 08:51:05,621 [info] downloading s3://testbucket-igz/remote-artifacts/log-transactions-log_transactions/0/iris_dataset-s3.csv to local temp file
> 2023-01-11 08:51:13,084 [info] artifact logged
> 2023-01-11 08:51:18,676 [info] getting logged artifact with custom function
+----------+---+---+---+---+
|Unnamed: 0|  0|  1|  2|  3|
+----------+---+---+---+---+
|         0|5.1|3.5|1.4|0.2|
|         1|4.9|3.0|1.4|0.2|
|         2|4.7|3.2|1.3|0.2|
|         3|4.6|3.1|1.5|0.2|
|         4|5.0|3.6|1.4|0.2|
|         5|5.4|3.9|1.7|0.4|
|         6|4.6|3.4|1.4|0.3|
|         7|5.0|3.4|1.5|0.2|
|         8|4.4|2.9|1.4|0.2|
|         9|4.9|3.1|1.5|0.1|
|        10|5.4|3.7|1.5|0.2|
|        11|4.8|3.4|1.6|0.2|
|        12|4.8|3.0|1.4|0.1|
|        13|4.3|3.0|1.1|0.1|
|        14|5.8|4.0|1.2|0.2|
|        15|5.7|4.4|1.5|0.4|
|        16|5.4|3.9|1.3|0.4|
|     

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
remote-artifacts-admin,...dd405a52,0,Jan 11 08:51:05,completed,spark-func-spark_func,v3io_user=adminkind=owner=adminhost=jupyter-6479d4cd8c-rnxrj,artifact_target_path,,,spark_s3_artifact





> 2023-01-11 08:51:26,608 [info] run executed, status=completed


<mlrun.model.RunObject at 0x7fa4e5cf1fd0>

## Cleanup

In [9]:
import boto3
from urllib.parse import urlparse


# Delete every object stored under the project's artifact prefix, then remove
# the project itself (cascade strategy) from the MLRun DB.
s3_resource = boto3.resource('s3')

artifact_url = urlparse(project.artifact_path)
bucket_name = artifact_url.netloc
key_prefix = artifact_url.path[1:]  # strip the leading '/' of the URL path

artifacts_bucket = s3_resource.Bucket(bucket_name)
artifacts_bucket.objects.filter(Prefix=key_prefix).delete()

run_db = mlrun.get_run_db()
run_db.delete_project(name=project.name, deletion_strategy='cascade')