# MLRun Job Runtime Remote Artifacts

In [1]:
!pip install xgboost~=1.7.5



In [2]:
import mlrun
import os
import random

# Initialize the MLRun project object
project = mlrun.get_or_create_project('remote-artifacts', user_project=True, context='./')

# Required credentials (read from the notebook's environment):
#   AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, GOOGLE_APPLICATION_CREDENTIALS
# Optional: S3_BUCKET (a default bucket name is used when it is not set)
AWS_ACCESS_KEY_ID = os.environ.get('AWS_ACCESS_KEY_ID', None)
AWS_SECRET_ACCESS_KEY = os.environ.get('AWS_SECRET_ACCESS_KEY', None)
GOOGLE_APPLICATION_CREDENTIALS = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS', None)

secrets = {'AWS_ACCESS_KEY_ID': AWS_ACCESS_KEY_ID,
           'AWS_SECRET_ACCESS_KEY': AWS_SECRET_ACCESS_KEY,
           'GOOGLE_APPLICATION_CREDENTIALS': GOOGLE_APPLICATION_CREDENTIALS}

# Validate BEFORE writing to os.environ: assigning None to an os.environ key raises
# TypeError, which would otherwise hide the intended "please provide credentials" message.
assert all(value is not None for value in secrets.values()), "please provide credentials"

# Setting as env variables
for secret_name, secret_value in secrets.items():
    os.environ[secret_name] = secret_value

project.set_secrets(secrets=secrets, provider='kubernetes')

S3_BUCKET = os.environ.get('S3_BUCKET', 'testbucket-igz-temp')

# Build the URL with plain string formatting: os.path.join is meant for filesystem
# paths and would silently drop the 's3://' prefix if the bucket value started with '/'.
# The random suffix gives each notebook session its own artifact sub-path.
project.artifact_path = f's3://{S3_BUCKET}/remote-artifacts/{random.randint(0, 10000)}'

> 2023-03-06 07:36:39,998 [info] loaded project remote-artifacts from MLRun DB


In [3]:
#mlrun: start-code

import mlrun
import pandas as pd 
import json
import os
from xgboost import XGBClassifier
import pickle
from mlrun.artifacts.base import DirArtifact
from mlrun import MLClientCtx
from sklearn.datasets import load_iris
from io import BytesIO
from sklearn.model_selection import train_test_split

def get_dataitem(context: MLClientCtx, key: str):
    """Return a data item for the first artifact of the run whose key equals ``key``.

    :param context: run context; its ``artifacts`` are iterated as dicts holding
                    'kind', 'metadata' and 'spec' entries, and its ``logger`` is used
                    when nothing matches.
    :param key:     artifact key to look for (compared with ``metadata['key']``).

    :returns: ``mlrun.get_dataitem(...)`` for the matching artifact. For a 'model'
              artifact the URL is ``target_path`` immediately followed by
              ``model_file``; for any other kind it is ``target_path`` alone.
              ``None`` (after logging 'Artifact not found') when no artifact matches.
    """
    for entry in context.artifacts:
        entry_kind = entry['kind']
        if entry['metadata'].get('key', None) != key:
            continue

        spec = entry['spec']
        url = spec['target_path']
        if entry_kind == 'model':
            # Plain concatenation, no separator is inserted between the two parts
            url = url + spec['model_file']
        return mlrun.get_dataitem(url)

    context.logger.info('Artifact not found')
    
def log_transactions(context: MLClientCtx):
    """Log a raw artifact, a model, a directory and a dataset, reading each back.

    Every item is written through ``context`` under ``context.artifact_path`` and then
    fetched again with ``get_dataitem`` so that both the write and the read path of the
    configured store are exercised. Shapes / classes of what was read back are logged.

    :param context: MLRun run context used for logging artifacts and messages.

    :returns: None
    """
    # First two characters of the artifact path, used as a suffix in artifact keys
    # ('s3' / 'gs' for the artifact paths configured in this notebook).
    store_tag = context.artifact_path[:2]
    encoded_key = 'encoded_iris-' + store_tag
    dataset_key = 'iris_dataset-' + store_tag

    # Load the iris data once and reuse it for both the artifact and the training set
    iris = load_iris()

    # uploading new artifact
    # NOTE(review): the body is JSON although the file name ends with '.csv';
    # kept unchanged so the stored artifact name stays the same - confirm intent.
    df_encode = pd.DataFrame(iris['data']).to_json().encode()
    context.log_artifact(encoded_key, body=df_encode, local_path=encoded_key + '.csv')
    # reading artifact
    trans_df = pd.DataFrame(json.loads(get_dataitem(context, encoded_key).get()))
    context.logger.info(f'encoded dataframe shape : {trans_df.shape}')

    # training the model (for serving purposes )
    clf = XGBClassifier(n_estimators=2, max_depth=2, learning_rate=1, objective='binary:logistic')
    X, y = iris.data, iris.target
    # Only the training part of the split is used; the held-out part is discarded
    X_train, _, y_train, _ = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
    clf.fit(X_train, y_train)
    # logging a model
    context.log_model('clf_model', body=pickle.dumps(clf), model_file='clf.pkl')
    # getting the model remote
    # SECURITY: pickle.loads can execute arbitrary code from its payload. The bytes read
    # here are the ones this function just logged; do not reuse this for untrusted artifacts.
    model = pickle.loads(get_dataitem(context, 'clf_model').get())
    context.logger.info(f'logged model : {model.__class__}')

    # Logging directory
    context.log_artifact(DirArtifact(key='my_project', target_path=context.artifact_path))

    # Logging dataset
    # NOTE(review): this fetches the encoded artifact a second time although trans_df
    # already holds the same content - presumably to exercise the remote read again; confirm.
    dataset_df = pd.DataFrame(json.loads(get_dataitem(context, encoded_key).get()))
    context.log_dataset(key=dataset_key,
                        df=dataset_df,
                        local_path=dataset_key + '.csv')
    # Getting dataset
    context.logger.info(f'logged dataset shape {get_dataitem(context, dataset_key).as_df().shape}')
            
#mlrun: end-code

## MLRun job using S3 and GCS (local and remote)

In [4]:
# Name shared by the registered function and its handler
FUNCTION_NAME = 'log_transactions'
# Secret keys the task pulls from the 'kubernetes' secrets provider
SECRET_KEYS = ["AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY", "GOOGLE_APPLICATION_CREDENTIALS"]

# Register the handler as a job function on the project, then apply the S3 mount modifier to it
project.set_function(name=FUNCTION_NAME, kind='job', image='mlrun/mlrun', handler=FUNCTION_NAME)
project.get_function(FUNCTION_NAME).apply(mlrun.platforms.mount_s3())

# Task definition carrying the secrets; it is reused by the later run cell as well
task = mlrun.new_task().with_secrets("kubernetes", SECRET_KEYS)

# Kept as the last expression so the returned RunObject is displayed by the cell
project.get_function(FUNCTION_NAME).run(task, FUNCTION_NAME, local=True)

> 2023-03-06 07:37:08,945 [info] starting run log-transactions-log-transactions uid=30379c65dc7441e286e0a566825216ca DB=http://mlrun-api:8080
> 2023-03-06 07:37:19,471 [info] encoded dataframe shape : (150, 4)
> 2023-03-06 07:37:20,489 [info] logged model : <class 'xgboost.sklearn.XGBClassifier'>
> 2023-03-06 07:37:33,001 [info] logged dataset shape (150, 5)


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
remote-artifacts-normal-user,...825216ca,0,Mar 06 07:37:09,completed,log-transactions-log-transactions,v3io_user=normal-userkind=owner=normal-userhost=jupyter-v5afxg4hcf-at3sd-77c89bbcd5-svv9f,,,,encoded_iris-s3clf_modelmy_projectiris_dataset-s3





> 2023-03-06 07:37:33,213 [info] run executed, status=completed


<mlrun.model.RunObject at 0x7f656862a710>

In [5]:
# Switching to GCS: replace the first two characters of the current artifact path with 'gs'
gcs_artifact_path = 'gs' + project.artifact_path[2:]
project.artifact_path = gcs_artifact_path

# Same task and handler as the previous run; kept last so the RunObject is displayed
project.get_function('log_transactions').run(task, 'log_transactions', local=True)

> 2023-03-06 07:37:33,230 [info] starting run log-transactions-log-transactions uid=6e6b7801372e4fb289f76dfc5e768e97 DB=http://mlrun-api:8080
> 2023-03-06 07:37:35,909 [info] encoded dataframe shape : (150, 4)
> 2023-03-06 07:37:38,337 [info] logged model : <class 'xgboost.sklearn.XGBClassifier'>
> 2023-03-06 07:37:41,006 [info] logged dataset shape (150, 5)


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
remote-artifacts-normal-user,...5e768e97,0,Mar 06 07:37:33,completed,log-transactions-log-transactions,v3io_user=normal-userkind=owner=normal-userhost=jupyter-v5afxg4hcf-at3sd-77c89bbcd5-svv9f,,,,encoded_iris-gsclf_modelmy_projectiris_dataset-gs





> 2023-03-06 07:37:41,223 [info] run executed, status=completed


<mlrun.model.RunObject at 0x7f656862a7d0>

In [6]:
# Save the project after the runs above; the cell displays the returned MlrunProject object.
project.save()

<mlrun.projects.project.MlrunProject at 0x7f6583671b90>