# Running Presto SQL Queries from MLRun Job

This notebook demonstrates how to create and test an MLRun job that reads data via Presto


In [1]:
# Run the helper script that checks the MLRun client version against the server version
# (presumably it installs a matching client when they differ - verify in the script).
!/User/align_mlrun.sh

Both server & client are aligned (0.6.0rc11).


#### Copy the secret files needed for Presto to a local v3io directory, which will later be mounted
#### inside the MLRun job image


In [2]:
# Create a secrets folder under the user's v3io home (-p: no error if it already exists)
!mkdir -p /v3io/${V3IO_HOME}/secrets
# Copy the platform secret files into that folder so they can be mounted into the job later
!cp /var/run/iguazio/secrets/* /v3io/${V3IO_HOME}/secrets

### MLRun job that reads from a table using SQLAlchemy

In [3]:
# nuclio: start-code

In [4]:
import pandas as pd 
import os 
from sqlalchemy.engine import create_engine 
import pyhive

def read_from_presto(context, table_path='', filter='BAYN'):
    """Query the min/max ``endprice`` of one mnemonic through Presto and log the result.

    Args:
        context: MLRun execution context; used for logging and for storing the
            resulting dataset artifact.
        table_path: Fully qualified Presto table to read from. It is inserted
            into the SQL text as-is (identifiers cannot be bound as parameters),
            so it must come from a trusted source.
        filter: Value matched against the ``mnemonic`` column. It is sent as a
            bound parameter instead of being concatenated into the statement.

    Raises:
        ValueError: If ``table_path`` is empty.
        RuntimeError: If the ``DATABASE_URL`` environment variable is not set.
    """
    if not table_path:
        raise ValueError('table_path must be a non-empty Presto table name')

    # DATABASE_URL contains the Presto URL, as well as access key and location of secrets
    database_url = os.getenv('DATABASE_URL')
    if not database_url:
        raise RuntimeError('DATABASE_URL environment variable is not set')

    # Local import keeps this function self-contained; sqlalchemy is already required by this code
    from sqlalchemy import text

    # note - make sure to create and populate the stocks_tab table in advance - (check the getting started section in the collect-n-explore notebook)
    query = ('select min(endprice) min_endprice,max(endprice) max_endprice from '
             + table_path + ' where mnemonic = :mnemonic')
    context.logger.info(f'{query} [mnemonic={filter!r}]')

    engine = create_engine(database_url)
    try:
        # The filter value travels as a bound parameter, so quotes in it cannot alter the statement
        df = pd.read_sql(text(query), engine, params={'mnemonic': filter})
    finally:
        # Release the engine's pooled connections once the query is done
        engine.dispose()

    # Store the data set in your artifacts database
    context.log_dataset('table_content', df=df)


In [5]:
# nuclio: end-code

### Test locally 

In [6]:
from mlrun import code_to_function,mount_v3io

# Convert this notebook's code (the section between the "nuclio: start-code" and
# "nuclio: end-code" markers) into an MLRun function of kind 'job'
read_presto_func = code_to_function(name='read-from-presto',kind='job')


##### Define table name and filter. 

##### Note - make sure to create and populate the stocks_tab table in advance (see the getting started section in the collect-n-explore notebook)


In [7]:
# Presto path of the stocks table that lives under the current user's v3io directory
v3io_user = os.getenv("V3IO_USERNAME")
table_path = 'v3io.users."{}/examples/stocks/stocks_tab"'.format(v3io_user)

# Mnemonic value the query filters on
filter = 'BAYN'

In [8]:
# Values forwarded to the handler: the table to query and the mnemonic to match.
# NOTE(review): these are plain strings passed through `inputs`; confirm whether
# `params` is the intended channel for such values.
local_run_inputs = {'table_path': table_path, 'filter': filter}

# local=True executes the handler locally instead of submitting a cluster job
read_presto_run = read_presto_func.run(
    name='read_from_presto',
    handler='read_from_presto',
    inputs=local_run_inputs,
    local=True,
)

> 2021-01-27 10:15:20,437 [info] starting run read_from_presto uid=421695a177324b7c8243e22ded0d77d3 DB=http://mlrun-api:8080
> 2021-01-27 10:15:20,737 [info] select min(endprice) min_endprice,max(endprice) max_endprice from v3io.users."nir/examples/stocks/stocks_tab"  where mnemonic = 'BAYN'


project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
default,...ed0d77d3,0,Jan 27 10:15:20,completed,read_from_presto,v3io_user=nirkind=owner=nirhost=jupyter-nir-7b7f98977b-mdrsf,table_pathfilter,,,table_content


to track results use .show() or .logs() or in CLI: 
!mlrun get run 421695a177324b7c8243e22ded0d77d3 --project default , !mlrun logs 421695a177324b7c8243e22ded0d77d3 --project default
> 2021-01-27 10:15:24,112 [info] run executed, status=completed


#### Build the image needed to run job on the cluster
#### Requires pip-installing the required libraries and setting the needed environment variables

In [9]:
# Commands executed while building the job image: install the v3io PyHive fork
# and a pinned SQLAlchemy release
presto_build_commands = [
    "python -m pip install git+https://github.com/v3io/PyHive.git@v0.6.999",
    "python -m pip install sqlalchemy==1.3.14",
]
read_presto_func.spec.build.commands = presto_build_commands

# Pass the notebook's DATABASE_URL value on to the job's environment
# (kept as the last expression so the runtime object is shown as the cell output)
read_presto_func.set_envs({'DATABASE_URL': os.getenv('DATABASE_URL')})
#read_presto_func.spec.build.baseImage = ".mlrun/func-test1-read-from-presto-latest"

<mlrun.runtimes.kubejob.KubejobRuntime at 0x7f8253966b90>

In [None]:
# Start the remote image build for the function, using the build commands set on its spec
read_presto_func.deploy()

> 2021-01-27 10:15:24,130 [info] starting remote build, image: .mlrun/func-default-read-from-presto-latest
[36mINFO[0m[0020] Retrieving image manifest mlrun/mlrun:0.6.0-rc11 
[36mINFO[0m[0020] Retrieving image manifest mlrun/mlrun:0.6.0-rc11 
[36mINFO[0m[0021] Built cross stage deps: map[]                
[36mINFO[0m[0021] Retrieving image manifest mlrun/mlrun:0.6.0-rc11 
[36mINFO[0m[0021] Retrieving image manifest mlrun/mlrun:0.6.0-rc11 
[36mINFO[0m[0021] Executing 0 build triggers                   
[36mINFO[0m[0021] Unpacking rootfs as cmd RUN python -m pip install git+https://github.com/v3io/PyHive.git@v0.6.999 requires it. 
[36mINFO[0m[0035] RUN python -m pip install git+https://github.com/v3io/PyHive.git@v0.6.999 
[36mINFO[0m[0035] Taking snapshot of full filesystem...        


### Run the job in the cluster

#### Mount secrets as /var/run/iguazio/secrets - this is the location where DATABASE_URL is configured to read from


In [None]:
# Mount the copied secrets folder (~/secrets on v3io) at the path DATABASE_URL reads from
read_presto_func.apply(mount_v3io(remote='~/secrets', mount_path='/var/run/iguazio/secrets'))

# Reuse the table_path / filter values defined for the local test instead of a
# hard-coded user path, and pass the filter under the handler's real parameter
# name ('filter'); the handler has no 'mnemonic' argument.
read_presto_run = read_presto_func.run(name='read_from_presto',
                                 handler='read_from_presto',
                                 inputs={'table_path': table_path,
                                        'filter': filter},
                                 local=False)