## Test: Submit an MLRun job that runs on Databricks Spark

In [1]:
# Standard library
import os
# Third-party: pandas builds the mock dataset, mlrun creates and runs the job
import pandas as pd
import mlrun
from mlrun.datastore.sources import ParquetSource
# Record the mlrun client version in the notebook output
print(mlrun.__version__)

1.5.1


### Copy template.env to test.env, then edit test.env to fill in your environment settings

In [2]:
from dotenv import load_dotenv, find_dotenv

# Load the settings from test.env into os.environ.
# The trailing semicolon suppresses the echoed return value without assigning
# to `_`, which IPython uses for the previous cell's output.
load_dotenv(find_dotenv("test.env"));

### Define file paths

In [3]:
# Local path of the handler script (written by the %%writefile cell below)
code_path = "./get_stats.py"
# DBFS destinations passed to the Spark job: the full dataset and the aggregated stats
dbfs_data_path = "dbfs:///music_demo_1/music_data.parquet"
dbfs_stats_path = "dbfs:///music_demo_1/music_stats.parquet"

### Create a project and set the project secrets

In [4]:
# Get (or create) the MLRun project used by this test
project = mlrun.get_or_create_project("databricks-proj")

> 2023-11-30 18:25:27,714 [info] Project loaded successfully: {'project_name': 'databricks-proj'}


In [5]:
# Set the project secrets from the ./.env file.
# NOTE(review): the instructions above and the load_dotenv cell use test.env,
# but this call reads ./.env — confirm which file is intended.
project.set_secrets(file_path="./.env")

In [6]:
# Environment variables that are later added to the function's env spec.
# DATABRICKS_HOST is required (KeyError when unset); DATABRICKS_CLUSTER_ID is
# optional and is None when the variable is not set.
job_env = dict(
    DATABRICKS_HOST=os.environ["DATABRICKS_HOST"],
    DATABRICKS_CLUSTER_ID=os.environ.get("DATABRICKS_CLUSTER_ID"),
)

### Mock up some data

In [7]:
# Mock listener records as (name, age, favorite_music_type); ids are added below
columns = ["id", "name", "age", "favorite_music_type"]
people = [
    ("Alice", 20, "Pop"),
    ("Bob", 30, "Rock"),
    ("Charlie", 25, "Pop"),
    ("David", 40, "Classical"),
    ("Eva", 18, "Pop"),
    ("Frank", 32, "Rock"),
    ("Grace", 28, "Pop"),
    ("Henry", 45, "Classical"),
    ("Ivy", 22, "Pop"),
    ("Jack", 38, "Classical"),
    ("Karen", 27, "Pop"),
    ("Liam", 19, "Pop"),
    ("Mia", 27, "Rock"),
    ("Nora", 31, "Rock"),
    ("Oliver", 29, "Pop"),
    ("Ben", 38, "Pop"),
]
# Prefix every record with a 1-based sequential id so rows line up with `columns`
data = [(person_id, *person) for person_id, person in enumerate(people, start=1)]
df = pd.DataFrame(data=data, columns=columns)

### Create an MLRun function

In [8]:
%%writefile get_stats.py
#  Here is an example of Spark processing.
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, min, max
import pandas as pd
import json

def main(data: str, data_output_path: str, stats_output_path: str):
    """Store the input records as parquet and write per-genre age statistics.

    Args:
        data: JSON string that ``pd.DataFrame`` can load after ``json.loads``;
            it must provide the ``favorite_music_type`` and ``age`` columns.
        data_output_path: Destination of the full dataset, written as parquet
            in overwrite mode.
        stats_output_path: Destination of the aggregated statistics, written
            as parquet in overwrite mode.
    """
    session = SparkSession.builder.appName("MusicDemo").getOrCreate()

    # JSON string -> pandas DataFrame -> Spark DataFrame
    records = session.createDataFrame(pd.DataFrame(json.loads(data)))
    records.write.mode("overwrite").parquet(data_output_path)

    # avg/min/max are the pyspark.sql.functions imports here, not the builtins
    age_aggregates = [
        avg("age").alias("avg_age"),
        min("age").alias("min_age"),
        max("age").alias("max_age"),
    ]
    stats_by_genre = records.groupBy("favorite_music_type").agg(*age_aggregates)
    stats_by_genre.write.mode("overwrite").parquet(stats_output_path)
    # Print the aggregated table
    stats_by_genre.show()

Overwriting get_stats.py


In [9]:
# Wrap the handler script as an MLRun function of kind "databricks"
function = mlrun.code_to_function(
    name="music_demo",
    kind="databricks",
    filename=code_path,
    image="mlrun/mlrun",
)

# Add the Databricks settings to the function's env spec.
# Optional settings read with os.environ.get (DATABRICKS_CLUSTER_ID) are None
# when unset; skip them so no entry with a None value is added.
for name, val in job_env.items():
    if val is None:
        continue
    function.spec.env.append({"name": name, "value": val})
    
# Run parameters. "data", "data_output_path" and "stats_output_path" match the
# keyword arguments of main() in get_stats.py; "task_parameters" carries the
# task options (here a 15-minute timeout).
params = dict(
    task_parameters={"timeout_minutes": 15},
    data=df.to_json(),
    data_output_path=dbfs_data_path,
    stats_output_path=dbfs_stats_path,
)

### Run the MLRun function, which delegates the Spark job to the Databricks cluster

In [10]:
# Submit the function in this project with main() as the handler
run = function.run(params=params, project=project.name, handler="main")

> 2023-11-30 18:25:27,973 [info] Storing function: {'name': 'music-demo-main', 'uid': 'd90adf3978ce43b8ba78c5565b9df3a5', 'db': 'http://mlrun-api:8080'}
> 2023-11-30 18:25:28,501 [info] Job is running in the background, pod: music-demo-main-5nt5w
> 2023-11-30 18:25:34,400 [info] run with exists cluster_id: 1116-014038-st5lherj
> 2023-11-30 18:25:34,614 [info] starting to poll: 605976299807721
> 2023-11-30 18:25:34,660 [info] workflow intermediate status: mlrun_task_75a4cc70-4414-4293-8a39-e7eeabb4ba24: RunLifeCycleState.PENDING
> 2023-11-30 18:25:36,093 [info] workflow intermediate status: mlrun_task_75a4cc70-4414-4293-8a39-e7eeabb4ba24: RunLifeCycleState.PENDING
> 2023-11-30 18:25:38,663 [info] workflow intermediate status: mlrun_task_75a4cc70-4414-4293-8a39-e7eeabb4ba24: RunLifeCycleState.PENDING
> 2023-11-30 18:25:41,997 [info] workflow intermediate status: mlrun_task_75a4cc70-4414-4293-8a39-e7eeabb4ba24: RunLifeCycleState.PENDING
> 2023-11-30 18:25:46,358 [info] workflow intermedia

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
databricks-proj,...5b9df3a5,0,Nov 30 18:25:32,completed,music-demo-main,v3io_user=xingshengkind=databricksowner=xingshengmlrun/client_version=1.5.1mlrun/client_python_version=3.9.16host=music-demo-main-5nt5w,,"task_parameters={'timeout_minutes': 15, 'spark_app_code': 'CgppbXBvcnQgYXJncGFyc2UKaW1wb3J0IGpzb24KcGFyc2VyID0gYXJncGFyc2UuQXJndW1lbnRQYXJzZXIoKQpwYXJzZXIuYWRkX2FyZ3VtZW50KCdoYW5kbGVyX2FyZ3VtZW50cycpCmhhbmRsZXJfYXJndW1lbnRzID0gcGFyc2VyLnBhcnNlX2FyZ3MoKS5oYW5kbGVyX2FyZ3VtZW50cwpoYW5kbGVyX2FyZ3VtZW50cyA9IGpzb24ubG9hZHMoaGFuZGxlcl9hcmd1bWVudHMpCgojICBIZXJlIGlzIGFuIGV4YW1wbGUgb2YgU3BhcmsgcHJvY2Vzc2luZy4KZnJvbSBweXNwYXJrLnNxbCBpbXBvcnQgU3BhcmtTZXNzaW9uCmZyb20gcHlzcGFyay5zcWwuZnVuY3Rpb25zIGltcG9ydCBhdmcsIG1pbiwgbWF4CmltcG9ydCBwYW5kYXMgYXMgcGQKaW1wb3J0IGpzb24KCmRlZiBtYWluKGRhdGE6IHN0ciwgZGF0YV9vdXRwdXRfcGF0aDogc3RyLCBzdGF0c19vdXRwdXRfcGF0aDogc3RyKToKICAgIHNwYXJrID0gU3BhcmtTZXNzaW9uLmJ1aWxkZXIuYXBwTmFtZSgiTXVzaWNEZW1vIikuZ2V0T3JDcmVhdGUoKQogICAganNvbl9kaWN0ID0ganNvbi5sb2FkcyhkYXRhKQogICAgcGFuZGFzX2RmID0gcGQuRGF0YUZyYW1lKGpzb25fZGljdCkKICAgIHNwYXJrX2RmID0gc3BhcmsuY3JlYXRlRGF0YUZyYW1lKHBhbmRhc19kZikKICAgIHNwYXJrX2RmLndyaXRlLm1vZGUoIm92ZXJ3cml0ZSIpLnBhcnF1ZXQoZGF0YV9vdXRwdXRfcGF0aCkKICAgIG11c2ljX3N0YXRzID0gc3BhcmtfZGYuZ3JvdXBCeSgiZmF2b3JpdGVfbXVzaWNfdHlwZSIpLmFnZygKICAgICAgICBhdmcoImFnZSIpLmFsaWFzKCJhdmdfYWdlIiksCiAgICAgICAgbWluKCJhZ2UiKS5hbGlhcygibWluX2FnZSIpLAogICAgICAgIG1heCgiYWdlIikuYWxpYXMoIm1heF9hZ2UiKQogICAgKQogICAgbXVzaWNfc3RhdHMud3JpdGUubW9kZSgib3ZlcndyaXRlIikucGFycXVldChzdGF0c19vdXRwdXRfcGF0aCkKICAgIG11c2ljX3N0YXRzLnNob3coKQoKbWFpbigqKmhhbmRsZXJfYXJndW1lbnRzKQo=', 'original_handler': 
'main'}data={""id"":{""0"":1,""1"":2,""2"":3,""3"":4,""4"":5,""5"":6,""6"":7,""7"":8,""8"":9,""9"":10,""10"":11,""11"":12,""12"":13,""13"":14,""14"":15,""15"":16},""name"":{""0"":""Alice"",""1"":""Bob"",""2"":""Charlie"",""3"":""David"",""4"":""Eva"",""5"":""Frank"",""6"":""Grace"",""7"":""Henry"",""8"":""Ivy"",""9"":""Jack"",""10"":""Karen"",""11"":""Liam"",""12"":""Mia"",""13"":""Nora"",""14"":""Oliver"",""15"":""Ben""},""age"":{""0"":20,""1"":30,""2"":25,""3"":40,""4"":18,""5"":32,""6"":28,""7"":45,""8"":22,""9"":38,""10"":27,""11"":19,""12"":27,""13"":31,""14"":29,""15"":38},""favorite_music_type"":{""0"":""Pop"",""1"":""Rock"",""2"":""Pop"",""3"":""Classical"",""4"":""Pop"",""5"":""Rock"",""6"":""Pop"",""7"":""Classical"",""8"":""Pop"",""9"":""Classical"",""10"":""Pop"",""11"":""Pop"",""12"":""Rock"",""13"":""Rock"",""14"":""Pop"",""15"":""Pop""}}data_output_path=dbfs:///music_demo_1/music_data.parquetstats_output_path=dbfs:///music_demo_1/music_stats.parquet","databricks_runtime_task={'logs': '+-------------------+-----------------+-------+-------+\n|favorite_music_type| avg_age|min_age|max_age|\n+-------------------+-----------------+-------+-------+\n| Rock| 30.0| 27| 32|\n| Classical| 41.0| 38| 45|\n| Pop|25.11111111111111| 18| 38|\n+-------------------+-----------------+-------+-------+\n\n', 'logs_truncated': False, 'metadata': {'cleanup_duration': 0, 'creator_user_name': 'xingsheng_qian@mckinsey.com', 'end_time': 1701369103554, 'execution_duration': 51000, 'job_id': 52302675431099, 'number_in_job': 656206320040153, 'run_id': 656206320040153, 'run_name': 'mlrun_task_75a4cc70-4414-4293-8a39-e7eeabb4ba24', 'run_page_url': 'https://dbc-94c947ab-feb9.cloud.databricks.com/?o=4658245941722457#job/52302675431099/run/656206320040153', 'run_type': 'SUBMIT_RUN', 'setup_duration': 317000, 'start_time': 1701368734592, 'state': {'life_cycle_state': 'TERMINATED', 'result_state': 'SUCCESS', 'state_message': '', 'user_cancelled_or_timedout': False}, 'tasks': 
[{'attempt_number': 0, 'cleanup_duration': 0, 'cluster_instance': {'cluster_id': '1116-014038-st5lherj', 'spark_context_id': '3207010860226238303'}, 'end_time': 1701369103554, 'execution_duration': 51000, 'existing_cluster_id': '1116-014038-st5lherj', 'run_id': 656206320040153, 'setup_duration': 317000, 'spark_python_task': {'parameters': ['{""data"": ""{\\""id\\"":{\\""0\\"":1,\\""1\\"":2,\\""2\\"":3,\\""3\\"":4,\\""4\\"":5,\\""5\\"":6,\\""6\\"":7,\\""7\\"":8,\\""8\\"":9,\\""9\\"":10,\\""10\\"":11,\\""11\\"":12,\\""12\\"":13,\\""13\\"":14,\\""14\\"":15,\\""15\\"":16},\\""name\\"":{\\""0\\"":\\""Alice\\"",\\""1\\"":\\""Bob\\"",\\""2\\"":\\""Charlie\\"",\\""3\\"":\\""David\\"",\\""4\\"":\\""Eva\\"",\\""5\\"":\\""Frank\\"",\\""6\\"":\\""Grace\\"",\\""7\\"":\\""Henry\\"",\\""8\\"":\\""Ivy\\"",\\""9\\"":\\""Jack\\"",\\""10\\"":\\""Karen\\"",\\""11\\"":\\""Liam\\"",\\""12\\"":\\""Mia\\"",\\""13\\"":\\""Nora\\"",\\""14\\"":\\""Oliver\\"",\\""15\\"":\\""Ben\\""},\\""age\\"":{\\""0\\"":20,\\""1\\"":30,\\""2\\"":25,\\""3\\"":40,\\""4\\"":18,\\""5\\"":32,\\""6\\"":28,\\""7\\"":45,\\""8\\"":22,\\""9\\"":38,\\""10\\"":27,\\""11\\"":19,\\""12\\"":27,\\""13\\"":31,\\""14\\"":29,\\""15\\"":38},\\""favorite_music_type\\"":{\\""0\\"":\\""Pop\\"",\\""1\\"":\\""Rock\\"",\\""2\\"":\\""Pop\\"",\\""3\\"":\\""Classical\\"",\\""4\\"":\\""Pop\\"",\\""5\\"":\\""Rock\\"",\\""6\\"":\\""Pop\\"",\\""7\\"":\\""Classical\\"",\\""8\\"":\\""Pop\\"",\\""9\\"":\\""Classical\\"",\\""10\\"":\\""Pop\\"",\\""11\\"":\\""Pop\\"",\\""12\\"":\\""Rock\\"",\\""13\\"":\\""Rock\\"",\\""14\\"":\\""Pop\\"",\\""15\\"":\\""Pop\\""}}"", ""data_output_path"": ""dbfs:///music_demo_1/music_data.parquet"", ""stats_output_path"": ""dbfs:///music_demo_1/music_stats.parquet""}'], 'python_file': 'dbfs:/home/xingsheng_qian@mckinsey.com/mlrun_databricks_runtime/mlrun_task_75a4cc70-4414-4293-8a39-e7eeabb4ba24.py'}, 'start_time': 1701368734592, 'state': {'life_cycle_state': 'TERMINATED', 'result_state': 'SUCCESS', 
'state_message': '', 'user_cancelled_or_timedout': False}, 'task_key': 'mlrun_task_75a4cc70-4414-4293-8a39-e7eeabb4ba24'}]}}",





> 2023-11-30 18:31:58,115 [info] Run execution finished: {'status': 'completed', 'name': 'music-demo-main'}


In [11]:
# Stop here, reporting the actual state, if the run did not complete
assert run.status.state == "completed", f"unexpected run state: {run.status.state!r}"

### Check the Spark job result 

In [12]:
# Show the DBFS path that was passed to the job as stats_output_path
print(dbfs_stats_path)

dbfs:///music_demo_1/music_stats.parquet


### Use mlrun.datastore.sources.ParquetSource to access the parquet file stored in the Databricks cluster

In [13]:
# Same import as in the first cell; repeated so this cell is self-contained
from mlrun.datastore.sources import ParquetSource

# Point a parquet source at the stats file written by the Spark job
music_stats = ParquetSource(name="music_stats", path=dbfs_stats_path)
print(music_stats)

{'kind': 'parquet', 'name': 'music_stats', 'path': 'dbfs:///music_demo_1/music_stats.parquet'}


In [14]:
# Read the stats parquet back through the source and display the resulting frame
music_stats.to_dataframe()

Unnamed: 0,favorite_music_type,avg_age,min_age,max_age
0,Rock,30.0,27,32
1,Classical,41.0,38,45
2,Pop,25.111111,18,38


In [15]:
# The mock data contains three music types, so the stats must have exactly
# one row for each of them. Read the parquet once and check both facts.
stats_df = music_stats.to_dataframe()
expected_types = {"Pop", "Rock", "Classical"}
assert stats_df.shape[0] == len(expected_types), (
    f"expected {len(expected_types)} stats rows, got {stats_df.shape[0]}"
)
assert set(stats_df["favorite_music_type"]) == expected_types, (
    f"unexpected music types: {sorted(stats_df['favorite_music_type'])}"
)