# Run a notebook on databricks via runs submit

This notebook runs a notebook on Azure Databricks.

Potential issue: the Databricks notebook will need a service principal to authenticate to Azure Machine Learning (AML), so that it can log in to the AML workspace and track the model.

# So, do this via a pipeline...

Based on the example notebook [here](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-use-databricks-as-compute-target.ipynb).

In [None]:
import os
import azureml.core
from azureml.core.runconfig import JarLibrary
from azureml.core.compute import ComputeTarget, DatabricksCompute
from azureml.exceptions import ComputeTargetException
from azureml.core import Workspace, Experiment
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.pipeline.steps import DatabricksStep
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference

# Check core SDK version number; printing it records which azureml-core
# release this notebook was run against.
print("SDK version:", azureml.core.VERSION)

In [None]:
# Reference kept from the original author (presumably where the workspace
# config.json used below is hosted -- TODO confirm):
# http://eastus.azuredatabricks.net/files/top10/aml_config/config.json?o=4604276322347170
ws = Workspace.from_config()

# Show the key workspace identifiers, one per line.
print(
    ws.name,
    ws.resource_group,
    ws.location,
    ws.subscription_id,
    sep='\n',
)

In [None]:
# Databricks connection settings are taken from environment variables.
# Set these variables (or replace the values) before running; each one
# falls back to an empty string when its variable is not defined.

# Name under which the Databricks compute is attached to the AML workspace.
db_compute_name = os.environ.get("DATABRICKS_COMPUTE_NAME", "")
# Resource group that contains the Databricks workspace.
db_resource_group = os.environ.get("DATABRICKS_RESOURCE_GROUP", "")
# Name of the Databricks workspace.
db_workspace_name = os.environ.get("DATABRICKS_WORKSPACE_NAME", "")
# Databricks access token (a secret -- do not print or commit it).
db_access_token = os.environ.get("DATABRICKS_ACCESS_TOKEN", "")
 

In [None]:
# Reuse the Databricks compute target if one with this name is already
# attached to the workspace; otherwise attach it using the settings above.
try:
    databricks_compute = DatabricksCompute(workspace=ws, name=db_compute_name)
    print('Compute target {} already exists'.format(db_compute_name))
except ComputeTargetException:
    print('Compute not found, will use below parameters to attach new one')
    print('db_compute_name {}'.format(db_compute_name))
    print('db_resource_group {}'.format(db_resource_group))
    print('db_workspace_name {}'.format(db_workspace_name))
    # Never echo the token itself: notebook output is saved with the file
    # and can end up in source control or shared logs. Only report whether
    # a token was provided.
    print('db_access_token {}'.format('<set>' if db_access_token else '<not set>'))
    config = DatabricksCompute.attach_configuration(
        resource_group=db_resource_group,
        workspace_name=db_workspace_name,
        access_token=db_access_token)
    databricks_compute = ComputeTarget.attach(ws, db_compute_name, config)
    # Block (with output shown) until the attach operation has finished.
    databricks_compute.wait_for_completion(True)

In [None]:
# Look up the workspace's default blob storage by its datastore name.
def_blob_store = Datastore(workspace=ws, name="workspaceblobstore")
print('Datastore {} will be used'.format(def_blob_store.name))

# Upload the sample file from the local directory into the "dbtest" folder
# of the datastore so it can be used as the pipeline's data source.
def_blob_store.upload_files(
    files=["./testdata.txt"],
    target_path="dbtest",
    overwrite=False,
)

# Step input: the uploaded "dbtest" folder, referenced under the name "input".
step_1_input = DataReference(
    datastore=def_blob_store,
    path_on_datastore="dbtest",
    data_reference_name="input",
)

# Step output: pipeline data named "output", stored on the same datastore.
step_1_output = PipelineData(name="output", datastore=def_blob_store)

In [None]:
# Show the current working directory (displayed as the cell's output).
os.getcwd()

In [None]:
# IPython magic: list the contents of the aml_config directory.
%ls aml_config

In [None]:
from azureml.core.runconfig import RunConfiguration

# Start from a run configuration constructed with no arguments.
runconfig = RunConfiguration()

In [None]:
# RunConfiguration.load is a static factory (per the Azure ML SDK
# reference) that returns the loaded configuration rather than updating
# the instance it is called on. Bind the result so that `runconfig`
# actually holds the saved 'top10_adb' configuration from this directory.
runconfig = RunConfiguration.load(path='.', name='top10_adb')
# Echo the loaded configuration as the cell output, as the bare call did.
runconfig

In [None]:
# IPython help syntax: display the documentation for runconfig.load.
?runconfig.load

In [None]:
# Look up the workspace's default blob storage by its datastore name.
def_blob_store = Datastore(workspace=ws, name="workspaceblobstore")
print('Datastore {} will be used'.format(def_blob_store.name))

# Upload the sample file from the local directory into the "dbtest" folder
# of the datastore so it can be used as the pipeline's data source.
def_blob_store.upload_files(
    files=["./testdata.txt"],
    target_path="dbtest",
    overwrite=False,
)

In [None]:
# Step input: the "dbtest" folder on the datastore, referenced as "input".
step_1_input = DataReference(
    datastore=def_blob_store,
    path_on_datastore="dbtest",
    data_reference_name="input",
)

# Step output: pipeline data named "output", stored on the same datastore.
step_1_output = PipelineData(name="output", datastore=def_blob_store)

In [None]:
# Databricks notebook path; DATABRICKS_NOTEBOOK_PATH overrides the default.
notebook_path = os.environ.get(
    "DATABRICKS_NOTEBOOK_PATH",
    "/Users/jeremr@microsoft.com/sample-notebook-for-pipeline",
)

# Pipeline step that runs the notebook on the attached Databricks compute.
dbNbStep = DatabricksStep(
    name="DBNotebookInWS",
    run_name='DB_Notebook_demo',
    # What to run and with which parameters.
    notebook_path=notebook_path,
    notebook_params={'myparam': 'testparam'},
    # Data wired into and out of the step.
    inputs=[step_1_input],
    outputs=[step_1_output],
    # Where to run it and how the cluster is sized.
    compute_target=databricks_compute,
    spark_version="4.3.x-scala2.11",
    num_workers=1,
    # Always re-run the step instead of reusing a previous run's results.
    allow_reuse=False,
)
# TODO: a run configuration still needs to be added here to supply libraries.

In [None]:
# Build a single-step pipeline and submit it under the 'DB_Notebook_demo'
# experiment.
steps = [dbNbStep]
pipeline = Pipeline(workspace=ws, steps=steps)
pipeline_run = Experiment(workspace=ws, name='DB_Notebook_demo').submit(pipeline)

# Block until the pipeline run has finished.
pipeline_run.wait_for_completion()

In [None]:
# Point at the rescoring notebook; DATABRICKS_NOTEBOOK_PATH overrides the default.
notebook_path = os.environ.get(
    "DATABRICKS_NOTEBOOK_PATH",
    "/Users/jeremr@microsoft.com/parallel_top10/rescore_top10",
)
# Library descriptors used to install dependencies on the Databricks cluster.
from azureml.core.runconfig import JarLibrary, PyPiLibrary

In [None]:
## This works, but still requires interactive authentication.

# Pipeline step that runs the rescoring notebook. Unlike the demo step, no
# inputs, outputs or notebook_params are passed.
dbNbStep = DatabricksStep(
    name="DBNotebookInWS",
    run_name='rescore_top10',
    notebook_path=notebook_path,
    # Where to run it and how the cluster is sized.
    compute_target=databricks_compute,
    spark_version="4.3.x-scala2.11",
    num_workers=8,
    # Libraries installed on the cluster for this run.
    pypi_libraries=[
        PyPiLibrary(package="azureml-sdk[databricks]", repo=None),
    ],
    jar_libraries=[
        JarLibrary(library='dbfs:/FileStore/jars/azure-cosmosdb-spark_2.3.0_2.11-1.2.2-uber.jar'),
    ],
    # Always re-run the step instead of reusing a previous run's results.
    allow_reuse=False,
)

In [None]:
# Build a single-step pipeline and submit it under the 'rescore_top10'
# experiment.
steps = [dbNbStep]
pipeline = Pipeline(workspace=ws, steps=steps)
pipeline_run = Experiment(workspace=ws, name='rescore_top10').submit(pipeline)

# Block until the pipeline run has finished.
pipeline_run.wait_for_completion()