# 5. Training 
 --------------------------------------------------------------------

Train a model using a snapshot of the data.

![Model deployment with streaming Real-time operational Pipeline](../../assets/images/model-deployment-with-streaming.png)

In [1]:
%run config.py

## Step 1: Create and Configure a New Project <a id="gs-step-project-create-n-config"></a>

### Create a New Project <a id="gs-create-project"></a>

In [2]:
from os import path, getenv
from mlrun import new_project

# Project name is "<PROJECT_NAME>-<V3IO_USERNAME>"; a part that is unset or empty is left out
project_name = '-'.join(
    part for part in (PROJECT_NAME, getenv('V3IO_USERNAME')) if part
)
# The project is created under ./conf, resolved to an absolute path
project_path = path.abspath('conf')
project = new_project(project_name, project_path, init_git=True)

print(f'Project path: {project_path}\nProject name: {project_name}')

Project path: /User/_code2/tutorials/demos/rapid-streaming-ml/conf
Project name: model-deployment-with-streaming-iguazio


### Configure MLRun <a id="gs-mlrun-config"></a>

In [3]:
from mlrun import run_local, NewTask, mlconf, import_function, mount_v3io

# Target location for storing pipeline artifacts (./artifacts as an absolute path)
artifact_path = path.abspath('artifacts')
# MLRun DB path or API service URL: keep the value that is already configured
# and fall back to 'http://mlrun-api:8080' only when it is empty
mlconf.dbpath = mlconf.dbpath or 'http://mlrun-api:8080'

print(f'Artifacts path: {artifact_path}\nMLRun DB path: {mlconf.dbpath}')

Artifacts path: /User/_code2/tutorials/demos/rapid-streaming-ml/artifacts
MLRun DB path: http://mlrun-api:8080


#### Import Nuclio <a id="gs-nuclio-import"></a>


In [4]:
import nuclio

In [5]:
# nuclio: start-code

In [6]:
# Base image that the function image is built from
%nuclio config spec.build.baseImage = "mlrun/ml-models"
# Function kind: "job"
%nuclio config kind = "job"
# Build-time command: install a pinned v3io-frames version into the image
%nuclio cmd -c pip install v3io-frames==0.6.18

%nuclio: setting spec.build.baseImage to 'mlrun/ml-models'
%nuclio: setting kind to 'job'


## Step 2: Take a Snapshot of the KV Feature Table <a id="gs-step-ingest-data"></a>


### Initialization <a id="gs-step-ingest-data-init"></a>


In [7]:
# nuclio: ignore
# File format used when the snapshot data set is logged
format = 'csv'
# Columns to read from the feature table: the label and socioeconomic index,
# then the sum/mean/count/var columns of the purchase, bet and win groups
columns = [
    'label', 'socioeconomic_idx',
    'purchase_sum', 'purchase_mean', 'purchase_count', 'purchase_var',
    'bet_sum', 'bet_mean', 'bet_count', 'bet_var',
    'win_sum', 'win_mean', 'win_count', 'win_var',
]
# Data container and KV feature-table path, as defined in config.py
table_path = FEATURE_TABLE_PATH
container = CONTAINER

### Define a Data-Snapshot Function <a id="gs-step-ingest-data-define-function"></a>


In [8]:
from os import path
import pandas as pd
import v3io_frames as v3f


# Take a snapshot of a KV table and log it as a data-set artifact
def snapshot_data(context, container, table_path, columns, format='csv'):
    """Read selected columns of a KV table and log them as a data-set artifact.

    :param context:    MLRun function context; its ``artifact_path``, ``logger``
                       and ``log_dataset`` members are used here
    :param container:  container name passed to the frames client
    :param table_path: path of the KV table to read
    :param columns:    names of the columns to read
    :param format:     file format of the logged data set (default: ``'csv'``)
    """
    frames = v3f.Client("framesd:8081", container=container)
    # Run the KV backend's "infer" command on the table before reading from it
    frames.execute(backend="kv", table=table_path, command="infer")
    snapshot = frames.read('kv', table_path, columns=columns)

    # The data set is stored in a 'data' directory under the run's artifact path
    destination = path.join(context.artifact_path, 'data')
    context.logger.info('Saving snapshot data set to {} ...'.format(destination))

    # Log the frame, without its index, as the 'snapshot_dataset' artifact
    context.log_dataset('snapshot_dataset', df=snapshot, format=format,
                        index=False, artifact_path=destination)

In [9]:
# nuclio: end-code

<a id='gs-run-ingest-func-local'></a>
#### Run the Function Locally in Jupyter Notebook

In [10]:
# Run the snapshot handler locally, from within the Jupyter kernel
snapshot_data_run = run_local(
    name='snapshot_data',
    handler=snapshot_data,
    project=project_name,
    artifact_path=artifact_path,
    params={
        'container': container,
        'table_path': table_path,
        'columns': columns,
        'format': format,
    },
)

[mlrun] 2020-08-04 22:02:28,956 starting run snapshot_data uid=5160ef50841842658552e618638d5364  -> http://mlrun-api:8080
[mlrun] 2020-08-04 22:02:29,073 Saving snapshot data set to /User/_code2/tutorials/demos/rapid-streaming-ml/artifacts/data ...



project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
model-deployment-with-streaming-iguazio,...638d5364,0,Aug 04 22:02:29,completed,snapshot_data,v3io_user=iguaziokind=handlerowner=iguaziohost=jupyter-8457c6d465-zld5w,,"container=userstable_path=/iguazio/examples/model-deployment-with-streaming/feature-tablecolumns=['label', 'socioeconomic_idx', 'purchase_sum', 'purchase_mean', 'purchase_count', 'purchase_var', 'bet_sum', 'bet_mean', 'bet_count', 'bet_var', 'win_sum', 'win_mean', 'win_count', 'win_var']format=csv",,snapshot_dataset


to track results use .show() or .logs() or in CLI: 
!mlrun get run 5160ef50841842658552e618638d5364 --project model-deployment-with-streaming-iguazio , !mlrun logs 5160ef50841842658552e618638d5364 --project model-deployment-with-streaming-iguazio
[mlrun] 2020-08-04 22:02:29,314 run executed, status=completed


#### Run the Function on a Cluster <a id="gs-run-ingest-func-cluster"></a>


In [11]:
from mlrun import code_to_function

# Convert the notebook code into an MLRun function object named 'snapshot-data'
# (presumably the code between the `# nuclio: start-code` and `# nuclio: end-code`
# markers - no file name is passed here)
gen_func = code_to_function(name='snapshot-data')
# Register the function in the project, then fetch it back by name and apply the
# v3io mount modifier to it
project.set_function(gen_func)
snapshot_data_func = project.func('snapshot-data').apply(mount_v3io())

In [12]:
# Build the function's container image; the recorded log shows a remote build
# on top of mlrun/ml-models that runs the pinned v3io-frames `pip install`
snapshot_data_func.deploy()

[mlrun] 2020-08-04 22:02:32,564 starting remote build, image: .mlrun/func-model-deployment-with-streaming-iguazio-snapshot-data-latest
[36mINFO[0m[0000] Resolved base name mlrun/ml-models:0.4.10 to mlrun/ml-models:0.4.10 
[36mINFO[0m[0000] Resolved base name mlrun/ml-models:0.4.10 to mlrun/ml-models:0.4.10 
[36mINFO[0m[0000] Retrieving image manifest mlrun/ml-models:0.4.10 
[36mINFO[0m[0000] Retrieving image manifest mlrun/ml-models:0.4.10 
[36mINFO[0m[0000] Built cross stage deps: map[]                
[36mINFO[0m[0000] Retrieving image manifest mlrun/ml-models:0.4.10 
[36mINFO[0m[0001] Retrieving image manifest mlrun/ml-models:0.4.10 
[36mINFO[0m[0001] Unpacking rootfs as cmd RUN pip install v3io-frames==0.6.18 requires it. 
[36mINFO[0m[0099] Taking snapshot of full filesystem...        
[36mINFO[0m[0123] Resolving paths                              
[36mINFO[0m[0145] RUN pip install v3io-frames==0.6.18          
[36mINFO[0m[0145] cmd: /bin/sh                 

True

##### Run the Function on the Cluster <a id="gs-run-ingest-func-on-the-cluster-run-function"></a>


In [13]:
# Run the deployed function as a cluster job, with the same parameters as the local run.
# NOTE(review): the output recorded for this cell ends with a RunError raised by the
# KV "infer" command (mixed string/int values in column 'user_id'); the cells that
# follow read this run's outputs, so the error must be resolved before running them.
snapshot_data_run = snapshot_data_func.run(
    name='snapshot_data',
    handler='snapshot_data',
    artifact_path=artifact_path,
    params={
        'container': container,
        'table_path': table_path,
        'columns': columns,
        'format': format,
    },
)

[mlrun] 2020-08-04 22:06:08,151 starting run snapshot_data uid=bbd91a9111ca498f8a72a4b656447837  -> http://mlrun-api:8080
[mlrun] 2020-08-04 22:06:08,344 Job is running in the background, pod: snapshot-data-xc2l5
[mlrun] 2020-08-04 22:07:21,915 Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/site-packages/v3io_frames/grpc.py", line 49, in wrapper
    return fn(*args, **kw)
  File "/opt/conda/lib/python3.7/site-packages/v3io_frames/grpc.py", line 208, in _execute
    resp = stub.Exec(request)
  File "/opt/conda/lib/python3.7/site-packages/grpc/_interceptor.py", line 221, in __call__
    compression=compression)
  File "/opt/conda/lib/python3.7/site-packages/grpc/_interceptor.py", line 257, in _with_call
    return call.result(), call
  File "/opt/conda/lib/python3.7/site-packages/grpc/_channel.py", line 333, in result
    raise self
  File "/opt/conda/lib/python3.7/site-packages/grpc/_interceptor.py", line 247, in continuation
    compression=new_compression)
  File 

project,uid,iter,start,state,name,labels,inputs,parameters,results,artifacts
model-deployment-with-streaming-iguazio,...56447837,0,Aug 04 22:07:21,error,snapshot_data,v3io_user=iguaziokind=jobowner=iguaziohost=snapshot-data-xc2l5,,"container=userstable_path=/iguazio/examples/model-deployment-with-streaming/feature-tablecolumns=['label', 'socioeconomic_idx', 'purchase_sum', 'purchase_mean', 'purchase_count', 'purchase_var', 'bet_sum', 'bet_mean', 'bet_count', 'bet_var', 'win_sum', 'win_mean', 'win_count', 'win_var']format=csv",,


to track results use .show() or .logs() or in CLI: 
!mlrun get run bbd91a9111ca498f8a72a4b656447837 --project model-deployment-with-streaming-iguazio , !mlrun logs bbd91a9111ca498f8a72a4b656447837 --project model-deployment-with-streaming-iguazio
[mlrun] 2020-08-04 22:07:27,881 run executed, status=error
runtime error: error in _execute: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.UNKNOWN
	details = "can't exec: type 'string' of value '042abfee-c9bd-48b6-9eea-efd08fbffd3d' doesn't match type 'int' of value '111111' for column 'user_id'."
	debug_error_string = "{"created":"@1596578841.914158832","description":"Error received from peer ipv4:10.197.43.157:8081","file":"src/core/lib/surface/call.cc","file_line":1055,"grpc_message":"can't exec: type 'string' of value '042abfee-c9bd-48b6-9eea-efd08fbffd3d' doesn't match type 'int' of value '111111' for column 'user_id'.","grpc_status":2}"
>


RunError: error in _execute: <_InactiveRpcError of RPC that terminated with:
	status = StatusCode.UNKNOWN
	details = "can't exec: type 'string' of value '042abfee-c9bd-48b6-9eea-efd08fbffd3d' doesn't match type 'int' of value '111111' for column 'user_id'."
	debug_error_string = "{"created":"@1596578841.914158832","description":"Error received from peer ipv4:10.197.43.157:8081","file":"src/core/lib/surface/call.cc","file_line":1055,"grpc_message":"can't exec: type 'string' of value '042abfee-c9bd-48b6-9eea-efd08fbffd3d' doesn't match type 'int' of value '111111' for column 'user_id'.","grpc_status":2}"
>

## Step 3: Explore the Data <a id="gs-step-explore-data"></a>


### Explore the Data with Pandas DataFrames <a id="gs-explore-data-w-pandas-df"></a>


Run the following code to use pandas DataFrames to read your data set, extract some basic statistics, and display them.

In [None]:
# Read the 'snapshot_dataset' output of the snapshot run into a DataFrame
from mlrun.run import get_dataitem
df = get_dataitem(snapshot_data_run.outputs['snapshot_dataset']).as_df()

In [None]:
# Display the first rows of the data set
df.head()

In [None]:
# Display the number of rows in the data set
print(f'Total number of rows: {len(df)}')

In [None]:
# Display descriptive statistics of every column, grouped by the 'label' column
df.groupby(['label']).describe()

### Explore the Data with an MLRun Marketplace Function <a id="gs-explore-data-w-mlrun-func"></a>


#### Add an Exploration Function <a id="gs-explore-data-w-mlrun-func-add-func"></a>


In [None]:
# Register the 'hub://describe' function in the project under the name 'describe',
# then fetch it by name and apply the v3io mount modifier to it
project.set_function('hub://describe', 'describe')
describe = project.func('describe').apply(mount_v3io())

#### Run the Exploration Function <a id="gs-explore-data-w-mlrun-func-run-func"></a>


In [None]:
# Run the describe function on the snapshot data set, with 'label' as the label column
describe_run = describe.run(params={'label_column': 'label'},
                            inputs={"table":
                                    snapshot_data_run.outputs['snapshot_dataset']},
                            artifact_path=artifact_path)

#### Review the Run Output <a id="gs-explore-data-w-mlrun-func-run-output-review"></a>


In [None]:
from IPython.display import display, HTML

# Display the `histograms` artifact.
# The artifact is passed as `filename=`, like the `imbalance` and `correlation`
# artifacts, so the file is always loaded from disk and a missing file raises an
# error instead of the path itself being rendered as HTML text.
display(HTML('<h3>Histograms</h3>'),
        HTML(filename=describe_run.outputs['histograms']))

In [None]:
# Display the `imbalance` artifact
# (the heading is closed with </h3>; a second opening <h3> would leave it unclosed)
display(HTML('<h3>Imbalance</h3>'),
        HTML(filename=describe_run.outputs['imbalance']))

In [None]:
# Display the `correlation` artifact
# (the heading is closed with </h3>; a second opening <h3> would leave it unclosed)
display(HTML('<h3>Correlation Matrix</h3>'),
        HTML(filename=describe_run.outputs['correlation']))

## Step 4: Run Model Training at Scale <a id="gs-step-run-model-training"></a>


#### Add a Training Function <a id="gs-train-model-add-func"></a>


In [None]:
# Register the 'hub://sklearn_classifier' function in the project under the name
# 'train', then fetch it by name and apply the v3io mount modifier to it
project.set_function('hub://sklearn_classifier', 'train')
train = project.func('train').apply(mount_v3io())

#### Run a Model-Training Task <a id="gs-train-model-run-task"></a>


In [None]:
# Configure the models to train.
# The order matters: the review cell maps the run's `best_iteration` output back
# to an entry of this list by position.
models = ["sklearn.ensemble.RandomForestClassifier", 
          "sklearn.linear_model.LogisticRegression",
          "sklearn.ensemble.AdaBoostClassifier"]

In [None]:
# Create a training task that takes the snapshot data set as its input
# (test_size=0.10 presumably holds out 10% of the rows for testing - confirm
# against the sklearn_classifier function; the meaning of sample=-1 is defined there too)
train_task = NewTask(name="train",
                     params={"sample": -1,
                             "label_column": "label",
                             "test_size": 0.10},
                     inputs={"dataset": snapshot_data_run.outputs['snapshot_dataset']})

# Run the training task with `model_pkg_class` as a hyper-parameter that takes each
# entry of `models`, using the 'max.accuracy' selector to pick the reported iteration
train_run = train.run(train_task.with_hyper_params({'model_pkg_class': models},
                                                   selector='max.accuracy'),
                      artifact_path=artifact_path)

### Review the Run Output <a id="gs-train-model-run-output-review"></a>


In [None]:
# Display the name of the selected model
# (`best_iteration` is shifted by one to index `models`, i.e. iteration numbers
# are assumed to start at 1 - confirm against the MLRun version in use)
display(HTML(f'<b>Best model:</b> '
             f'{models[train_run.outputs["best_iteration"]-1]}'))

# Display the accuracy for the optimal run iteration
display(HTML(f'<b>Accuracy:</b> {train_run.outputs["accuracy"]}'))

# Display HTML output artifacts (loaded from the files that the run outputs point to)
display(HTML('<h3>Confusion Matrix:</h3>'),
        HTML(filename=train_run.outputs['confusion-matrix']))
display(HTML('<h3>ROC Curve:</h3>'), HTML(filename=train_run.outputs['roc-binary']))

## Step 5: Test Your Model <a id="gs-step-test-model"></a>


In [None]:
# Register the 'hub://test_classifier' function in the project under the name
# 'test', then fetch it by name and apply the v3io mount modifier to it
project.set_function('hub://test_classifier', 'test')
test = project.func('test').apply(mount_v3io())

#### Run a Model-Testing Task <a id="gs-test-model-run-task"></a>


In [None]:
# Test the model produced by the training run against that run's `test_set` output
test_task = NewTask(
    name="test",
    inputs={
        "models_path": train_run.outputs['model'],
        "test_set": train_run.outputs['test_set'],
    },
    params={
        "label_column": "label",
        "plots_dest": path.join("plots", "test"),
    },
)
test_run = test.run(test_task, artifact_path=artifact_path)

### Review the Run Output <a id="gs-test-model-run-output-review"></a>


In [None]:
# Display the accuracy reported by the test run
display(HTML(f'<b>Test Accuracy:</b> {test_run.outputs["accuracy"]}'))

# Display HTML output artifacts (loaded from the files that the run outputs point to)
display(HTML("<h3>Confusion matrix:</h3>"),
        HTML(filename=test_run.outputs['confusion-matrix']))
display(HTML("<h3>ROC Curve:</h3>"), HTML(filename=test_run.outputs['roc-binary']))

## Step 6: Save the Project <a id="gs-step-create-n-run-ml-pipeline"></a>


In [None]:
# Save the project definition to project.yaml inside the project directory
project.save(path.join(project_path, 'project.yaml'))