# Predict Flow

<h2 id="tocheading">Table of Contents</h2>
<div id="toc"></div>

In [19]:
%%javascript
// Generate the Table of Contents from the notebook headers by loading an external script.
// Re-execute this cell at the very beginning and whenever the document structure changes.
// NOTE(review): this fetches and runs a third-party script at runtime - confirm the URL is trusted.
$.getScript('https://kmahelona.github.io/ipython_notebook_goodies/ipython_notebook_toc.js')

<IPython.core.display.Javascript object>

## Parameters

In [20]:
# Parameter: base URL of the MLFactory REST API.
# The %env magic sets this environment variable for the kernel process;
# presumably the MLFactory SDK reads it when the API client is created - TODO confirm.
%env MLFACTORY_REST_API_URL=http://dl1.aureacentral.com:11764

env: MLFACTORY_REST_API_URL=http://dl1.aureacentral.com:11764


In [21]:
# Parameter
# The problem is created in the UI; its ID is passed in here and is used
# further down to load the problem and its experiment.
problem_id = 200136

In [22]:
# Parameter
# The experiment is pre-created in the UI; its ID is passed in here and is used
# further down to load and reuse that experiment.
experiment_id = 650

## Import Libraries

In [23]:
# Install MLFactory SDK
!pip install mlfactory_sdk --upgrade --extra-index https://pypi.swarm.devfactory.com > /dev/null

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/amazonei_tensorflow_p36/bin/python -m pip install --upgrade pip' command.[0m


In [None]:
# WARNING: This cell may take 15-20 mins to finish. If you don'y need Pandas Profiler data analysis, just skip this cell,
# as well as `a. Automatic Data Exploration` section

# System cell
# Sophisticated way to import pandas profiler into the Notebook. Work with kernel = 'conda_amazonei_tensorflow_p36
try:
    import pandas_profiling
except:
    !sudo /home/ec2-user/anaconda3/bin/conda update -n amazonei_tensorflow_p36 --all -y
    !sudo /home/ec2-user/anaconda3/bin/conda install -c conda-forge -n amazonei_tensorflow_p36 pandas-profiling imagehash -y
    !sudo /home/ec2-user/anaconda3/bin/conda update -n amazonei_tensorflow_p36 ipywidgets -y
finally:
    import pandas_profiling

from pandas_profiling import ProfileReport

Collecting package metadata (current_repodata.json): done
Solving environment: \ 
The environment is inconsistent, please check the package plan carefully
The following packages are causing the inconsistency:

  - defaults/linux-64::pandas==1.0.1=py36h0573a6f_0
  - conda-forge/linux-64::keras==2.3.1=py36_0
  - defaults/noarch::jupyterlab==1.2.6=pyhf63ae98_0
  - defaults/linux-64::scikit-learn==0.22.1=py36hd81dba3_0
  - defaults/linux-64::python-language-server==0.31.7=py36_0
  - defaults/linux-64::bkcharts==0.2=py36_0
  - defaults/linux-64::nb_conda==2.2.1=py36_0
  - defaults/noarch::numpydoc==0.9.2=py_0
  - defaults/linux-64::pytest-arraydiff==0.3=py36h39e3cac_0
  - defaults/linux-64::bottleneck==1.3.2=py36heb32a55_0
  - defaults/noarch::sphinx==2.4.0=py_0
  - defaults/linux-64::pywavelets==1.1.1=py36h7b6447c_0
  - defaults/noarch::pytest-astropy==0.8.0=py_0
  - defaults/linux-64::numexpr==2.7.1=py36h423224d_0
  - defaults/noarch::anaconda-project==0.8.4=py_0
  - conda-forge/linux-64:

In [None]:
# System cell
# Import required and common libs
import json
import time
import boto3
import os
import pandas as pd
import numpy as np
import datetime
import pyarrow
from itables import show


In [None]:
# Install SHAP into the active conda environment, then import it.
# The installer output is shown because the redirect at the end of the command is commented out.
!conda install -y -c defaults -c conda-forge shap # > /dev/null
import shap
# Load SHAP's JavaScript visualisation code into the notebook
shap.initjs()

In [None]:
# System cell
# Import all required MLFactory classes and initialise the MLFactory API client
from mlfactory.sdk.restclient.mlfactory_api import MLFactoryApi
from mlfactory.sdk.common import Util
from mlfactory.sdk.problem.base import Problem
from mlfactory.sdk.tf.transformations import Tfs
from mlfactory.sdk.common import ExperimentLoader
from mlfactory.sdk.tf.tf_execution import TfExecution


# API client, constructed without arguments
# (presumably configured through the MLFACTORY_REST_API_URL environment variable - TODO confirm)
mlf_api = MLFactoryApi()

In [None]:
# Predict-specific imports
import hiplot as hip
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from matplotlib.pyplot import xticks
%matplotlib inline
from sagemaker import get_execution_role
# Execution role returned by the SageMaker SDK for the current notebook session
role = get_execution_role()
from mlfactory.sdk.predict.predict_config import PredictConfiguration
from mlfactory.sdk.aws_autopilot.aws_autopilot_config import AWSAutopilotConfiguration
from mlfactory.sdk.afe.afe_config import AFEConfiguration
from mlfactory.sdk.explainability.explainability_config import ExplainabilityConfiguration
from mlfactory.sdk.explainability.explainability_controller import ExplainabilityController

## Define Problem

In [None]:
# TODO: output the problem JSON as a table
# Load the problem by its ID; the bare `problem` expression shows it as the cell output
problem = Problem.load(problem_id)
problem

## Select Target

In [None]:
# Get the list of all tables in this problem. It is kept under its own name so that the
# selection below does not discard it.
available_tables = problem.table_names()
# Tables used by this flow (expected to be present in `available_tables`)
tables = ['train_csv']


In [None]:
# Column to predict
# To list the columns of a table, use: problem.columns(<table_name>)
target_column = 'Survived'

## Data Exploration

Here is the place to explore & visualize your data

In [None]:
# Table names under your problem (shown as the cell output)
problem.table_names()

#### a. Automatic Data Exploration

Visualize and analyze your data automatically with Pandas Profiling

In [None]:
# Set to the name of the table you want to explore
# Generally, you'd like to see your main (target time series) dataset here
explore_table_name = "train_csv"

In [None]:
# Load the exploration table into a DataFrame and preview its first rows
df_explore = problem.read_dataframe_from_table(explore_table_name)
df_explore.head()

In [None]:
# System Cell
# Automatic data exploration: build a profiling report for the exploration DataFrame
# and render it as notebook widgets.
# Requires `ProfileReport`, which is imported in the optional pandas_profiling cell.
profile = ProfileReport(df_explore, title=f"Exploration report for {explore_table_name}", explorative=True)
profile.to_widgets()

#### b. Automatic Anomaly Detection

Detect outliers in your data using MLFactory AnomalyDetection transformation

[Read details](https://docs.google.com/document/d/1xyV_paZdy3vW9S954korzOqmZVDX56Yavu2OA0zKT_Y/edit?usp=sharing)

In [None]:
# Run this cell if you want to apply automatic anomaly detection (skip it otherwise).

# Configure the AnomalyDetection transformation for the exploration table and start it
anomaly_detector = Tfs.AnomalyDetection()
anomaly_detector.problem_id = problem.id()
anomaly_detector.table = explore_table_name
anomaly_detector.run()

In [None]:
# Run this cell if you applied automatic anomaly detection above.

# Fetch the current execution status of the anomaly detection transformation
status = anomaly_detector.tf_execution.refresh_status()
if status.is_done():
    # The transformed data is stored in a table named after the transformation
    df_explore = problem.read_dataframe_from_table(anomaly_detector.name)
    # Jupyter only renders the last top-level expression of a cell; an expression nested in an
    # `if` block is silently discarded, so the preview has to be displayed explicitly.
    display(df_explore.head())
else:
    print(f"Anomalies detection is still in progress, please wait. Current status is {status}")


#### c. Custom Exploration

In [None]:
# Feel free to explore more data if you want!

### Data Transformations

### In-Memory and Server-Side Transformations Guide

You have two options to transform your data:
 1. Use `problem.read_dataframe_from_table(<table_name>)` to load all your data in memory into a good old pandas DataFrame.
  Apply all the transformations you need, and save the data using `problem.write_dataframe_into_table(<df>, <table_name>)`.
  That suits perfectly if you have a small to medium-sized dataset which fits in RAM (up to several GB).
 2. Use server-side MLFactory SDK transformations. They run on our backend and take more time (usually minutes),
  but can handle, and are intended for, huge datasets.

Please find examples below

Transformations have been divided into 3 sections:
1. Transformations before Train Test Split
2. Train Test Split
3. Transformations after Train Test Split

#### Server-side transformation example

#### 1. Choose and configure

In [None]:
# a. To list all available server-side transformations, invoke "all()"

# Tfs.all()

In [None]:
# b. Choose one, e.g. `SelectColumns`

# tf = Tfs.SelectColumns()

In [None]:
# c. To get help on a transformation (as well as almost any SDK class or function), invoke `?`,
# or put cursor on variable or function name and Shift+Tab

# tf?

In [None]:
# d. Configure the transformation
# Name of the transformation is also used as the name of output table you will find transformed data in
# By default, the name is assigned to the name of transformation itself, in snake case


# tf.name = "select_best_columns"

# tf.table = "raw_data_csv"

# tf.keys = ["item_id"]

# tf.problem_id = problem.id()

#### 2. Save and execute

In [None]:
# tf_execution = tf.run()

#### 3. Check status

In [None]:
# If the notebook went offline while you were waiting, you can always load the execution object from the DB
# tf_execution = TfExecution.instance(problem_id, transformation_id)

# Check the transformation execution status
# tf_execution.refresh_status()

Now you should wait for (usually) several minutes. You can use the flags below to check whether the execution is complete and whether it was successful.

In [None]:
# tf_execution.refresh_status().is_done()
# tf_execution.refresh_status().is_successful()

#### Classic (Pandas) transformation example

#### 1. Read Data

In [None]:
# All table names for your problem (shown as the cell output)
problem.table_names()

In [None]:
# Read data into a pandas DataFrame
# Replace the placeholder string with the name of the table you want to load
df = problem.read_dataframe_from_table("please specify table name")
df.head()

#### 2. Change Data

In [None]:
# Do changes using pandas
# ...

#### 3. Save data back to the table

In [None]:
# You can check available parameters running the line below
# problem.write_dataframe_into_table?

In [None]:
# Write data
# problem.write_dataframe_into_table(df, table_name=<new table name>)

### Transform Data to Required Format

### The Last Transformation

#### 1. If you use server-side transformations

In [None]:
# Select transformation class from Tfs.all() list, and replace <SELECT CLASS> with class name
tf_final = Tfs.<SELECT CLASS>()
# Last transformation in the chain should have the predefined name = "target_time_series"
# That's required by our backend: we don't specify time series table name explicitly but rather expect data to be
# present in a table named "target_time_series"
tf_final.name = "target_time_series"
tf_final.partitions = 1
tf_final.problem_id = problem.id

In [None]:
# System cell
# Long running

# Run the final transformation and keep the returned execution object so that its status
# can be checked in the next cell
tf_execution = tf_final.run()

In [None]:
# If the notebook went offline while you were waiting, you can always load the execution object from the DB
# tf_execution = TfExecution.instance(problem_id, transformation_id)

# Wait until `is_done` is True
tf_execution.refresh_status()

#### 2. Or, If you use in-memory pandas transformations

In [None]:
# Read data
# NOTE(review): 't1_problem_120' looks like a leftover example table name - replace it with your own table
df = problem.read_dataframe_from_table('t1_problem_120')
df.head()
# Do changes using pandas
# ...
# Write data
# problem.write_dataframe_into_table(df, "target_time_series")

#### Transformations BEFORE Train Test Split

Usually Transformations which add features or modify existing features into new ones can be applied before Train Test Split.

For example:
- Absolute - Computes the absolute value of a number.
- SelectColumns - Selects specified columns of interest from the table.

Can be applied BEFORE the Train Test Split.

In [None]:
# Run transformations BEFORE splitting the data into train test
# Use the examples as shown above to run the transformations

#### Train Test Split

In [None]:
# # Uncomment and run this cell to apply train validation test split

# train_val_test_splitter = Tfs.Split()
# train_val_test_splitter.name = ""
# train_val_test_splitter.ignoreHeader = True
# train_val_test_splitter.table = ""
# train_val_test_splitter.p = [0.65, 0.15, 0.20]
# train_val_test_splitter.problem_id = problem.id()
# train_val_test_splitter.run()

#### Transformations AFTER Train Test Split

Some transformations should be applied separately after the Train Test Split, as otherwise they can lead to data leakage.

These usually include Encoder, Scaler, Normalizer Transformations.

For example:
- OneHotEncode - Transforms the chosen columns into one-hot representations.
- AnomalyDetection - Adds is_outlier column where 1 defines an anomaly

Should be applied AFTER the Train Test Split.

In [None]:
# Run transformations AFTER splitting the data into train test
# Use the examples as shown above to run the transformations

## Create Experiment

In [None]:
# System cell
# Load the pre-created experiment, identified by the problem and experiment IDs set in the
# Parameters section, as a Predict configuration
experiment = PredictConfiguration.load(problem_id=problem_id, experiment_id=experiment_id)

In [None]:
# Show the loaded experiment configuration as the cell output
experiment

#### Required Fields (*)

Those fields are mandatory, and require you to set values

You may refer to [the XGBoost Documentation](https://github.com/dmlc/xgboost/blob/master/doc/parameter.rst#learning-task-parameters) for all possible `learning_objective`s and `comparison_metric`s.

In [None]:
# Please give your experiment a meaningful name
# It should be unique in the scope of the problem
experiment.name = "Demo"
experiment.description = "demo"

# Specify the datasets for the experiment
# Format is : <train_test_split_transformation_name>_problem_<problem_id>_<i>
# i=0 for train data, i=1 for valid data and i=2 for test data
# NOTE(review): all three URIs are left empty here - presumably they must be filled in before training

experiment.training_data_s3_uri = ""
experiment.validation_data_s3_uri = ""
experiment.test_data_s3_uri = ""

# Valid Pairs include
# binary:logistic -> error  | auc | aucpr
# multi:softprob  -> merror | auc | aucpr
# reg:logistic    -> rmse  | rmsle | mae | mape
# NOTE(review): the values below do not match any pair listed above ("BinaryClassification" is not
# one of the listed objectives, and "rmse" is listed only for reg:logistic) - confirm what the SDK accepts
experiment.learning_objective = "BinaryClassification"
experiment.comparison_metric = "rmse"

#### Optional Fields

You can leave the values as is, or adjust them if you want

In [None]:
# You may uncomment and change any of these default values

# experiment.positive_data_weight: float = 1.0
# experiment.hpt_max_jobs: int = 5
# experiment.hpt_max_parallel_jobs: int = 5
# experiment.train_instance_count: int = 1
# experiment.train_instance_type: str = "ml.m4.xlarge"
# experiment.test_instance_count: int = 1
# experiment.test_instance_type: str = "ml.m4.xlarge"


## Train Model

### Before running training make sure
  - first column contains the target value  [use : MoveColumnToFirstGlueTransformer]
  - the dataset headers are extracted and uploaded to S3 [use : UpdateColumnsGlueTransformer]

In [None]:
# System cell
# Long-running job
# Save the experiment configuration set above and start training
experiment.save_and_run_training()

That's it! Now all you need is to wait until training is over.

In [None]:
# When the status = 'Training complete', we can proceed further. Otherwise, we need to wait.
# Training could take several hours, or even more, on huge datasets
# Reload the experiment by its IDs, then show its status as the cell output
experiment = ExperimentLoader.load(problem_id, experiment_id)
experiment.status()

## Deploy Model

If you find the model valuable, you might consider deploying it to reuse in the future.
Uncomment and run the cell below

In [None]:
# System cell
# Long-running job

# experiment.deploy()

In [None]:
# System cell
# When deployment is done, you can run inferences using the inference link
# experiment.inference_link()
