# Benchmark pure pipeline:

Runtime comparison of the translated pipelines, together with a showcase of the original and the translated code.

In [1]:
import pathlib
import os
from mlinspect import PipelineInspector
from mlinspect.utils import get_project_root
from _code_as_string import get_healthcare_pipe_code

In [None]:
# Paths of the two CSV inputs used by the original healthcare pipeline:
patients, histories = (
    os.path.join(str(get_project_root()), "example_pipelines", "healthcare", csv_name)
    for csv_name in ("patients.csv", "histories.csv")
)

# Fetch the pipeline source as two strings (setup part and test part):
setup_code_orig, test_code_orig = get_healthcare_pipe_code(
    histories,
    patients,
    add_impute_and_onehot=False,
)

### The code of the pipeline we will translate:

In [None]:
# Join the setup and test parts with a newline to get the full pipeline source, then show it:
pipeline_code = "\n".join((setup_code_orig, test_code_orig))
print(pipeline_code)


In [None]:
## The function to retrieve the generated code:
def get_healthcare_sql_str(pipeline_code, mode, materialize):
    """Translate a pipeline to SQL and return the generated SQL text.

    The pipeline source is handed to ``PipelineInspector`` and executed via
    ``execute_in_sql`` without a DBMS connector. Afterwards the two files
    ``create_table.sql`` and ``pipeline.sql`` are read from
    ``mlinspect/to_sql/generated_code`` below the project root (presumably
    written by ``execute_in_sql`` -- confirm against mlinspect).

    Args:
        pipeline_code: Source code of the pipeline, as a string.
        mode: Forwarded unchanged to ``execute_in_sql`` (this notebook
            uses "CTE" and "VIEW").
        materialize: Forwarded unchanged to ``execute_in_sql``.

    Returns:
        A tuple ``(setup_code, test_code)`` holding the contents of
        ``create_table.sql`` and ``pipeline.sql`` respectively.
    """
    inspector = PipelineInspector.on_pipeline_from_string(pipeline_code)
    inspector.execute_in_sql(dbms_connector=None, mode=mode, materialize=materialize)

    # Read the setup file first, then the pipeline file.
    create_table_path = pathlib.Path(
        get_project_root() / r"mlinspect/to_sql/generated_code/create_table.sql"
    )
    pipeline_path = pathlib.Path(
        get_project_root() / r"mlinspect/to_sql/generated_code/pipeline.sql"
    )

    return create_table_path.read_text(), pipeline_path.read_text()

### The code generated using "CTE":

In [None]:
# Generate the SQL with mode "CTE" and without materialization, then show it:
setup_code, test_code = get_healthcare_sql_str(
    pipeline_code,
    mode="CTE",
    materialize=False,
)

print("\n".join((setup_code, test_code)))


### The code generated using "VIEW":

In [None]:
# Generate the SQL with mode "VIEW" and without materialization, then show it:
setup_code, test_code = get_healthcare_sql_str(
    pipeline_code,
    mode="VIEW",
    materialize=False,
)

print("\n".join((setup_code, test_code)))

### The code generated using "VIEW" + "MATERIALIZE":

In [None]:
# Generate the SQL with mode "VIEW" and materialization enabled, then show it:
setup_code, test_code = get_healthcare_sql_str(
    pipeline_code,
    mode="VIEW",
    materialize=True,
)

print("\n".join((setup_code, test_code)))

## Presentation of the benchmark results for executing the translated pipelines:

### The original results:
**Pipeline runtimes**:

![p_r](plots/HealthcarePurePipeComparison.png)

## When also considering "SimpleImputer" and "OneHotEncoder" from scikit-learn:
**Pipeline runtimes**:

![p_r](plots/HealthcarePurePipeComparisonSimpImpOneHot.png)