# Benchmark pure pipeline:

Runtime comparison of the translated pipelines, as well as a showcase of the original and the translated code.

In [8]:
import pathlib
import re
from inspect import cleandoc
from mlinspect import PipelineInspector
from mlinspect.utils import get_project_root

### The code of the pipeline we will translate:

In [9]:
# The pipeline is kept as source text (not executed here) because it is handed
# to PipelineInspector.on_pipeline_from_string further below.
# cleandoc removes the common leading indentation of the literal, so the
# printed code starts at column 0.
pipeline_code = cleandoc("""
    import os
    import pandas as pd
    from mlinspect.utils import get_project_root

    COUNTIES_OF_INTEREST = ['county2', 'county3']

    patients = pd.read_csv(
        os.path.join( str(get_project_root()), "example_pipelines", "healthcare", "patients.csv"),
        na_values='?')
    histories = pd.read_csv(
        os.path.join( str(get_project_root()), "example_pipelines", "healthcare", "histories.csv"),
        na_values='?')

    data = patients.merge(histories, on=['ssn'])
    complications = data.groupby('age_group').agg(mean_complications=('complications', 'mean'))
    data = data.merge(complications, on=['age_group'])
    data['label'] = data['complications'] > 1.2 * data['mean_complications']
    data = data[['smoker', 'last_name', 'county', 'num_children', 'race', 'income', 'label']]
    data = data[data['county'].isin(COUNTIES_OF_INTEREST)]
    """)


# Show the exact pipeline source that will be translated.
print(pipeline_code)


import os
import pandas as pd
from mlinspect.utils import get_project_root

COUNTIES_OF_INTEREST = ['county2', 'county3']

patients = pd.read_csv(
    os.path.join( str(get_project_root()), "example_pipelines", "healthcare", "patients.csv"),
    na_values='?')
histories = pd.read_csv(
    os.path.join( str(get_project_root()), "example_pipelines", "healthcare", "histories.csv"),
    na_values='?')

data = patients.merge(histories, on=['ssn'])
complications = data.groupby('age_group').agg(mean_complications=('complications', 'mean'))
data = data.merge(complications, on=['age_group'])
data['label'] = data['complications'] > 1.2 * data['mean_complications']
data = data[['smoker', 'last_name', 'county', 'num_children', 'race', 'income', 'label']]
data = data[data['county'].isin(COUNTIES_OF_INTEREST)]


In [10]:
## The function to retrieve the generated code:
def get_sql_query(pipeline_code, mode, materialize):
    """
    Translate a pipeline to SQL and return the generated SQL code.

    The pipeline source is passed to ``PipelineInspector.execute_in_sql`` with
    ``dbms_connector=None``. Afterwards the files ``create_table.sql`` and
    ``pipeline.sql`` are read from ``mlinspect/to_sql/generated_code`` below
    the project root.
    NOTE(review): presumably ``dbms_connector=None`` means the SQL is only
    generated and not executed (the cell output says so) - confirm.

    Args:
        pipeline_code: Source code of the pipeline as a string.
        mode: Forwarded unchanged as ``mode`` to ``execute_in_sql``
            (this notebook passes "CTE" and "VIEW").
        materialize: Forwarded unchanged as ``materialize`` to
            ``execute_in_sql``.

    Returns:
        A tuple ``(setup_sql, pipeline_sql)`` holding the contents of
        ``create_table.sql`` and ``pipeline.sql``, in that order.
    """
    inspector = PipelineInspector.on_pipeline_from_string(pipeline_code)
    inspector.execute_in_sql(dbms_connector=None, mode=mode, materialize=materialize)

    # Paths of the generated files, relative to the project root.
    generated_files = (
        r"mlinspect/to_sql/generated_code/create_table.sql",
        r"mlinspect/to_sql/generated_code/pipeline.sql",
    )

    setup_sql, pipeline_sql = (
        pathlib.Path(get_project_root() / relative_path).read_text()
        for relative_path in generated_files
    )
    return setup_sql, pipeline_sql

### The code generated using "CTE":

In [11]:
# Translate the pipeline in "CTE" mode without materialization.
setup_code, test_code = get_sql_query(pipeline_code, mode="CTE", materialize=False)

# Show the table-setup SQL followed by the pipeline SQL.
print("\n".join((setup_code, test_code)))



Just translation to SQL is performed! 
-> SQL-Code placed at: mlinspect/to_sql/generated_code

CREATE TABLE patients_mlinid0 (
	"id" INT,
	"first_name" VARCHAR(100),
	"last_name" VARCHAR(100),
	"race" VARCHAR(100),
	"county" VARCHAR(100),
	"num_children" INT,
	"income" FLOAT,
	"age_group" VARCHAR(100),
	"ssn" VARCHAR(100)
);

COPY patients_mlinid0("id", "first_name", "last_name", "race", "county", "num_children", "income", "age_group", "ssn") FROM '/home/luca/Documents/Bachelorarbeit/mlinspect/example_pipelines/healthcare/patients.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);


CREATE TABLE histories_mlinid1 (
	"smoker" VARCHAR(100),
	"complications" INT,
	"ssn" VARCHAR(100)
);

COPY histories_mlinid1("smoker", "complications", "ssn") FROM '/home/luca/Documents/Bachelorarbeit/mlinspect/example_pipelines/healthcare/histories.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);



WITH patients_mlinid0_ctid AS (
	SELECT *, ctid AS patients_mlinid0_ctid
	FROM pat

### The code generated using "VIEW":

In [12]:
# Translate the pipeline in "VIEW" mode without materialization.
setup_code, test_code = get_sql_query(pipeline_code, mode="VIEW", materialize=False)

# Keep the combined SQL: a later cell compares it with the materialized variant.
view_no_mat = "\n".join((setup_code, test_code))
print(view_no_mat)


Just translation to SQL is performed! 
-> SQL-Code placed at: mlinspect/to_sql/generated_code

CREATE TABLE patients_mlinid0 (
	"id" INT,
	"first_name" VARCHAR(100),
	"last_name" VARCHAR(100),
	"race" VARCHAR(100),
	"county" VARCHAR(100),
	"num_children" INT,
	"income" FLOAT,
	"age_group" VARCHAR(100),
	"ssn" VARCHAR(100)
);

COPY patients_mlinid0("id", "first_name", "last_name", "race", "county", "num_children", "income", "age_group", "ssn") FROM '/home/luca/Documents/Bachelorarbeit/mlinspect/example_pipelines/healthcare/patients.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);


CREATE TABLE histories_mlinid1 (
	"smoker" VARCHAR(100),
	"complications" INT,
	"ssn" VARCHAR(100)
);

COPY histories_mlinid1("smoker", "complications", "ssn") FROM '/home/luca/Documents/Bachelorarbeit/mlinspect/example_pipelines/healthcare/histories.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);



CREATE VIEW patients_mlinid0_ctid AS (
	SELECT *, ctid AS patients_mlinid0_ctid
	F

### The code generated using "VIEW" + "MATERIALIZE":

Attention: the materialization is only used internally to avoid redundancy in the inspection, so the result for
the entire pipeline will be the same (besides IDs, as no reset is done between rounds!).

The performance difference can be seen here: example_to_sql/to_sql_pure_pipeline_benchmark.py

In [13]:
# Translate the pipeline in "VIEW" mode again, this time with materialization.
setup_code_m, test_code_m = get_sql_query(pipeline_code, mode="VIEW", materialize=True)
view_with_mat = setup_code_m + "\n" + test_code_m

# Remove every digit from both variants before comparing them, so that IDs
# which differ between the two translation rounds do not affect the result.
# The pattern is a raw string: '\d' in a plain string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
print("The results are the same (besides IDs): " + str(re.sub(r'\d', '', view_no_mat) == re.sub(r'\d', '', view_with_mat)))


Just translation to SQL is performed! 
-> SQL-Code placed at: mlinspect/to_sql/generated_code

The results are the same (besides IDs):True


## Presentation of the benchmark results for executing the translated pipelines:

### The original results:
**Pipeline runtimes**:

<img alt="p_r" src="plots/HealthcarePurePipeComparison.png" width="200"/>

## When also considering "SimpleImputer" and "OneHotEncoder" from scikit-learn:
**Pipeline runtimes**:

<img alt="p_r" src="plots/HealthcarePurePipeComparisonSimpImpOneHot.png" width="200"/>