# Benchmark pure pipeline:

Runtime comparison of the translated pipelines, as well as a showcase of the original and the translated code.

In [1]:
import os
import pathlib
from inspect import cleandoc

from mlinspect import PipelineInspector
from mlinspect.utils import get_project_root

### The code of the "healthcare" pipeline that will be translated:

In [2]:
# No model training:
# All pipeline files referenced below live under the same test directory,
# so that common prefix is built once and reused for every path.
PIPELINES_DIR = os.path.join(str(get_project_root()), "test", "monkeypatchingSQL", "pipelines_for_tests")

HEALTHCARE_FILE_PY_R = os.path.join(PIPELINES_DIR, "healthcare", "healthcare_res.py")
COMPAS_FILE_PY_R = os.path.join(PIPELINES_DIR, "compas", "compas_res.py")
ADULT_SIMPLE_FILE_PY_R = os.path.join(PIPELINES_DIR, "adult_simple", "adult_simple_res.py")
ADULT_COMPLEX_FILE_PY_R = os.path.join(PIPELINES_DIR, "adult_complex", "adult_complex_res.py")

# NOTE(review): presumably the column names treated as sensitive attributes
# for the respective pipeline -- confirm against where these lists are used.
HEALTHCARE_BIAS = ['age_group', 'race']
COMPAS_BIAS = ['sex', 'race']


/home/maximilian/TUM/thesis/scalerandi/mlinspect/test/monkeypatchingSQL/pipelines_for_tests/adult_simple/adult_simple_res.py


In [5]:
## The function to retrieve the generated code:
def get_sql_query(file_location, mode, materialize):
    """
    Run the SQL translation for one pipeline file and return the generated SQL.

    The pipeline is handed to the PipelineInspector with ``dbms_connector=None``.
    Afterwards the files "create_table.sql" and "pipeline.sql" are read from
    "mlinspect/to_sql/generated_code" below the project root (presumably they
    are written by the ``execute_in_sql`` call -- confirm in mlinspect).

    Args:
        file_location: path of the pipeline's Python file.
        mode: passed through unchanged to ``execute_in_sql``.
        materialize: passed through unchanged to ``execute_in_sql``.

    Returns:
        A tuple ``(set_up_code, test_code)`` holding the text of
        "create_table.sql" and "pipeline.sql", in that order.
    """
    inspector = PipelineInspector.on_pipeline_from_py_file(file_location)
    inspector.execute_in_sql(dbms_connector=None, mode=mode, materialize=materialize)

    # Both files are resolved relative to the project root.
    create_table_path = pathlib.Path(get_project_root() / r"mlinspect/to_sql/generated_code/create_table.sql")
    pipeline_path = pathlib.Path(get_project_root() / r"mlinspect/to_sql/generated_code/pipeline.sql")

    # Table-creation code first, then the pipeline queries.
    return create_table_path.read_text(), pipeline_path.read_text()

## Generated Code Showcase:
The generated code shows all the SQL code for the last call to the DBMS. The commented-out parts are the queries with
which the data for training, testing and validation was retrieved.

In [6]:
# Translate the pipeline in "CTE" mode (no materialization) and show the result:
# the table-creation code first, then the pipeline queries.
setup_code, test_code = get_sql_query(
    ADULT_SIMPLE_FILE_PY_R,
    mode="CTE",
    materialize=False,
)
print(setup_code, test_code, sep="\n")



Just translation to SQL is performed! 
-> SQL-Code placed at: mlinspect/to_sql/generated_code.sql



2022-02-05 13:56:58.201925: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib64
2022-02-05 13:56:58.201998: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


pipeline start
CREATE TABLE adult_train_1_mlinid0 (
	"Unnamed: 0" INT,
	"age" INT,
	"workclass" VARCHAR(100),
	"fnlwgt" INT,
	"education" VARCHAR(100),
	"education-num" INT,
	"marital-status" VARCHAR(100),
	"occupation" VARCHAR(100),
	"relationship" VARCHAR(100),
	"race" VARCHAR(100),
	"sex" VARCHAR(100),
	"capital-gain" INT,
	"capital-loss" INT,
	"hours-per-week" INT,
	"native-country" VARCHAR(100),
	"income-per-year" VARCHAR(100)
);

COPY adult_train_1_mlinid0("Unnamed: 0", "age", "workclass", "fnlwgt", "education", "education-num", "marital-status", "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss", "hours-per-week", "native-country", "income-per-year") FROM '/home/maximilian/TUM/thesis/scalerandi/mlinspect/test/monkeypatchingSQL/pipelines_for_tests/adult_complex/adult_train.csv' WITH (DELIMITER ',', NULL '', FORMAT CSV, HEADER TRUE);



WITH adult_train_1_mlinid0_ctid AS (
	SELECT *, ctid AS adult_train_1_mlinid0_ctid
	FROM adult_train_1_mlinid0
),
block_m

In [11]:
# Translate the pipeline in "VIEW" mode without materialization and keep the
# combined SQL (table creation + pipeline queries) in `view_no_mat`.
setup_code, test_code = get_sql_query(
    ADULT_SIMPLE_FILE_PY_R,
    mode="VIEW",
    materialize=False,
)
view_no_mat = "\n".join((setup_code, test_code))
print(view_no_mat)


Just translation to SQL is performed! 
-> SQL-Code placed at: mlinspect/to_sql/generated_code.sql

Mean accuracy: 
CREATE TABLE patients_201_mlinid0 (
	"id" INT,
	"first_name" VARCHAR(100),
	"last_name" VARCHAR(100),
	"race" VARCHAR(100),
	"county" VARCHAR(100),
	"num_children" INT,
	"income" FLOAT,
	"age_group" VARCHAR(100),
	"ssn" VARCHAR(100)
);

COPY patients_201_mlinid0("id", "first_name", "last_name", "race", "county", "num_children", "income", "age_group", "ssn") FROM '/home/maximilian/TUM/thesis/scalerandi/mlinspect/example_pipelines/healthcare/patients.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);


CREATE TABLE histories_202_mlinid1 (
	"smoker" VARCHAR(100),
	"complications" INT,
	"ssn" VARCHAR(100)
);

COPY histories_202_mlinid1("smoker", "complications", "ssn") FROM '/home/maximilian/TUM/thesis/scalerandi/mlinspect/example_pipelines/healthcare/histories.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);



CREATE VIEW patients_201_mlinid0_ctid AS

### Full example of the entire code generated using "VIEW" + "MATERIALIZE":

In [12]:
# Translate the pipeline in "VIEW" mode with materialization enabled and keep
# the combined SQL (table creation + pipeline queries) in `view_with_mat`.
setup_code_m, test_code_m = get_sql_query(
    ADULT_SIMPLE_FILE_PY_R,
    mode="VIEW",
    materialize=True,
)
view_with_mat = f"{setup_code_m}\n{test_code_m}"
print(view_with_mat)




Just translation to SQL is performed! 
-> SQL-Code placed at: mlinspect/to_sql/generated_code.sql

Mean accuracy: 
CREATE TABLE patients_251_mlinid0 (
	"id" INT,
	"first_name" VARCHAR(100),
	"last_name" VARCHAR(100),
	"race" VARCHAR(100),
	"county" VARCHAR(100),
	"num_children" INT,
	"income" FLOAT,
	"age_group" VARCHAR(100),
	"ssn" VARCHAR(100)
);

COPY patients_251_mlinid0("id", "first_name", "last_name", "race", "county", "num_children", "income", "age_group", "ssn") FROM '/home/maximilian/TUM/thesis/scalerandi/mlinspect/example_pipelines/healthcare/patients.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);


CREATE TABLE histories_252_mlinid1 (
	"smoker" VARCHAR(100),
	"complications" INT,
	"ssn" VARCHAR(100)
);

COPY histories_252_mlinid1("smoker", "complications", "ssn") FROM '/home/maximilian/TUM/thesis/scalerandi/mlinspect/example_pipelines/healthcare/histories.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);



CREATE VIEW patients_251_mlinid0_ctid AS