# Benchmark pure pipeline:

Runtime comparison of the translated pipelines, as well as a showcase of the original and translated code.

In [1]:
import pathlib
from inspect import cleandoc
from mlinspect import PipelineInspector
from mlinspect.utils import get_project_root

### The code of the "healthcare" pipeline that will be translated:

In [2]:
# Source text of the "healthcare" example pipeline. It is kept as a string (not executed
# in this cell) so that it can be passed to get_sql_query, which forwards it to
# PipelineInspector.on_pipeline_from_string.
#
# inspect.cleandoc strips the common leading indentation of the literal.
# NOTE: the literal is not a raw string, so the trailing backslash on the
# healthcare_utils import line is consumed as a line continuation and that import ends
# up on a single line in the resulting text (see the printed output of this cell).
pipeline_code = cleandoc("""
    import warnings
    import os
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import OneHotEncoder, StandardScaler
    from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer, MyKerasClassifier, \
        create_model
    from mlinspect.utils import get_project_root

    COUNTIES_OF_INTEREST = ['county2', 'county3']

    patients = pd.read_csv(
        os.path.join( str(get_project_root()), "example_pipelines", "healthcare", "patients.csv"),
        na_values='?')
    histories = pd.read_csv(
        os.path.join( str(get_project_root()), "example_pipelines", "healthcare", "histories.csv"),
        na_values='?')

    data = patients.merge(histories, on=['ssn'])
    complications = data.groupby('age_group').agg(mean_complications=('complications', 'mean'))
    data = data.merge(complications, on=['age_group'])
    data['label'] = data['complications'] > 1.2 * data['mean_complications']
    data = data[['smoker', 'last_name', 'county', 'num_children', 'race', 'income', 'label']]
    data = data[data['county'].isin(COUNTIES_OF_INTEREST)]
    impute_and_one_hot_encode = Pipeline([
        ('impute', SimpleImputer(strategy='most_frequent')),
        ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    ])
    featurisation = ColumnTransformer(transformers=[
        ("impute_and_one_hot_encode", impute_and_one_hot_encode, ['smoker', 'county', 'race']),
        # ('word2vec', MyW2VTransformer(min_count=2), ['last_name']),
        ('numeric', StandardScaler(), ['num_children', 'income']),
    ], remainder='drop')
    neural_net = MyKerasClassifier(build_fn=create_model, epochs=10, batch_size=1, verbose=0)
    pipeline = Pipeline([
        ('features', featurisation),
        ('learner', neural_net)
    ])
    train_data, test_data = train_test_split(data)
    model = pipeline.fit(train_data, train_data['label'])
    print("Mean accuracy: {}".format(model.score(test_data, test_data['label'])))
""")

# Show the pipeline source exactly as it is stored in pipeline_code.
print(pipeline_code)


import os
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from example_pipelines.healthcare.healthcare_utils import MyW2VTransformer, MyKerasClassifier,         create_model
from mlinspect.utils import get_project_root

COUNTIES_OF_INTEREST = ['county2', 'county3']

patients = pd.read_csv(
    os.path.join( str(get_project_root()), "example_pipelines", "healthcare", "patients.csv"),
    na_values='?')
histories = pd.read_csv(
    os.path.join( str(get_project_root()), "example_pipelines", "healthcare", "histories.csv"),
    na_values='?')

data = patients.merge(histories, on=['ssn'])
complications = data.groupby('age_group').agg(mean_complications=('complications', 'mean'))
data = data.merge(complications, on=['age_group'])
data['label'] = data['complications'] > 1.2 * data['

In [3]:
## The function to retrieve the generated code:
def get_sql_query(pipe_code, mode, materialize):
    """Translate a pipeline to SQL and return the generated SQL text.

    The pipeline source is handed to ``PipelineInspector`` and
    ``execute_in_sql`` is called without a DBMS connector. Afterwards the two
    files ``create_table.sql`` and ``pipeline.sql`` are read from
    ``mlinspect/to_sql/generated_code`` below the project root (presumably
    written by the ``execute_in_sql`` call -- confirm against mlinspect).

    Args:
        pipe_code: Source code of the pipeline, as a string.
        mode: Passed through to ``execute_in_sql`` (this notebook uses
            "CTE" and "VIEW").
        materialize: Passed through to ``execute_in_sql``.

    Returns:
        A tuple ``(set_up_sql, pipeline_sql)`` with the contents of
        ``create_table.sql`` and ``pipeline.sql`` respectively.
    """
    inspector = PipelineInspector.on_pipeline_from_string(pipe_code)
    # The return value is not needed here; only the generated files are read.
    inspector.execute_in_sql(dbms_connector=None, mode=mode, materialize=materialize)

    create_table_path = pathlib.Path(
        get_project_root() / "mlinspect/to_sql/generated_code/create_table.sql"
    )
    pipeline_path = pathlib.Path(
        get_project_root() / "mlinspect/to_sql/generated_code/pipeline.sql"
    )

    create_table_sql = create_table_path.read_text()
    pipeline_sql = pipeline_path.read_text()
    return create_table_sql, pipeline_sql

## Generated Code Showcase:
The generated code shows all the SQL code for the last call on the DBMS. The commented parts are the queries with which
the data for training, testing and validation was retrieved.

In [4]:
# Translate the pipeline in "CTE" mode (no materialization) and print the
# set-up SQL followed by the pipeline SQL.
setup_code, test_code = get_sql_query(pipeline_code, mode="CTE", materialize=False)
print("\n".join((setup_code, test_code)))



Just translation to SQL is performed! 
-> SQL-Code placed at: mlinspect/to_sql/generated_code.sql



2021-09-14 13:49:16.576465: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-09-14 13:49:16.576489: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


Mean accuracy: 
CREATE TABLE patients_1_mlinid0 (
	"id" INT,
	"first_name" VARCHAR(100),
	"last_name" VARCHAR(100),
	"race" VARCHAR(100),
	"county" VARCHAR(100),
	"num_children" INT,
	"income" FLOAT,
	"age_group" VARCHAR(100),
	"ssn" VARCHAR(100)
);

COPY patients_1_mlinid0("id", "first_name", "last_name", "race", "county", "num_children", "income", "age_group", "ssn") FROM '/home/luca/Documents/Bachelorarbeit/mlinspect/example_pipelines/healthcare/patients.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);


CREATE TABLE histories_2_mlinid1 (
	"smoker" VARCHAR(100),
	"complications" INT,
	"ssn" VARCHAR(100)
);

COPY histories_2_mlinid1("smoker", "complications", "ssn") FROM '/home/luca/Documents/Bachelorarbeit/mlinspect/example_pipelines/healthcare/histories.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);



WITH patients_1_mlinid0_ctid AS (
	SELECT *, ctid AS patients_1_mlinid0_ctid
	FROM patients_1_mlinid0
),
histories_2_mlinid1_ctid AS (
	SELECT *, ctid AS 

In [5]:
# Translate the pipeline in "VIEW" mode (no materialization); keep the combined
# set-up + pipeline SQL in view_no_mat and print it.
setup_code, test_code = get_sql_query(pipeline_code, mode="VIEW", materialize=False)
view_no_mat = "\n".join((setup_code, test_code))
print(view_no_mat)


Just translation to SQL is performed! 
-> SQL-Code placed at: mlinspect/to_sql/generated_code.sql

Mean accuracy: 
CREATE TABLE patients_51_mlinid0 (
	"id" INT,
	"first_name" VARCHAR(100),
	"last_name" VARCHAR(100),
	"race" VARCHAR(100),
	"county" VARCHAR(100),
	"num_children" INT,
	"income" FLOAT,
	"age_group" VARCHAR(100),
	"ssn" VARCHAR(100)
);

COPY patients_51_mlinid0("id", "first_name", "last_name", "race", "county", "num_children", "income", "age_group", "ssn") FROM '/home/luca/Documents/Bachelorarbeit/mlinspect/example_pipelines/healthcare/patients.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);


CREATE TABLE histories_52_mlinid1 (
	"smoker" VARCHAR(100),
	"complications" INT,
	"ssn" VARCHAR(100)
);

COPY histories_52_mlinid1("smoker", "complications", "ssn") FROM '/home/luca/Documents/Bachelorarbeit/mlinspect/example_pipelines/healthcare/histories.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);



CREATE VIEW patients_51_mlinid0_ctid AS (
	SELECT 

### Full example of the entire code generated using "VIEW" + "MATERIALIZE":

In [6]:
# Translate the pipeline in "VIEW" mode with materialize=True; keep the combined
# set-up + pipeline SQL in view_with_mat and print it.
setup_code_m, test_code_m = get_sql_query(pipeline_code, mode="VIEW", materialize=True)
view_with_mat = "\n".join((setup_code_m, test_code_m))
print(view_with_mat)




Just translation to SQL is performed! 
-> SQL-Code placed at: mlinspect/to_sql/generated_code.sql

Mean accuracy: 
CREATE TABLE patients_101_mlinid0 (
	"id" INT,
	"first_name" VARCHAR(100),
	"last_name" VARCHAR(100),
	"race" VARCHAR(100),
	"county" VARCHAR(100),
	"num_children" INT,
	"income" FLOAT,
	"age_group" VARCHAR(100),
	"ssn" VARCHAR(100)
);

COPY patients_101_mlinid0("id", "first_name", "last_name", "race", "county", "num_children", "income", "age_group", "ssn") FROM '/home/luca/Documents/Bachelorarbeit/mlinspect/example_pipelines/healthcare/patients.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);


CREATE TABLE histories_102_mlinid1 (
	"smoker" VARCHAR(100),
	"complications" INT,
	"ssn" VARCHAR(100)
);

COPY histories_102_mlinid1("smoker", "complications", "ssn") FROM '/home/luca/Documents/Bachelorarbeit/mlinspect/example_pipelines/healthcare/histories.csv' WITH (DELIMITER ',', NULL '?', FORMAT CSV, HEADER TRUE);



CREATE VIEW patients_101_mlinid0_ctid AS (
	SE