# Performance showcase of added "to_sql" functionality in mlinspect

Here the performance of the proposed inspection using sql will be compared to the original one in pandas. Part of
the "healthcare" and "compas" pipeline will be used.

## Required packages:
See: requirements/requirements.txt and requirements/requirements.dev.txt

## Some parameters you might want to set:

In [1]:
import os
import sys
import time

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
from inspect import cleandoc
from mlinspect.utils import get_project_root
from mlinspect import PipelineInspector, OperatorType
from mlinspect.inspections import HistogramForColumns, RowLineage, MaterializeFirstOutputRows
from mlinspect.checks import NoBiasIntroducedFor, NoIllegalFeatures
from demo.feature_overview.no_missing_embeddings import NoMissingEmbeddings
from example_pipelines.healthcare import custom_monkeypatching
from mlinspect.to_sql.dbms_connectors.postgresql_connector import PostgresqlConnector

from mlinspect.to_sql.dbms_connectors.umbra_connector import UmbraConnector


# DBMS related:
UMBRA_USER = "postgres"
UMBRA_PW = ""
UMBRA_DB = ""
UMBRA_PORT = 5433
UMBRA_HOST = "/tmp/"

POSTGRES_USER = "luca"
POSTGRES_PW = "password"
POSTGRES_DB = "healthcare_benchmark"
POSTGRES_PORT = 5432
POSTGRES_HOST = "localhost"

pipe = cleandoc("""
    import warnings
    import os
    import pandas as pd
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from mlinspect.utils import get_project_root

    taxi = pd.read_csv(
        os.path.join( str(get_project_root()), "example_pipelines", "taxi", "yellow_tripdata_202101_head.csv"),
        na_values='?')

    taxi = taxi[(taxi['passenger_count']>=1)]
""")
BIAS = ['passenger_count']

## Benchmark setup:

In [2]:
def run_inspection(code, bias, to_sql, dbms_connector=None, mode=None, materialize=None):
    from PIL import Image
    import matplotlib.pyplot as plt
    from mlinspect.visualisation import save_fig_to_path

    inspector_result = PipelineInspector \
        .on_pipeline_from_string(code) \
        .add_custom_monkey_patching_module(custom_monkeypatching) \
        .add_check(NoBiasIntroducedFor(bias))

    if to_sql:
        inspector_result = inspector_result.execute_in_sql(dbms_connector=dbms_connector, mode=mode,
                                                           materialize=materialize)
    else:
        inspector_result = inspector_result.execute()

    check_results = inspector_result.check_to_check_results
    no_bias_check_result = check_results[NoBiasIntroducedFor(bias)]

    distribution_changes_overview_df = NoBiasIntroducedFor.get_distribution_changes_overview_as_df(
        no_bias_check_result)
    result = ""
    result += distribution_changes_overview_df.to_markdown()

    for i in list(no_bias_check_result.bias_distribution_change.items()):
        _, join_distribution_changes = i
        for column, distribution_change in join_distribution_changes.items():
            result += "\n"
            result += f"\033[1m Column '{column}'\033[0m"
            result += distribution_change.before_and_after_df.to_markdown()

    print(result)
    return result

## Benchmark of default inspection using CTEs:

In [3]:
dbms_connector_u = UmbraConnector(dbname=UMBRA_DB, user=UMBRA_USER, password=UMBRA_PW, port=UMBRA_PORT, host=UMBRA_HOST)

dbms_connector_p = PostgresqlConnector(dbname=POSTGRES_DB, user=POSTGRES_USER, password=POSTGRES_PW,
                                       port=POSTGRES_PORT, host=POSTGRES_HOST)

def run_for_all(code, bias, mode="", materialize=None):
    t0 = time.time()
    #run_inspection(code=code, bias=bias, to_sql=False)
    t1 = time.time()
    print("\nTime spend with original (pandas): " + str(t1 - t0))

    t0 = time.time()
    run_inspection(code=code, bias=bias, to_sql=True, dbms_connector=dbms_connector_p, mode=mode,
                   materialize=materialize)
    t1 = time.time()
    print("\nTime spend with modified SQL inspections (PSQL): " + str(t1 - t0))

    if not materialize: # Materialized not supported by Umbra -> main-memory performance
        t0 = time.time()
        run_inspection(code=code, bias=bias, to_sql=True, dbms_connector=dbms_connector_u, mode=mode,
                       materialize=materialize)
        t1 = time.time()
        print("\nTime spend with modified SQL inspections (Umbra): " + str(t1 - t0))


## End-to-End example of the preprocessing-pipeline inspection + model training:

Slightly different inspections results are expected because of the random split. Still, the resulting model accuracy should
be similar.

In [4]:
run_for_all(pipe, BIAS, mode="VIEW", materialize=False)


Time spend with original (pandas): 2.384185791015625e-07


2022-04-21 11:56:38.540932: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/lib64
2022-04-21 11:56:38.540972: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


|    | operator_type          | description                                          | code_reference                                                           | source_code                        | module                                                                | 'passenger_count' distribution change below the configured minimum test threshold   |
|---:|:-----------------------|:-----------------------------------------------------|:-------------------------------------------------------------------------|:-----------------------------------|:----------------------------------------------------------------------|:------------------------------------------------------------------------------------|
|  0 | OperatorType.SELECTION | Select by Series: taxi[(taxi['passenger_count']>=1)] | CodeReference(lineno=13, col_offset=7, end_lineno=13, end_col_offset=41) | taxi[(taxi['passenger_count']>=1)] | FunctionInfo(module='pandas.core.frame', function_name='__getitem__') | True           