## Application Steps (Snowflake Data Profile example)

#### 1. Initialize application by loading common libraries

In [None]:
import os, sys

import dtale
from ydata_profiling import ProfileReport

## Locate the app_run directory and add it to the system path ##
# Script variant, deriving the location from __file__:
# path_script = os.path.abspath(__file__)
# path_app_run = os.path.dirname(os.path.dirname(path_script))
# Here the current working directory is used instead, and app_run is taken to
# be its parent directory.
# NOTE(review): presumably because __file__ is not available in a notebook;
# this assumes the notebook is launched from its own folder -- confirm.
path_script_dir = os.getcwd()
path_app_run = os.path.dirname(path_script_dir)

# Presumably needed so the project packages imported below can be resolved.
sys.path.append(path_app_run)

## use common functions to initialize global variables and set up the logger ##
import utilities.common_functions as cf
import queries.sf_queries as sfq

loggername = 'example_sf_connection'
logger = cf.initialize(path_app_run, loggername)

#### 2. Connect to database

In [None]:
# Toggle between the two connection helpers: True uses the SSO helper,
# False uses the login helper.
sso = False

# Connection settings are read from the shared global-variable object on cf.
sf_account = cf.gvar.sf_account
sf_user = cf.gvar.sf_username
sf_role = cf.gvar.sf_app_role
sf_wh = cf.gvar.sf_app_wh

# Echo the settings in use before connecting.
print(''.join([
    'sf_account:  ', sf_account,
    '\nsf_user:     ', sf_user,
    '\nsf_role:     ', sf_role,
    '\nsf_wh:       ', sf_wh,
]))

if not sso:
    cf.connect_snowflake_login(sf_user, sf_role, sf_wh)
else:
    cf.connect_snowflake_sso(sf_user, sf_role, sf_wh)

#### 3. Query information schema data

In [None]:
## functions used to get list of databases, schemas and tables from snowflake ##
def list_sf_databases(sf_role, sf_wh):
    """Return the sorted ``name`` values from the ``show_databases`` query.

    Before querying, the ``use_role_wh`` statement is formatted with
    ``sf_role`` and ``sf_wh`` and executed on the shared connection.
    """
    use_statement = sfq.use_role_wh.format(sf_role=sf_role, sf_wh=sf_wh)
    cf.gvar.sf_conn.execute_string(use_statement, return_cursors=False)

    databases_df = cf.sf_exec_query_return_df(sfq.show_databases)
    return sorted(databases_df['name'].tolist())

def list_sf_schemas(db):
    """Return the sorted ``TABLE_SCHEMA`` values from the ``list_schemas`` query for ``db``."""
    query = sfq.list_schemas.format(db_name=db)
    schemas_df = cf.sf_exec_query_return_df(query)
    return sorted(schemas_df['TABLE_SCHEMA'].tolist())

def list_sf_tables(db, schema):
    """Return the sorted ``TABLE_NAME`` values from the ``list_tables`` query for ``db``/``schema``."""
    query = sfq.list_tables.format(db_name=db, schema=schema)
    tables_df = cf.sf_exec_query_return_df(query)
    return sorted(tables_df['TABLE_NAME'].tolist())

def list_sf_columns(db, schema, table):
    """Return the ``COLUMN_NAME`` values of a table, ordered by ``ORDINAL_POSITION``.

    The rows come from the ``get_columns`` query formatted with ``db``,
    ``schema`` and ``table``.
    """
    query = sfq.get_columns.format(db_name=db, schema=schema, table=table)
    columns_df = cf.sf_exec_query_return_df(query)

    # Ordered by ordinal position rather than by name, unlike the other listers.
    columns_df.sort_values(by='ORDINAL_POSITION', inplace=True)
    return columns_df['COLUMN_NAME'].tolist()

# Pick the first entry at every level: databases, schemas and tables come back
# sorted by name, columns come back ordered by ordinal position.
list_dbs = list_sf_databases(sf_role, sf_wh)
db = list_dbs[0]

list_schemas = list_sf_schemas(db)
schema = list_schemas[0]

list_tables = list_sf_tables(db, schema)
table = list_tables[0]

# Columns are kept both as a list and as comma-separated text.
list_columns = list_sf_columns(db, schema, table)
str_columns = ', '.join(list_columns)

# Echo the selection.
print(''.join([
    'database:    ', db,
    '\nschema:      ', schema,
    '\ntable:       ', table,
    '\ncolumns:     ', str_columns,
]))

#### 4. Generate SQL statement to send

In [None]:
def generate_sql_base(db, schema, table, str_columns):
    """Build a two-line ``select ... from ...`` statement.

    ``str_columns`` is inserted verbatim as the column list, and the table is
    referenced as ``db.schema.table``. Identifiers are not quoted or escaped.
    """
    select_clause = f'select  {str_columns}'
    from_clause = f'from  {db}.{schema}.{table}'
    return f'{select_clause}\n{from_clause}'

# Build the select statement for the chosen table and print it for review.
sql = generate_sql_base(db, schema, table, str_columns)
print(sql)

#### 5. Send SQL to query data and store result into pandas DataFrame

In [None]:
def return_sf_query_df(sql):
    """Run ``sql`` through the shared query helper and return its result."""
    return cf.sf_exec_query_return_df(sql)

# Run the generated statement and keep the result for the profiling steps.
df = return_sf_query_df(sql)

print('\nDisplaying First 10 rows of queried result:')
# Left as a bare trailing expression so the notebook renders the preview.
df[:10]

#### 6. Profile DataFrame using dtale

In [None]:
print('Starting D-Tale')
# Start D-Tale for the queried DataFrame with host set to localhost, then
# open it in a browser.
d = dtale.show(df, host='localhost')
d.open_browser()
# Flag read by step 7 to decide whether D-Tale has to be shut down first.
dtale_running = True

#### 7. Profile DataFrame using ydata-profiling

In [None]:
# Shut D-Tale down before profiling if step 6 started it. The flag is looked
# up through globals() so this cell also runs when step 6 was skipped or
# failed before setting it; reading the bare name would raise NameError.
if globals().get('dtale_running', False):
    print('D-Tale is running. Shutting down..')
    d.kill()
    dtale_running = False

def profile_data_ydata(df):
    """Build a minimal ydata-profiling report for ``df`` and show it inline.

    The report title is ``db.schema.table``, read from the notebook-level
    ``db``, ``schema`` and ``table`` names, so those must already be set.
    Nothing is returned; the report is displayed as a notebook iframe.
    """
    report_title = '.'.join((db, schema, table))

    # Per-type variable settings handed to ProfileReport unchanged.
    variable_settings = {
        "num": {"low_categorical_threshold": 0},
        "cat": {
            "length": True,
            "characters": False,
            "words": False,
            "n_obs": 10,
        },
    }

    report = ProfileReport(
        df,
        title=report_title,
        minimal=True,
        explorative=False,
        correlations=None,
        infer_dtypes=False,
        vars=variable_settings,
        orange_mode=True,
    )
    report.to_notebook_iframe()

# Profile the queried DataFrame; the report is rendered inline in the notebook.
profile_data_ydata(df)