In [4]:
from sqlanalyzer import column_parser
import pandas as pd

## given db metadata

In [2]:
# Column metadata for one table: one row per column name, with the scalar
# table name repeated on every row of `db_table`.
db_fields = pd.DataFrame(
    {
        'db_table': 'sfdc.accounts',
        'all_columns': [
            'account_health_c', 'account_health_flag_c', 'account_health_last_touch_c',
            'account_notes_c', 'account_owner_c', 'account_owner_id_c',
            'account_segment_c', 'account_source', 'account_start_date_c',
            'account_tier_c', 'add_company_tags_single_c', 'annual_revenue',
            'billing_city', 'billing_country', 'billing_postal_code',
            'billing_state', 'billing_street', 'churned_date_c',
            'created_by_id', 'created_date', 'crunchbase_funding_c',
            'csm_c', 'customer_tier_c', 'domain_c',
            'dscorgpkg_lead_source_c', 'dscorgpkg_naics_codes_c', 'dscorgpkg_sic_codes_c',
            'finance_arr_c', 'github_issue_ticket_c', 'health_update_c',
            'id', 'industry', 'industry_group_c',
            'industry_sector_c', 'initial_deal_arr_c', 'initial_deal_date_c',
            'is_deleted', 'last_activity_date', 'last_modified_date',
            'lfbn_account_domain_c', 'lost_opportunities_c', 'lost_renewals_c',
            'mapbox_username_c', 'naics_code_c', 'name',
            'netsuite_conn_channel_tier_c', 'next_renewal_date_c', 'number_of_employees',
            'number_of_mapbox_users_c', 'open_opportunities_c', 'open_renewals_c',
            'owner_id', 'owner_role_c', 'parent_id',
            'partner_status_c', 'partner_type_c', 'primary_contact_c',
            'primary_use_case_c', 'rating', 'record_type_id',
            'region_c', 'renewal_manager_c', 'sb_pf_company_c',
            'sdr_c', 'segmentation_c', 'shipping_city',
            'shipping_country', 'shipping_postal_code', 'shipping_state',
            'shipping_street', 'sic', 'solution_engineer_c',
            'sub_industry_c', 'sub_region_c', 'support_engineer_c',
            'type', 'vertical_c', 'vertical_formula_c',
            'won_opportunities_c', 'x18_digit_account_id_c', 'zendesk_result_c',
            'zendesk_zendesk_organization_c', 'zendesk_zendesk_organization_id_c',
            'zisf_zoominfo_industry_c',
            'dt',
        ],
    }
)
db_fields

Unnamed: 0,db_table,all_columns
0,sfdc.accounts,account_health_c
1,sfdc.accounts,account_health_flag_c
2,sfdc.accounts,account_health_last_touch_c
3,sfdc.accounts,account_notes_c
4,sfdc.accounts,account_owner_c
...,...,...
80,sfdc.accounts,zendesk_result_c
81,sfdc.accounts,zendesk_zendesk_organization_c
82,sfdc.accounts,zendesk_zendesk_organization_id_c
83,sfdc.accounts,zisf_zoominfo_industry_c


## given query

In [21]:
# Sample query to analyse. The outer SELECT reads from a parenthesised subquery
# aliased "custom_sql_query". That subquery is a WITH clause defining six CTEs
# (mapbox_customer_data, mbx_acct, sfdc_acct, tam_name, tam_mapping, raw_data)
# followed by a final select that left-joins raw_data to tam_mapping.
# Of the tables it reads, only sfdc.accounts is described by `db_fields`.
query = """SELECT "custom_sql_query"."account" AS "account", "custom_sql_query"."account_id" AS "account_id", "custom_sql_query"."account_name" AS "account_name", "custom_sql_query"."accountlevel" AS "accountlevel", "custom_sql_query"."authorization_id" AS "authorization_id", "custom_sql_query"."creferrer" AS "creferrer", "custom_sql_query"."csm_c" AS "csm_c", "custom_sql_query"."domain_creferrer" AS "domain_creferrer", CAST("custom_sql_query"."dt" AS DATE) AS "dt", "custom_sql_query"."email" AS "email", "custom_sql_query"."name" AS "name", "custom_sql_query"."note" AS "note", "custom_sql_query"."num_requests" AS "num_requests", "custom_sql_query"."resource" AS "resource", "custom_sql_query"."service" AS "service", "custom_sql_query"."token" AS "token", "custom_sql_query"."user_id" AS "user_id" FROM ( WITH mapbox_customer_data as ( SELECT id as user_id , email , accountlevel FROM mapbox_customer_data.accounts WHERE cast(dt as DATE) = CURRENT_DATE - INTERVAL '1' DAY ), mbx_acct as ( select mapbox_account_id as user_id , salesforce_account_id as account_id from sfdc.mapbox_accounts where cast(dt as DATE) = CURRENT_DATE - INTERVAL '1' DAY ), sfdc_acct as ( select id as account_id , name as account_name , csm_c from sfdc.accounts where cast(dt as DATE) = CURRENT_DATE - INTERVAL '1' DAY ), tam_name as ( select id as csm_c , name from sfdc.users where cast(dt as DATE) = CURRENT_DATE - INTERVAL '1' DAY ), tam_mapping as ( select a.user_id , a.email , a.accountlevel , b.account_id , c.account_name , c.csm_c , d.name FROM mapbox_customer_data a LEFT JOIN mbx_acct b ON a.user_id = b.user_id LEFT JOIN sfdc_acct c on b.account_id = c.account_id LEFT JOIN tam_name d on c.csm_c = d.csm_c ), raw_data as ( select * from analytics.token_style_referrer_agg where cast(dt as DATE) >= CURRENT_DATE - INTERVAL '14' DAY ) select a.* , b.* from raw_data a left join tam_mapping b on a.account = b.user_id ) "custom_sql_query" LIMIT 1000
"""

## transformation 1: format query

In [22]:
# Wrap the raw query in a Parser. The cells below call format_query, parse_cte,
# get_table_names and match_queried_fields on this instance.
formatter = column_parser.Parser(query)

In [23]:
# Normalise the query layout. In the recorded output, format_query puts one
# select item per line, drops the double quotes around identifiers and
# upper-cases SQL keywords (it also upper-cased the alias `resource`).
formatted = formatter.format_query(query)
# print() so the embedded newlines render as line breaks rather than "\n".
print(formatted)

SELECT custom_sql_query.account AS account,
       custom_sql_query.account_id AS account_id,
       custom_sql_query.account_name AS account_name,
       custom_sql_query.accountlevel AS accountlevel,
       custom_sql_query.authorization_id AS authorization_id,
       custom_sql_query.creferrer AS creferrer,
       custom_sql_query.csm_c AS csm_c,
       custom_sql_query.domain_creferrer AS domain_creferrer,
       CAST(custom_sql_query.dt AS DATE) AS dt,
       custom_sql_query.email AS email,
       custom_sql_query.name AS name,
       custom_sql_query.note AS note,
       custom_sql_query.num_requests AS num_requests,
       custom_sql_query.resource AS RESOURCE,
       custom_sql_query.service AS service,
       custom_sql_query.token AS token,
       custom_sql_query.user_id AS user_id
FROM
  (WITH mapbox_customer_data AS
     (SELECT id AS user_id,
             email,
             accountlevel
      FROM mapbox_customer_data.accounts
      WHERE cast(dt AS DATE) = CURRENT_DATE

## transformation 2: separate CTEs

In [24]:
# Split the formatted query into a dict of {CTE name: SQL text of that CTE}.
# NOTE(review): in the recorded output the first key is
# '(WITH mapbox_customer_data' (the leading "(WITH " is kept), and there is an
# extra 'main' entry, presumably the query outside the CTEs. Confirm both
# against the parser.
cte_query = formatter.parse_cte(formatted)
cte_query

{'(WITH mapbox_customer_data': "  (WITH mapbox_customer_data AS\n     (SELECT id AS user_id,\n             email,\n             accountlevel\n      FROM mapbox_customer_data.accounts\n      WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ),\n",
 'mbx_acct': "        mbx_acct AS\n     (SELECT mapbox_account_id AS user_id,\n             salesforce_account_id AS account_id\n      FROM sfdc.mapbox_accounts\n      WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ),\n",
 'sfdc_acct': "        sfdc_acct AS\n     (SELECT id AS account_id,\n             name AS account_name,\n             csm_c\n      FROM sfdc.accounts\n      WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ),\n",
 'tam_name': "        tam_name AS\n     (SELECT id AS csm_c,\n             name\n      FROM sfdc.users\n      WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ),\n",
 'tam_mapping': '        tam_mapping AS\n     (SELECT a.user_id,\n             a.email,\n             a.accountlevel,\n

In [26]:
# Show only the keys, to see which entries parse_cte produced without their SQL bodies.
cte_query.keys()

dict_keys(['(WITH mapbox_customer_data', 'mbx_acct', 'sfdc_acct', 'tam_name', 'tam_mapping', 'main', 'raw_data'])

In [15]:
# Re-format and re-parse only the 'main' entry of the previous result.
# NOTE(review): this rebinds both `formatted` and `cte_query`, so the
# get_table_names cell further down operates on this 'main' text rather than
# on the full formatted query.
# NOTE(review): the recorded output mentions wbr.marketing_campaign_attribution,
# which does not appear in `query` above, so it looks like it came from a
# different query. Re-run on a fresh kernel to confirm.
formatted = formatter.format_query(cte_query['main'])
cte_query = formatter.parse_cte(formatted)
cte_query

{'main': "SELECT *\nFROM\n  (SELECT *\n   FROM\n     (SELECT date_trunc('month', cast(opp_created_date AS DATE)) AS attr_month,\n             marketing_channel,\n             TYPE,\n             sum(CASE\n                     WHEN category = 'Won' THEN campaign_money_share\n                     ELSE 0\n                 END) AS won_money,\n             sum(CASE\n                     WHEN category = 'Pipeline' THEN campaign_money_share\n                     ELSE 0\n                 END) AS pipeline_money\n      FROM wbr.marketing_campaign_attribution\n      WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY\n        AND marketing_attribution_type = 'Generated'\n        AND campaign_type = 'Web (WEB)'\n      GROUP BY 1,\n               2,\n               3) custom_sql_query\n   LIMIT 0) T\nLIMIT 0"}

In [19]:
# NOTE(review): this cell repeats the previous cell verbatim (same code, same
# recorded output). It is probably a leftover from re-execution and can
# likely be removed; confirm nothing relies on the second pass.
formatted = formatter.format_query(cte_query['main'])
cte_query = formatter.parse_cte(formatted)
cte_query

{'main': "SELECT *\nFROM\n  (SELECT *\n   FROM\n     (SELECT date_trunc('month', cast(opp_created_date AS DATE)) AS attr_month,\n             marketing_channel,\n             TYPE,\n             sum(CASE\n                     WHEN category = 'Won' THEN campaign_money_share\n                     ELSE 0\n                 END) AS won_money,\n             sum(CASE\n                     WHEN category = 'Pipeline' THEN campaign_money_share\n                     ELSE 0\n                 END) AS pipeline_money\n      FROM wbr.marketing_campaign_attribution\n      WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY\n        AND marketing_attribution_type = 'Generated'\n        AND campaign_type = 'Web (WEB)'\n      GROUP BY 1,\n               2,\n               3) custom_sql_query\n   LIMIT 0) T\nLIMIT 0"}

## transformation 3: match table aliases

In [11]:
# get_table_names is given the formatted query as a list of lines.
# In the recorded output the result is a dict in which the single table maps
# to itself.
formatted_lines = formatted.split('\n')
table_alias_mapping = formatter.get_table_names(formatted_lines)
table_alias_mapping

{'wbr.marketing_campaign_attribution': 'wbr.marketing_campaign_attribution'}

## transformation 4: find columns

In [8]:
# Cross-reference the raw query with the known column metadata. The recorded
# result is a list of dicts with database_name / table_name / column_name keys,
# all for sfdc.accounts (name, dt, id).
# NOTE(review): csm_c is also selected from sfdc.accounts in `query` and is
# present in `db_fields`, yet it is absent from the recorded result. Confirm
# whether that is expected.
formatter.match_queried_fields(query, db_fields)

[{'database_name': 'sfdc', 'table_name': 'accounts', 'column_name': 'name'},
 {'database_name': 'sfdc', 'table_name': 'accounts', 'column_name': 'dt'},
 {'database_name': 'sfdc', 'table_name': 'accounts', 'column_name': 'id'}]

## bonus transformation: upload other query metadata (such as timestamp, user)