In [36]:
from sqlanalyzer import column_parser, unbundle
import sqlparse
import re
import json
import pandas as pd
import time


def flatten_subquery(final_list, sub_queries, level_num):
    """Flatten one nesting level of a list of aliased SQL queries.

    Every entry of ``sub_queries`` is a dict mapping an alias to a query
    string. Each query is formatted with ``column_parser.Parser`` and
    inspected with ``unbundle.Unbundle``:

    * a query for which ``has_child`` is false is already flat and is
      appended to ``final_list`` as ``{alias: query}``;
    * otherwise ``restructure_subquery`` is called; the dict it returns is
      appended to ``final_list`` and each sub-query dict it returns is either
      appended to ``final_list`` (no entry has a child) or carried over to
      the next level.

    Unlike the previous implementation, the sub-queries of *all* siblings are
    accumulated (an earlier sibling's nested sub-queries are no longer
    overwritten by a later one), and no list is modified while it is being
    iterated (which used to skip every other leaf).

    Args:
        final_list: list collecting the flattened ``{alias: query}`` dicts.
            It is extended in place and also returned.
        sub_queries: list of ``{alias: query}`` dicts to process. It is not
            modified.
        level_num: nesting depth; used to name queries whose alias is
            ``'no alias'``, ``''`` or ``'query'`` as ``level_<level_num>_main``.

    Returns:
        A ``(final_list, next_level)`` tuple, where ``next_level`` is the list
        of sub-query dicts that still contain nested sub-queries and must be
        passed to another call.
    """
    # Aliases that carry no real name and are replaced by a level-based one.
    placeholder_aliases = ('no alias', '', 'query')
    next_level = []

    for q in sub_queries:
        for alias, query in q.items():
            formatter = column_parser.Parser(query)
            formatted_query = formatter.format_query(query)
            unbundled = unbundle.Unbundle(formatted_query)

            if not unbundled.has_child(query):
                # Already flat: keep it as is.
                final_list.append({alias: query})
                continue

            if alias in placeholder_aliases:
                name = 'level_{}_main'.format(level_num)
            else:
                name = alias

            # NOTE(review): restructure_subquery presumably stores the outer
            # query under `name` and returns its direct sub-queries - confirm
            # against sqlanalyzer.unbundle.
            query_dict, children = unbundled.restructure_subquery({}, name, formatted_query)
            if query_dict:
                final_list.append(query_dict)

            # Partition the children into finished leaves and queries that
            # need another pass, without mutating the list being iterated.
            for child in children:
                if any(unbundled.has_child(sub_query) for sub_query in child.values()):
                    next_level.append(child)
                else:
                    final_list.append(child)

    return final_list, next_level


def is_cte(query):
    """Return True when ``query`` begins with a ``WITH`` clause.

    The check ignores leading whitespace and letter case, and requires
    ``WITH`` to be a whole word so that identifiers such as ``WITHIN`` do not
    count as a CTE.

    Args:
        query: SQL text to inspect.

    Returns:
        bool: True if the first token of ``query`` is the keyword ``WITH``.
    """
    return re.match(r'\s*WITH\b', query, flags=re.IGNORECASE) is not None


def flatten_pure_nested(query):
    """Flatten a nested (non-CTE) query level by level.

    Starts from the whole query under the alias ``'query'`` and keeps calling
    ``flatten_subquery`` with an increasing level number (1, 2, ...) until it
    reports no remaining sub-queries.

    Args:
        query: SQL text to flatten.

    Returns:
        The list accumulated by the successive ``flatten_subquery`` calls.
    """
    collected = []
    pending = [{'query': query}]
    depth = 1

    while pending != []:
        collected, pending = flatten_subquery(collected, pending, level_num=depth)
        depth += 1

    return collected

## Type A: nested subqueries (subqueries inside subqueries)

Pattern: `SELECT ... FROM (SELECT ... FROM (...))`

In [46]:
query = """SELECT *
   FROM
     (SELECT a.*,
             b.*,
             c.*,
             d.*
      FROM
        (SELECT DISTINCT anonymous_id,
                         user_id
         FROM mapbox_customer_data.segment_identifies
         WHERE dt >= '2018-07-01'
           AND anonymous_id IS NOT NULL
           AND user_id IS NOT NULL ) a
      LEFT JOIN
        (SELECT id,
                email,
                created
         FROM mapbox_customer_data.accounts
         WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON a.user_id = b.id
      LEFT JOIN
        (SELECT anonymous_id AS anon_id_ad,
                context_campaign_name,
                min(TIMESTAMP) AS min_exposure
         FROM mapbox_customer_data.segment_pages
         WHERE dt >= '2018-07-01'
           AND context_campaign_name IS NOT NULL
         GROUP BY 1,
                  2) c ON a.anonymous_id = c.anon_id_ad
      LEFT JOIN
        (SELECT DISTINCT anonymous_id AS anon_id_event,
                         original_timestamp,
                         event,
                         context_traits_email
         FROM mapbox_customer_data.segment_tracks
         WHERE dt >= '2018-07-01'
           AND event LIKE 'submitted_%form'
           AND context_traits_email IS NOT NULL ) d ON a.anonymous_id = d.anon_id_event
    LEFT JOIN
        (SELECT sfdc_accounts.platform, sfdc_accounts.mobile_os, sfdc_accounts.service_metadata,
sfdc_cases.account, sfdc_cases.num_requests, sfdc_cases.owner, sfdc_accounts.user_id
FROM sfdc.accounts sfdc_accounts
LEFT JOIN 
(SELECT MAX(dt) FROM 
    (SELECT dt 
    FROM sfdc.oppty 
    LEFT JOIN (SELECT MAX(dt) FROM (SELECT DISTINCT dt FROM sfdc.owner AS sfdc_owner) AS dt_owner ON sfdc_oppty.dt = sfdc_cases.dt)
    LEFT JOIN (SELECT dt FROM sfdc.cases) sfdc_cases ON sfdc_oppty.dt = sfdc_cases.dt) )
AS sfdc_cases_oppty ON sfdc_cases_oppty.dt = sfdc_accounts.dt
LEFT JOIN sfdc.cases AS sfdc_cases ON sfdc_cases.id = sfdc_accounts.case_id
WHERE sfdc_cases_oppty.dt > '2020-04-03' AND sfdc_cases_oppty.dt < '2020-05-04' ORDER BY 1 GROUP BY 3 LIMIT 20
        ) e ON e.user_id = a.user_id
        )
   WHERE context_campaign_name IS NOT NULL 
"""


In [47]:
# Flatten the nested type-A query defined above; the result is the list of
# dicts accumulated by flatten_subquery, displayed as the cell output.
flatten_pure_nested(query)

[{'query': "SELECT\n    \topp.id AS opp_id\n    \t,opp.name AS opp_name\n    \t,opp.close_date AS opp_close_date\n    \t,opp.mapbox_username_c AS mbx_acct_id\n    \t,mbxacct.customerid AS stripe_cust_id\n    \t,opp.account_id AS sfdc_acct_id\n    \t,acct.name AS sfdc_acct_name\n    \t,oprd.id AS prd_id\n    \t,oprd.product_2_id AS prd_2_id\n    \t,oprd.product_name_c AS prd_name\n    \t,oprd.service_date AS prd_start_date\n    \t,oprd.end_date_c AS prd_end_date\n    \t,oprd.unit_price AS prd_volume_price\n    \t,oprd.product_value_c AS prd_value\n    FROM\n    \tsfdc.opportunities opp\n    LEFT JOIN\n    \tsfdc.accounts acct\n    \tON opp.account_id = acct.id\n    \tAND acct.dt = '{run_date}'\n    LEFT JOIN\n    \tsfdc.opportunity_product oprd\n    \tON opp.id = oprd.opportunity_id\n    \tAND oprd.dt = '{run_date}'\n    LEFT JOIN\n    \tmapbox_customer_data.accounts mbxacct\n    \tON LOWER(opp.mapbox_username_c) = mbxacct.id\n    \tAND mbxacct.dt = '{run_date}'\n    WHERE\n    \t--opp.

## Type B: CTE whose members may contain nested subqueries

Pattern: `WITH a AS (...), b AS (...) SELECT ...`

In [61]:
query = """WITH opportunity_product_rev AS
    (
      SELECT -- get opp products
      opportunity_id,
      op.product_2_id,
      -- REGEXP_REPLACE(product_name_c, '^\\\\*', '') AS product_name, -- remove asterisk in some names
      p.name AS product_name,
      COALESCE(CAST(quantity AS DOUBLE) * CAST(list_price AS DOUBLE), 0) AS total_price,
      netsuite_conn_netsuite_item_key_id_c AS netsuite_conn_net_suite_item_key_id_c,
      COUNT(netsuite_conn_netsuite_item_key_id_c) OVER (PARTITION BY opportunity_id) AS num_netsuite_items
      FROM
      sfdc.opportunity_product op
      LEFT JOIN
      sfdc.products p
      ON
      -- op.product_2_id = p.id AND p.dt = '2019-05-21'
      op.product_2_id = p.id AND p.dt = '{run_date}'
      WHERE
      -- op.dt = '2019-05-21'
      op.dt = '{run_date}'
      AND
      opportunity_id IS NOT NULL
      AND
      product_name_c IS NOT NULL
      AND
      op.is_deleted = false
      -- AND
      -- line_item_id_c IS NOT NULL
    ),
    product_mapping AS
    (
      SELECT -- exclude odd extraneous products and manually change allocation for select deals
      opportunity_id,
      op.product_name,
      COALESCE(mapped_product, 'unmapped') AS mapped_product,
      CASE
        WHEN op.opportunity_id = '0063600000B1icTAAR' AND op.product_2_id = '01t36000001SzdWAAS' THEN 0.0 --Search
        WHEN op.opportunity_id = '00636000004JKS0AAO' AND op.product_2_id = '01t36000000iDrwAAE' THEN 42.0 --Maps
        WHEN op.opportunity_id = '00636000004JKS0AAO' AND op.product_2_id = '01t36000001SzdWAAS' THEN 58.0 --Search
        WHEN op.opportunity_id = '00636000006IgpvAAC' AND op.product_2_id = '01t36000000iDrwAAE' THEN 9.0 --Maps
        WHEN op.opportunity_id = '00636000006IgpvAAC' AND op.product_2_id = '01t36000001SzdWAAS' THEN 91.0 --Search
        WHEN op.opportunity_id = '00636000008xsoWAAQ' AND op.product_2_id = '01t360000007IMZAA2' THEN 3.0 --Maps
        WHEN op.opportunity_id = '00636000008xsoWAAQ' AND op.product_2_id = '01t36000000iDrwAAE' THEN 79.5 --Maps
        WHEN op.opportunity_id = '00636000008xsoWAAQ' AND op.product_2_id = '01t36000001SzdWAAS' THEN 17.5 --Search
        WHEN op.opportunity_id = '00636000008xsoWAAQ' AND op.product_2_id = '01t36000003Mn0JAAS' THEN 0.0 --Search
        WHEN op.opportunity_id = '0063600000IH8B9AAL' AND op.product_2_id = '01t360000005tjBAAQ' THEN 0.2 --Maps
        WHEN op.opportunity_id = '0063600000IH8B9AAL' AND op.product_2_id = '01t360000007IMZAA2' THEN 9.6 --Maps
        WHEN op.opportunity_id = '0063600000IH8B9AAL' AND op.product_2_id = '01t36000001SzdWAAS' THEN 61.4 --Search
        WHEN op.opportunity_id = '0063600000IH8B9AAL' AND op.product_2_id = '01t36000004WEfoAAG' THEN 28.3 --Maps
        WHEN op.opportunity_id = '0063600000IH8B9AAL' AND op.product_2_id = '01t36000004OtprAAC' THEN 0.5 --Support
        WHEN op.opportunity_id = '0061R00000ogYPlQAM' AND op.product_2_id = '01t36000005vWnJAAU' THEN 13.33 --Maps
        WHEN op.opportunity_id = '0061R00000ogYPlQAM' AND op.product_2_id = '01t36000005vg85AAA' THEN 66.67 --Data Services
        WHEN op.opportunity_id = '0061R00000ogYPlQAM' AND op.product_2_id = '01t36000005vX1wAAE' THEN 20.0 --Support
        ELSE total_price END AS list_price_value
      FROM
      opportunity_product_rev op
      LEFT JOIN
      (
        SELECT
        name AS product_name,
        COALESCE(MAX(service_organization_c), 'unmapped') AS mapped_product,
        COALESCE(MAX(service_organization_c), 'unmapped') AS mid_product,
        COALESCE(MAX(sku_id_c), 'unknown') AS endpoint
        FROM
        sfdc.products
        WHERE
        dt = '{run_date}'
        GROUP BY
        name
      ) sm
      ON
      op.product_name = sm.product_name
      WHERE
      total_price > 0 -- zero out negative values
      AND
      (
        netsuite_conn_net_suite_item_key_id_c IS NOT NULL -- one or more products has a netsuite key
        OR
        num_netsuite_items = 0 -- no products have a netsuite key
      )
    ),
    pricing_by_method AS
    (
      SELECT -- sum up values in case there are duplicate product names
      opportunity_id,
      mapped_product,
      product_name,
      SUM(list_price_value) AS list_price_value
      FROM
      product_mapping
      GROUP BY
      opportunity_id,
      mapped_product,
      product_name
    ),
    opportunity_to_name AS
    (
      SELECT  -- make sure there is only one name per id
      id AS account_id,
      CONCAT_WS(',', COLLECT_SET(LOWER(name))) AS account_name
      FROM
      sfdc.accounts
      WHERE
      dt = '{run_date}'
      -- dt = '2019-05-21'
      GROUP BY
      id
    ),
    opportunity_arr_tmp_org AS
    (
      SELECT -- get opportunities
      so.account_id,
      COALESCE(account_name, so.account_id) AS account_name,
      id AS opportunity_id,
      prior_opportunity_c AS prior_opportunity_id,
      DATE_FORMAT(service_start_date_c, 'yyyy-MM-dd') AS service_start_day,
      DATE_FORMAT(CASE WHEN stage_name NOT IN ('Won', '7 - ICR', 'Won - Pending') AND service_start_date_c IS NOT NULL THEN service_start_date_c ELSE effective_date_c END, 'yyyy-MM-dd') AS effective_day, -- change effective date to service start for non-won contracts
      CASE
        WHEN service_end_date_c IS NULL THEN ADD_MONTHS(DATE_FORMAT(service_start_date_c, 'yyyy-MM-dd'), 12)
        ELSE DATE_FORMAT(service_end_date_c, 'yyyy-MM-dd') END AS service_end_day,
      stage_name,
      type,
      COALESCE(CAST(arr_c AS DOUBLE), 0) AS arr
      FROM
      sfdc.opportunities so
      LEFT JOIN
      opportunity_to_name otn
      ON
      so.account_id = otn.account_id
      WHERE
      dt = '{run_date}'
      -- dt = '2019-05-21'
      AND
      service_start_date_c IS NOT NULL
      AND
      so.account_id IS NOT NULL -- exclude ones that do not have an account name
      AND
      is_deleted = false
      AND
      (
        stage_name IN ('Won', '7 - ICR', 'Won - Pending') -- only count Won for ARR calculations, maybe stage-7
        OR
        (type = 'Renewal Business' AND stage_name NOT IN ('Lost', 'Dead', 'Closed - No Decision'))
      )
      AND
      non_enterprise_c = false -- only count enterprise
      AND
      (
        service_end_date_c IS NULL -- there is no end date or the end date is in a different month following the effective month
        OR
        -- CAST(DATE_TRUNC('month', service_end_date_c) AS DATE) > CAST(DATE_TRUNC('month', effective_date_c) AS DATE)
        -- DATE_FORMAT(service_end_date_c, 'yyyy-MM') > DATE_FORMAT(effective_date_c, 'yyyy-MM')
        DATE_FORMAT(service_end_date_c, 'yyyy-MM') > DATE_FORMAT(service_start_date_c, 'yyyy-MM')
      )
      AND
      COALESCE(CAST(arr_c AS DOUBLE), 0) > 0 -- positive arr only
    ),
    opportunity_arr_tmp AS
    (
      SELECT -- restrict non-won renewals to be within 45 days of the snapshot date if the previous contract has already ended
      a.account_id,
      a.account_name,
      a.opportunity_id,
      a.prior_opportunity_id,
      a.service_start_day,
      a.effective_day,
      a.service_end_day,
      a.stage_name,
      a.type,
      CASE WHEN a.stage_name IN ('Won', '7 - ICR', 'Won - Pending') THEN a.arr ELSE b.arr END AS arr
      FROM
      opportunity_arr_tmp_org a
      LEFT JOIN
      opportunity_arr_tmp_org b
      ON
      COALESCE(a.prior_opportunity_id, '') = b.opportunity_id -- must coalesce to prevent dropping
      WHERE
      a.stage_name IN ('Won', '7 - ICR', 'Won - Pending') -- keep all won opps
      OR
      (
        a.stage_name NOT IN ('Won', '7 - ICR', 'Won - Pending') -- not won renewal opp
        AND
        b.service_end_day >= DATE_SUB('{run_date}', 45) -- prior ends in future or ended within last 45 days
        -- b.service_end_day >= DATE_SUB('2019-05-20', 45) -- prior ends in future or ended within last 45 days
      )
    ),
    opportunity_arr AS
    (
      SELECT -- if a renewal occurs before the original ends, supercede the original with the renewal
      oa.account_id,
      oa.account_name,
      oa.opportunity_id,
      oa.prior_opportunity_id,
      oa.service_start_day,
      CASE WHEN oa.opportunity_id IN ('0063600000VQA62AAH', '00636000004NSbIAAW', '0063600000VwjuXAAR') THEN oa.service_start_day ELSE oa.effective_day END AS effective_day,
      CASE WHEN DATE_FORMAT(oaa.service_start_day, 'yyyy-MM-01') <= DATE_FORMAT(oa.service_end_day, 'yyyy-MM-01') THEN ADD_MONTHS(DATE_FORMAT(oaa.service_start_day, 'yyyy-MM-01'), -1) ELSE oa.service_end_day END AS service_end_day,
      CASE WHEN DATE_FORMAT(oaa.service_start_day, 'yyyy-MM-01') <= DATE_FORMAT(oa.service_end_day, 'yyyy-MM-01') THEN true ELSE false END AS is_superceded,
      oa.stage_name,
      oa.type,
      oa.arr
      FROM
      opportunity_arr_tmp oa -- former opp
      INNER JOIN
      opportunity_arr_tmp oaa
      ON
      -- oa.opportunity_id = oaa.prior_opportunity_id AND oaa.type = 'Renewal Business' AND oaa.prior_opportunity_id IS NOT NULL AND oa.account_id = oaa.account_id
      oa.opportunity_id = oaa.prior_opportunity_id AND oaa.type = 'Renewal Business' AND oaa.prior_opportunity_id IS NOT NULL
      UNION ALL
      SELECT
      a.account_id,
      a.account_name,
      a.opportunity_id,
      a.prior_opportunity_id,
      a.service_start_day,
      CASE WHEN a.opportunity_id IN ('0063600000VQA62AAH', '00636000004NSbIAAW', '0063600000VwjuXAAR') THEN a.service_start_day ELSE a.effective_day END AS effective_day,
      a.service_end_day,
      false AS is_superceded,
      a.stage_name,
      a.type,
      a.arr
      FROM
      opportunity_arr_tmp a -- former opp
      LEFT JOIN
      opportunity_arr_tmp b
      ON
      -- a.opportunity_id = b.prior_opportunity_id AND b.type = 'Renewal Business' AND b.prior_opportunity_id IS NOT NULL AND a.account_id = b.account_id
      a.opportunity_id = b.prior_opportunity_id AND b.type = 'Renewal Business' AND b.prior_opportunity_id IS NOT NULL
      WHERE
      b.opportunity_id IS NULL
    ),
    pricing_by_method_nw AS
    ( --  duplicate products of prior opportunity for non-won renewals
      SELECT -- won opps
      pm.opportunity_id,
      mapped_product,
      product_name,
      list_price_value
      FROM
      pricing_by_method pm
      INNER JOIN
      opportunity_arr oa
      ON
      pm.opportunity_id = oa.opportunity_id
      WHERE
      oa.stage_name IN ('Won', '7 - ICR', 'Won - Pending') -- won
      UNION ALL
      SELECT
      a.opportunity_id, -- non-won opp id
      mapped_product, -- previous opps values
      product_name,
      list_price_value
      FROM
      (
        SELECT
        opportunity_id,
        prior_opportunity_id
        FROM
        opportunity_arr
        WHERE
        stage_name NOT IN ('Won', '7 - ICR', 'Won - Pending') -- non-won opps
        AND
        prior_opportunity_id IS NOT NULL -- should always be true
      ) a
      INNER JOIN
      pricing_by_method pmm
      ON
      a.prior_opportunity_id = pmm.opportunity_id
    ),
    account_product AS
    (
      SELECT -- get all product associated with an account
      account_id,
      COALESCE(mapped_product, 'maps') AS mapped_product,
      COALESCE(product_name, 'unknown') AS product_name,
      -- MIN(effective_day) AS effective_day,
      MIN(service_start_day) AS service_start_day,
      MAX(service_end_day) AS service_end_day
      FROM
      opportunity_arr oa
      LEFT JOIN
      pricing_by_method_nw pm
      ON
      oa.opportunity_id = pm.opportunity_id
      GROUP BY
      account_id,
      mapped_product,
      product_name
    ),
    arr_by_month_dummy AS
    (
      SELECT -- get the months in between opps for all account, product name, month combinations
      am.account_id,
      COALESCE(account_name, am.account_id) AS account_name,
      mapped_product,
      product_name,
      year_month AS service_month
      FROM
      (
        SELECT
        year_month,
        'dummy' AS dummy
        FROM
        wbr.year_month_dummy_final
      ) ym
      INNER JOIN
      (
        SELECT
        account_id,
        mapped_product,
        product_name,
        MAX('dummy') AS dummy,
        MIN(DATE_FORMAT(service_start_day, 'yyyy-MM-01')) AS min_month, -- get first month and last month ever
        MAX(DATE_FORMAT(ADD_MONTHS(service_end_day, 1), 'yyyy-MM-01'))AS max_month  -- add month to the end
        FROM
        account_product
        GROUP BY
        account_id,
        mapped_product,
        product_name
      ) am
      ON
      ym.dummy = am.dummy
      LEFT JOIN
      opportunity_to_name otn
      ON
      am.account_id = otn.account_id
      WHERE
      year_month BETWEEN min_month AND max_month
    ),
    opportunity_product AS
    (
      SELECT -- join opps to products
      oa.*,
      COALESCE(product_name, 'unknown') AS product_name, -- coalesce to unknown
      COALESCE(CASE WHEN mapped_product IN ('bundled') THEN 'maps' ELSE mapped_product END, 'maps') AS mapped_product, -- change to maps
      COALESCE(list_price_value, 0) AS list_price_value,
      CASE WHEN product_name IS NOT NULL AND mapped_product NOT IN ('bundled') THEN COALESCE(list_price_value, 0) ELSE 0 END AS product_value, -- total non-bundled value
      COUNT(product_name) OVER (PARTITION BY oa.account_id, oa.opportunity_id) AS num_items, -- can be zero
      SUM(COALESCE(list_price_value, 0)) OVER (PARTITION BY oa.account_id, oa.opportunity_id) AS total_value,
      SUM(CASE WHEN product_name IS NOT NULL AND mapped_product NOT IN ('bundled') THEN COALESCE(list_price_value, 0) ELSE 0 END) OVER (PARTITION BY oa.account_id, oa.opportunity_id) AS total_product_value
      FROM
      opportunity_arr oa
      LEFT JOIN
      pricing_by_method_nw pm
      ON
      oa.opportunity_id = pm.opportunity_id
    ),
    opp_product_share AS -- attributes arr by product by opportunity based on share
    (
      SELECT
      DATE_FORMAT(service_start_day, 'yyyy-MM-01') AS service_start_month,
      DATE_FORMAT(service_end_day_r, 'yyyy-MM-01') AS service_end_month,
      a.*,
      arr * share AS arr_p
      FROM
      (
        SELECT
        *,
        CASE
          WHEN mapped_product = 'maps' AND product_name = 'unknown' AND num_items <= 1 THEN 1 -- attribute all non-matches to the unknown maps product
          WHEN (total_value > 0 AND total_product_value = 0) THEN list_price_value / total_value -- REVISED (no positive products)
          WHEN total_product_value > 0 THEN product_value / total_product_value -- products only
          ELSE 0 END AS share,
        service_end_day AS service_end_day_r
        FROM
        opportunity_product
      ) a
    ),
    arr_product_exp AS
    (
      SELECT -- join to expanded months table to get arr for in between months
      md.account_id,
      md.account_name,
      md.product_name,
      CASE WHEN md.mapped_product IN ('bundled') THEN 'maps' ELSE md.mapped_product END AS mapped_product, -- change to maps
      service_month,
      COLLECT_SET(opportunity_id) AS opportunity_id_s, -- distinct
      COLLECT_SET(CASE WHEN stage_name NOT IN ('Won', '7 - ICR', 'Won - Pending') THEN opportunity_id ELSE NULL END) AS opportunity_id_nw, -- distinct
      SUM(arr_p) AS arr_p
      FROM
      arr_by_month_dummy md
      LEFT JOIN
      opp_product_share ps
      ON
      md.account_id = ps.account_id AND md.product_name = ps.product_name AND md.service_month BETWEEN ps.service_start_month AND ps.service_end_month
      GROUP BY
      md.account_id,
      md.account_name,
      md.product_name,
      md.mapped_product,
      service_month
    ),
    arr_product_prev AS
    (
      SELECT -- get the previous opps value
      account_id,
      account_name,
      mapped_product,
      product_name,
      service_month,
      COALESCE(ROUND(arr_p, 2), 0) AS arr_p,
      COALESCE(LAG(ROUND(arr_p, 2)) OVER (PARTITION BY account_id, product_name ORDER BY service_month ASC), 0) AS prev_arr_p,
      opportunity_id_s AS opportunity_id,
      CONCAT_WS(',', opportunity_id_s) AS opportunity_id_p,
      LAG(CONCAT_WS(',', opportunity_id_s)) OVER (PARTITION BY account_id, product_name ORDER BY service_month ASC) AS prev_opportunity_id_p,
      CONCAT_WS(',', opportunity_id_nw) AS opportunity_id_nw
      FROM
      arr_product_exp
    ),
    arr_product_status AS
    (
      SELECT -- get the status at the opp product level
      account_id,
      account_name,
      mapped_product,
      product_name,
      service_month,
      CASE
        WHEN arr_p > 0 AND prev_arr_p = 0 THEN 'new'
        WHEN arr_p = 0 AND prev_arr_p > 0 THEN 'churn'
        WHEN arr_p > 0 AND arr_p < prev_arr_p THEN 'contraction'
        WHEN arr_p > 0 AND arr_p > prev_arr_p THEN 'expansion'
        WHEN arr_p > 0 AND arr_p = prev_arr_p AND opportunity_id_p <> prev_opportunity_id_p THEN 'renewal'
        WHEN arr_p > 0 AND arr_p = prev_arr_p AND opportunity_id_p = prev_opportunity_id_p THEN 'active'
        WHEN arr_p = 0 AND prev_arr_p = 0 THEN 'not_active'
        ELSE 'unknown' END AS product_status,
      arr_p,
      prev_arr_p,
      opportunity_id,
      opportunity_id_p,
      prev_opportunity_id_p,
      opportunity_id_nw
      FROM
      arr_product_prev
    ),
    arr_product_status_mid AS
    (
      SELECT -- create arr with midlevel product aggregations
      COALESCE(mid_product, 'unmapped') AS mid_product,
      ps.*
      FROM
      arr_product_status ps
      LEFT JOIN
      wbr.product_service_mapping sm
      ON
      ps.product_name = sm.product_name
    ),
    arr_account AS
    (
      SELECT -- get the status at the account level
      account_id,
      service_month,
      arr_a,
      prev_arr_a,
      CASE
        WHEN arr_a > 0 AND prev_arr_a = 0 THEN 'new'
        WHEN arr_a = 0 AND prev_arr_a > 0 THEN 'churn'
        WHEN arr_a > 0 AND arr_a < prev_arr_a THEN 'contraction'
        WHEN arr_a > 0 AND arr_a > prev_arr_a THEN 'expansion'
        WHEN arr_a > 0 AND arr_a = prev_arr_a AND opportunity_id_a <> prev_opportunity_id_a THEN 'renewal'
        WHEN arr_a > 0 AND arr_a = prev_arr_a AND opportunity_id_a = prev_opportunity_id_a THEN 'active'
        WHEN arr_a = 0 AND prev_arr_a = 0 THEN 'not_active'
        ELSE 'unknown' END AS account_status
      FROM
      (
        SELECT
        account_id,
        service_month,
        ROUND(arr_a, 2) AS arr_a,
        ROUND(COALESCE(LAG(arr_a) OVER (PARTITION BY account_id ORDER BY service_month ASC), 0), 2) AS prev_arr_a,
        opportunity_id_a,
        prev_opportunity_id_a
        FROM
        (
          SELECT
          account_id,
          service_month,
          SUM(arr_a) OVER (PARTITION BY account_id ORDER BY service_month ASC) AS arr_a,
          -- ARRAY_JOIN(opportunity_id_a, ',') AS opportunity_id_a,
          -- LAG(ARRAY_JOIN(opportunity_id_a, ',')) OVER (PARTITION BY account_id ORDER BY service_month ASC) AS prev_opportunity_id_a
          CONCAT_WS(',', opportunity_id_a) AS opportunity_id_a,
          LAG(CONCAT_WS(',', opportunity_id_a)) OVER (PARTITION BY account_id ORDER BY service_month ASC) AS prev_opportunity_id_a
          FROM
          (
            SELECT
            aa.account_id,
            aa.service_month,
            arr_a,
            opportunity_id_a
            FROM
            (
              SELECT
              account_id,
              service_month,
              SUM(COALESCE(arr_p, 0) - COALESCE(prev_arr_p, 0)) AS arr_a
              FROM
              arr_product_status_mid
              GROUP BY
              account_id,
              service_month
            ) aa
            LEFT JOIN
            (
              SELECT
              account_id,
              service_month,
              COLLECT_SET(opp) AS opportunity_id_a
              FROM
              arr_product_status_mid
              LATERAL VIEW EXPLODE (opportunity_id) t AS opp
              GROUP BY
              account_id,
              service_month
            ) bb
            ON
            aa.account_id = bb.account_id AND aa.service_month = bb.service_month
          ) a
        ) b
      ) c
    ),
    arr_mapped_product AS
    (
      SELECT -- get the status at the mapped product level
      account_id,
      mapped_product,
      service_month,
      arr_m,
      prev_arr_m,
      CASE
        WHEN arr_m > 0 AND prev_arr_m = 0 THEN 'new'
        WHEN arr_m = 0 AND prev_arr_m > 0 THEN 'churn'
        WHEN arr_m > 0 AND arr_m < prev_arr_m THEN 'contraction'
        WHEN arr_m > 0 AND arr_m > prev_arr_m THEN 'expansion'
        WHEN arr_m > 0 AND arr_m = prev_arr_m AND opportunity_id_m <> prev_opportunity_id_m THEN 'renewal'
        WHEN arr_m > 0 AND arr_m = prev_arr_m AND opportunity_id_m = prev_opportunity_id_m THEN 'active'
        WHEN arr_m = 0 AND prev_arr_m = 0 THEN 'not_active'
        ELSE 'unknown' END AS mapped_status
      FROM
      (
        SELECT
        account_id,
        mapped_product,
        service_month,
        ROUND(arr_m, 2) AS arr_m,
        ROUND(COALESCE(LAG(arr_m) OVER (PARTITION BY account_id, mapped_product ORDER BY service_month ASC), 0), 2) AS prev_arr_m,
        opportunity_id_m,
        prev_opportunity_id_m
        FROM
        (
          SELECT
          account_id,
          mapped_product,
          service_month,
          SUM(arr_m) OVER (PARTITION BY account_id, mapped_product ORDER BY service_month ASC) AS arr_m,
          CONCAT_WS(',', opportunity_id_m) AS opportunity_id_m,
          LAG(CONCAT_WS(',', opportunity_id_m)) OVER (PARTITION BY account_id, mapped_product ORDER BY service_month ASC) AS prev_opportunity_id_m
          FROM
          (
            SELECT
            aa.account_id,
            aa.service_month,
            aa.mapped_product,
            arr_m,
            opportunity_id_m
            FROM
            (
              SELECT
              account_id,
              mapped_product,
              service_month,
              SUM(COALESCE(arr_p, 0) - COALESCE(prev_arr_p, 0)) AS arr_m
              FROM
              arr_product_status_mid
              GROUP BY
              account_id,
              mapped_product,
              service_month
            ) aa
            LEFT JOIN
            (
              SELECT
              account_id,
              mapped_product,
              service_month,
              COLLECT_SET(opp) AS opportunity_id_m
              FROM
              arr_product_status_mid
              LATERAL VIEW EXPLODE (opportunity_id) t AS opp -- what if the opportunity_id array is just NULL due to the LAG, does this drop that row?
              GROUP BY
              account_id,
              mapped_product,
              service_month
            ) bb
            ON
            aa.account_id = bb.account_id AND aa.service_month = bb.service_month AND aa.mapped_product = bb.mapped_product
          ) a
        ) b
      ) c
    ),
    arr_mid_product AS
    (
      SELECT -- get the status at the mid level
      account_id,
      mid_product,
      service_month,
      arr_mid,
      prev_arr_mid,
      CASE
        WHEN arr_mid > 0 AND prev_arr_mid = 0 THEN 'new'
        WHEN arr_mid = 0 AND prev_arr_mid > 0 THEN 'churn'
        WHEN arr_mid > 0 AND arr_mid < prev_arr_mid THEN 'contraction'
        WHEN arr_mid > 0 AND arr_mid > prev_arr_mid THEN 'expansion'
        WHEN arr_mid > 0 AND arr_mid = prev_arr_mid AND opportunity_id_mid <> prev_opportunity_id_mid THEN 'renewal'
        WHEN arr_mid > 0 AND arr_mid = prev_arr_mid AND opportunity_id_mid = prev_opportunity_id_mid THEN 'active'
        WHEN arr_mid = 0 AND prev_arr_mid = 0 THEN 'not_active'
        ELSE 'unknown' END AS mid_status
      FROM
      (
        SELECT
        account_id,
        mid_product,
        service_month,
        ROUND(arr_mid, 2) AS arr_mid,
        ROUND(COALESCE(LAG(arr_mid) OVER (PARTITION BY account_id, mid_product ORDER BY service_month ASC), 0), 2) AS prev_arr_mid,
        opportunity_id_mid,
        prev_opportunity_id_mid
        FROM
        (
          SELECT
          account_id,
          mid_product,
          service_month,
          SUM(arr_mid) OVER (PARTITION BY account_id, mid_product ORDER BY service_month ASC) AS arr_mid,
          CONCAT_WS(',', opportunity_id_mid) AS opportunity_id_mid,
          LAG(CONCAT_WS(',', opportunity_id_mid)) OVER (PARTITION BY account_id, mid_product ORDER BY service_month ASC) AS prev_opportunity_id_mid
          FROM
          (
            SELECT
            aa.account_id,
            aa.service_month,
            aa.mid_product,
            arr_mid,
            opportunity_id_mid
            FROM
            (
              SELECT
              account_id,
              mid_product,
              service_month,
              SUM(COALESCE(arr_p, 0) - COALESCE(prev_arr_p, 0)) AS arr_mid
              FROM
              arr_product_status_mid
              GROUP BY
              account_id,
              mid_product,
              service_month
            ) aa
            LEFT JOIN
            (
              SELECT
              account_id,
              mid_product,
              service_month,
              COLLECT_SET(opp) AS opportunity_id_mid
              FROM
              arr_product_status_mid
              LATERAL VIEW EXPLODE (opportunity_id) t AS opp
              GROUP BY
              account_id,
              mid_product,
              service_month
            ) bb
            ON
            aa.account_id = bb.account_id AND aa.service_month = bb.service_month AND aa.mid_product = bb.mid_product
          ) a
        ) b
      ) c
    ),
    arr_full_status AS
    (
      SELECT -- get combined table with all the statuses
      ps.service_month,
      ps.account_id,
      account_name,
      ps.mapped_product,
      ps.mid_product,
      product_name,
      account_status,
      mapped_status,
      mid_status,
      product_status,
      arr_p - prev_arr_p AS arr_p,
      arr_p AS cum_arr_p,
      prev_arr_p AS prev_cum_arr_p,
      -- cum_arr_p,
      -- opportunity_id,
      opportunity_id_p,
      prev_opportunity_id_p,
      opportunity_id_nw
      -- opportunity_json_p
      FROM
      arr_product_status_mid ps
      INNER JOIN
      arr_account aa
      ON
      ps.account_id = aa.account_id AND ps.service_month = aa.service_month
      INNER JOIN
      arr_mapped_product mp
      ON
      ps.account_id = mp.account_id AND ps.service_month =  mp.service_month AND ps.mapped_product =  mp.mapped_product
      INNER JOIN
      arr_mid_product md
      ON
      ps.account_id = md.account_id AND ps.service_month =  md.service_month AND ps.mid_product =  md.mid_product
    )
    SELECT
    service_month,
    account_id,
    account_name,
    mapped_product,
    mid_product,
    product_name,
    account_status,
    mapped_status,
    mid_status,
    product_status,
    arr_p,
    cum_arr_p,
    prev_cum_arr_p,
    opportunity_id_p,
    prev_opportunity_id_p,
    opportunity_id_nw
    FROM
    arr_full_status
"""

In [62]:
# The cells below use `formatter` and `formatted_query` unconditionally, so
# stop here with a clear message instead of failing later with a NameError
# (or silently reusing values left over from an earlier run).
if not is_cte(query):
    raise ValueError(
        "Expected a CTE query starting with 'WITH'; "
        "use flatten_pure_nested() for plain nested queries."
    )

formatter = column_parser.Parser(query)
formatted_query = formatter.format_query(query)
query_list = formatted_query.split('\n')

In [63]:
# Split the formatted CTE query into a mapping of alias -> query text; the
# cells below iterate it with .items() and inspect its keys.
cte_dict = formatter.parse_cte(formatted_query)
# (Evaluate `cte_dict` on its own line here to inspect the full mapping.)

In [64]:
# A single Unbundle instance serves every has_child() probe in the loop.
unbundled = unbundle.Unbundle(formatted_query)
final_list = []
for alias, cte_query in cte_dict.items():
    # CTE bodies that still contain subqueries are flattened into a list of
    # levels; bodies without children are stored as their raw query text.
    entry = flatten_pure_nested(cte_query) if unbundled.has_child(cte_query) else cte_query
    final_list.append({alias: entry})
    

In [70]:
# Dump every {alias: flattened query} entry, separated by blank lines.
for query_d in final_list:
    print(query_d, '\n\n')

{'opportunity_product_rev': "SELECT opportunity_id,\n          op.product_2_id, p.name AS product_name,\n                           COALESCE(CAST(quantity AS DOUBLE) * CAST(list_price AS DOUBLE), 0) AS total_price,\n                           netsuite_conn_netsuite_item_key_id_c AS netsuite_conn_net_suite_item_key_id_c,\n                           COUNT(netsuite_conn_netsuite_item_key_id_c) OVER (PARTITION BY opportunity_id) AS num_netsuite_items\n   FROM sfdc.opportunity_product op\n   LEFT JOIN sfdc.products p ON op.product_2_id = p.id\n   AND p.dt = '{run_date}'\n   WHERE op.dt = '{run_date}'\n     AND opportunity_id IS NOT NULL\n     AND product_name_c IS NOT NULL\n     AND op.is_deleted = FALSE "} 


{'product_mapping': [{'level_1_main': "SELECT opportunity_id,        op.product_name,        COALESCE(mapped_product, 'unmapped') AS mapped_product,        CASE            WHEN op.opportunity_id = '0063600000B1icTAAR'                 AND op.product_2_id = '01t36000001SzdWAAS' THEN 0.0

In [69]:
# List the aliases extracted by parse_cte to spot mis-parsed entries.
cte_dict.keys()
# FIXME: the alias that follows opp_product_share is parsed wrong -- it comes
# out as 'FROM opportunity_product) a), arr_product_exp' rather than
# (presumably) plain 'arr_product_exp'.

dict_keys(['opportunity_product_rev', 'product_mapping', 'pricing_by_method', 'opportunity_to_name', 'opportunity_arr_tmp_org', 'opportunity_arr_tmp', 'opportunity_arr', 'pricing_by_method_nw', 'account_product', 'arr_by_month_dummy', 'opportunity_product', 'opp_product_share', 'FROM opportunity_product) a), arr_product_exp', 'arr_product_prev', 'arr_product_status', 'arr_product_status_mid', 'arr_account', 'arr_mapped_product', 'arr_mid_product', 'arr_full_status', 'main'])

In [72]:
# Sample of flattened SQL kept for inspection.
# NOTE(review): the first SELECT's WHERE clause precedes its FROM/JOIN clause,
# which only shows up at the very end of the string -- this looks like
# mis-ordered flattener output; confirm before relying on it.
s = "SELECT pm.opportunity_id,        mapped_product,        product_name,        list_price_value WHERE oa.stage_name IN ('Won',                         '7 - ICR',                         'Won - Pending') UNION ALL SELECT a.opportunity_id,        mapped_product,        product_name,        list_price_value FROM   (SELECT opportunity_id,           prior_opportunity_id    FROM opportunity_arr    WHERE stage_name NOT IN ('Won',                             '7 - ICR',                             'Won - Pending')      AND prior_opportunity_id IS NOT NULL ) a INNER JOIN pricing_by_method pmm ON a.prior_opportunity_id = pmm.opportunity_id FROM pricing_by_method pm INNER JOIN opportunity_arr oa ON pm.opportunity_id = oa.opportunity_id "
print(s)


SELECT pm.opportunity_id,        mapped_product,        product_name,        list_price_value WHERE oa.stage_name IN ('Won',                         '7 - ICR',                         'Won - Pending') UNION ALL SELECT a.opportunity_id,        mapped_product,        product_name,        list_price_value FROM   (SELECT opportunity_id,           prior_opportunity_id    FROM opportunity_arr    WHERE stage_name NOT IN ('Won',                             '7 - ICR',                             'Won - Pending')      AND prior_opportunity_id IS NOT NULL ) a INNER JOIN pricing_by_method pmm ON a.prior_opportunity_id = pmm.opportunity_id FROM pricing_by_method pm INNER JOIN opportunity_arr oa ON pm.opportunity_id = oa.opportunity_id 


## type C: subquery ( CTE ( subqueries ) )

SELECT FROM (SELECT FROM (WITH a AS ()...))

In [None]:
# Type C sample: two outer SELECTs wrapping a WITH clause whose CTEs
# (reg_users, non_reg_users, mql_flag, cleaned_list) contain nested
# subqueries of their own. The literal is not a raw string, so \n, \t and \'
# are real escape sequences. Note that this rebinds `query`, replacing the
# CTE sample used by the cells above.
query = """SELECT * FROM (SELECT *\nFROM (\n  with reg_users as (\n  \n  \tselect * \n  \tfrom (\n  \tselect \n  \t        a.*\n  \t        , b.*\n  \t        , c.*\n  \t        , d.*\n  \tfrom \n  \t(\n  \tselect \n  \t        distinct \n  \t        anonymous_id\n  \t        , user_id\n  \tfrom mapbox_customer_data.segment_identifies\n  \twhere dt >= \'2018-07-01\'\n  \tand anonymous_id is not null\n  \tand user_id is not null\n  \t) a\n  \n  \tleft join \n  \n  \t(\n  \tselect \n  \t        id \n  \t        , email\n  \t        , created\n  \tfrom mapbox_customer_data.accounts\n  \twhere cast(dt as DATE) = CURRENT_DATE - INTERVAL \'1\' DAY \n  \t) b\n  \t        on a.user_id = b.id\n  \n  \tleft join \n  \n  \t(\n  \t    \n  \t        select        \n  \t                anonymous_id as anon_id_ad\n  \t                , context_campaign_name\n  \t                , min(timestamp) as min_exposure\n  \t        from mapbox_customer_data.segment_pages\n  \t        where dt >= \'2018-07-01\'\n  \t        and context_campaign_name is not null\n  \t        group by 1,2\n  \n  \t) c \n  \t        on a.anonymous_id = c.anon_id_ad\n  \t        \n  \tleft join \n  \n  \t(\n  \t        select \n  \t                distinct\n  \t                anonymous_id as anon_id_event\n  \t                , original_timestamp\n  \t                , event\n  \t                , context_traits_email\n  \t        from mapbox_customer_data.segment_tracks\n  \t        where dt >= \'2018-07-01\'\n  \t        and event like \'submitted_%form\'\n  \t        and context_traits_email is not null\n  \t) d\n  \t        on a.anonymous_id = d.anon_id_event\n  \n  \t) \n  \twhere context_campaign_name is not null\n  \n  ), \n  \n  non_reg_users as (\n  \n  \tselect \n  \t        context_campaign_name\n  \t        , min_exposure\n  \t        , event\n  \t        , original_timestamp as event_timestamp\n  \t        , context_traits_email as event_email\n  \tfrom (\n  \tselect a.*\n  \t        , b.*\n  
\tfrom \n  \t(\n  \t        select \n  \t                anonymous_id as anon_id_ad\n  \t                , context_campaign_name\n  \t                , min(original_timestamp) as min_exposure\n  \t        from (       \n  \t        select        \n  \t                context_campaign_name\n  \t                , anonymous_id\n  \t                , original_timestamp \n  \t        from mapbox_customer_data.segment_pages\n  \t        where dt >= \'2018-07-01\'\n  \t        and context_campaign_name is not null\n  \t        )\n  \t        group by 1,2\n  \t) a\n  \n  \tleft join \n  \t(\n  \t        select \n  \t                distinct\n  \t                anonymous_id as anon_id_event\n  \t                , original_timestamp\n  \t                , event\n  \t                , context_traits_email\n  \t        from mapbox_customer_data.segment_tracks\n  \t        where dt >= \'2018-07-01\'\n  \t        and event like \'submitted_%form\'\n  \t        and context_traits_email is not null\n  \t) b\n  \t        on a.anon_id_ad = b.anon_id_event\n  \n  \t)\n  \twhere anon_id_event is not null\n  \tand to_unixtime(min_exposure) <= to_unixtime(original_timestamp)\n  \tand cast(min_exposure as DATE) >= cast(original_timestamp as DATE) - INTERVAL \'28\' DAY\n  \n  \n  ), \n  \n  mql_flag as (\n  \n  \tselect \n  \t        email\n  \t        , created_date\n  \t        , last_mql_date_c\n  \t        , mql_flag\n  \tfrom (\n  \n  \tselect \n  \t        email\n  \t        , min(created_date) created_date\n  \t        , max(last_mql_date_c) last_mql_date_c\n  \t        , case when max(last_mql_date_c) is not null then 1 else 0 end as mql_flag\n  \t        , sum(case when is_deleted = true then 1 else 0 end) as is_deleted\n  \tfrom sales.salesforce_leads\n  \twhere cast(dt as DATE) = CURRENT_DATE - INTERVAL \'1\' DAY \n  \tgroup by 1\n  \t)\n  \twhere mql_flag = 1\n  \tand is_deleted = 0\n  \n  ),\n  \n  cleaned_list as (\n  \n  \n  \tselect \n  \t\tdistinct \n  \t\t\t*\n  \tfrom 
(\n  \t\tselect \n  \t\t        context_campaign_name\n  \t\t        , min_exposure\n  \t\t        , \'created_an_account\' as event\n  \t\t        , created as event_timestamp\n  \t\t        , email as event_email\n  \t\tfrom reg_users        \n  \t\twhere to_unixtime(min_exposure) <= to_unixtime(created)\n  \t\tand cast(min_exposure as DATE) >= cast(created as DATE) - INTERVAL \'28\' DAY\n  \n  \n  \t\tunion all\n  \n  \n  \t\tselect \n  \t\t        context_campaign_name\n  \t\t        , min_exposure\n  \t\t        , event\n  \t\t        , original_timestamp as event_timestamp\n  \t\t        , context_traits_email as event_email\n  \t\tfrom reg_users      \n  \t\twhere to_unixtime(min_exposure) <= to_unixtime(original_timestamp)\n  \t\tand cast(min_exposure as DATE) >= cast(original_timestamp as DATE) - INTERVAL \'28\' DAY\n  \n  \t\tunion all\n  \n  \t\tselect * \n  \t\tfrom non_reg_users\n  \t)\n  \n  )\n  \n  \n  \n  select \n  \ta.*\n  \t, b.*\n  from cleaned_list a\n  left join mql_flag b \n  \ton a.event_email = b.email\n) "custom_sql_query"\nLIMIT 0) T LIMIT 0
"""
