In [1]:
from sqlanalyzer import column_parser, unbundle
import sqlparse
import re
import json
import pandas as pd
import time


def flatten_subquery(final_list, sub_queries, level_num):
    """Flatten one nesting level of ``sub_queries`` into ``final_list``.

    Every query in ``sub_queries`` that still contains a nested subquery is
    passed to ``Unbundle.restructure_subquery``; the dict it returns is
    appended to ``final_list``, and so is each returned child that has no
    further nesting. Children of *all* processed queries are collected and
    returned so the caller can descend into the next level.

    Args:
        final_list: list of ``{name: sql}`` dicts; extended in place.
        sub_queries: list of ``{alias: sql}`` dicts to process at this level.
        level_num: nesting depth, used to name unaliased queries
            (``'level_<n>_main'``).

    Returns:
        ``(final_list, next_level)`` where ``next_level`` is the list of child
        dicts produced at this level (empty when nothing was nested).
    """
    # Aliases that carry no real name; such queries are labelled by depth.
    unnamed_aliases = ('no alias', '', 'query')

    # Accumulate children separately: rebinding ``sub_queries`` while looping
    # would keep only the children of the last query processed.
    next_level = []

    for q in sub_queries:
        for alias, query in q.items():
            formatter = column_parser.Parser(query)
            formatted_query = formatter.format_query(query)
            unbundled = unbundle.Unbundle(formatted_query)

            if not unbundled.has_child(query):
                # Nothing nested here, so this query adds no entry and no
                # children at this level.
                continue

            if alias in unnamed_aliases:
                name = 'level_{}_main'.format(level_num)
            else:
                name = alias

            # NOTE(review): restructure_subquery is assumed to return the
            # rewritten outer query keyed by ``name`` plus a list of
            # ``{alias: sql}`` children - confirm against sqlanalyzer.
            query_dict, children = unbundled.restructure_subquery({}, name, formatted_query)

            if query_dict != {}:
                final_list.append(query_dict)

            # Children without further nesting are final results already.
            for child in children:
                for _, child_query in child.items():
                    if not unbundled.has_child(child_query):
                        final_list.append(child)

            next_level.extend(children)

    return final_list, next_level


def is_cte(query):
    """Return True when ``query`` begins with a ``WITH`` clause (a CTE).

    Leading whitespace is ignored and the keyword is matched
    case-insensitively as a whole word, so ``'  with a as (...)'`` is
    recognised while an identifier such as ``'WITHDRAWALS'`` is not.
    """
    return re.match(r'\s*WITH\b', query, re.IGNORECASE) is not None


def flatten_pure_nested(query):
    """Flatten a nested (non-CTE) query level by level.

    Starts from the whole query under the placeholder alias ``'query'`` and
    keeps calling ``flatten_subquery`` with an increasing level number until
    it reports no more sub-queries to process.

    Returns:
        The list of dicts accumulated by ``flatten_subquery``.
    """
    flattened = []
    pending = [{'query': query}]
    depth = 1

    while pending != []:
        flattened, pending = flatten_subquery(flattened, pending, level_num=depth)
        depth += 1

    return flattened

## type A: subquery ( subqueries ) 

Shape: `SELECT ... FROM (SELECT ... FROM (...))` — a plain query whose `FROM` clause contains further nested subqueries.

In [2]:
query = """SELECT *
   FROM
     (SELECT a.*,
             b.*,
             c.*,
             d.*
      FROM
        (SELECT DISTINCT anonymous_id,
                         user_id
         FROM mapbox_customer_data.segment_identifies
         WHERE dt >= '2018-07-01'
           AND anonymous_id IS NOT NULL
           AND user_id IS NOT NULL ) a
      LEFT JOIN
        (SELECT id,
                email,
                created
         FROM mapbox_customer_data.accounts
         WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON a.user_id = b.id
      LEFT JOIN
        (SELECT anonymous_id AS anon_id_ad,
                context_campaign_name,
                min(TIMESTAMP) AS min_exposure
         FROM mapbox_customer_data.segment_pages
         WHERE dt >= '2018-07-01'
           AND context_campaign_name IS NOT NULL
         GROUP BY 1,
                  2) c ON a.anonymous_id = c.anon_id_ad
      LEFT JOIN
        (SELECT DISTINCT anonymous_id AS anon_id_event,
                         original_timestamp,
                         event,
                         context_traits_email
         FROM mapbox_customer_data.segment_tracks
         WHERE dt >= '2018-07-01'
           AND event LIKE 'submitted_%form'
           AND context_traits_email IS NOT NULL ) d ON a.anonymous_id = d.anon_id_event
    LEFT JOIN
        (SELECT sfdc_accounts.platform, sfdc_accounts.mobile_os, sfdc_accounts.service_metadata,
sfdc_cases.account, sfdc_cases.num_requests, sfdc_cases.owner, sfdc_accounts.user_id
FROM sfdc.accounts sfdc_accounts
LEFT JOIN 
(SELECT MAX(dt) FROM 
    (SELECT dt 
    FROM sfdc.oppty 
    LEFT JOIN (SELECT MAX(dt) FROM (SELECT DISTINCT dt FROM sfdc.owner AS sfdc_owner) AS dt_owner ON sfdc_oppty.dt = sfdc_cases.dt)
    LEFT JOIN (SELECT dt FROM sfdc.cases) sfdc_cases ON sfdc_oppty.dt = sfdc_cases.dt) )
AS sfdc_cases_oppty ON sfdc_cases_oppty.dt = sfdc_accounts.dt
LEFT JOIN sfdc.cases AS sfdc_cases ON sfdc_cases.id = sfdc_accounts.case_id
WHERE sfdc_cases_oppty.dt > '2020-04-03' AND sfdc_cases_oppty.dt < '2020-05-04' ORDER BY 1 GROUP BY 3 LIMIT 20
        ) e ON e.user_id = a.user_id
        )
   WHERE context_campaign_name IS NOT NULL 
"""


In [3]:
# Flatten the type-A query defined in the previous cell; as the last
# expression of the cell, the returned list is shown as the cell output.
flatten_pure_nested(query)

[{'level_1_main': 'SELECT * WHERE context_campaign_name IS NOT NULL FROM no alias '},
 {'level_2_main': 'SELECT a.*,        b.*,        c.*,        d.* FROM a LEFT JOIN b ON a.user_id = b.id LEFT JOIN c ON a.anonymous_id = c.anon_id_ad LEFT JOIN d ON a.anonymous_id = d.anon_id_event LEFT JOIN e ON e.user_id = a.user_id '},
 {'a': "SELECT DISTINCT anonymous_id, user_id FROM mapbox_customer_data.segment_identifies WHERE dt >= '2018-07-01' AND anonymous_id IS NOT NULL AND user_id IS NOT NULL "},
 {'b': "SELECT id, email, created FROM mapbox_customer_data.accounts WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY "},
 {'c': "SELECT anonymous_id AS anon_id_ad, context_campaign_name, min(TIMESTAMP) AS min_exposure FROM mapbox_customer_data.segment_pages WHERE dt >= '2018-07-01' AND context_campaign_name IS NOT NULL GROUP BY 1, 2"},
 {'d': "SELECT DISTINCT anonymous_id AS anon_id_event, original_timestamp, event, context_traits_email FROM mapbox_customer_data.segment_tracks WHERE dt >=

## type B: CTE ( subqueries )

Shape: `WITH a AS (...) ...` — one or more CTEs, each of which may itself contain nested subqueries.

In [4]:
# Type-B fixture: four CTEs (reg_users, non_reg_users, mql_flag, cleaned_list)
# followed by the final SELECT. The "\n" sequences are escapes in this
# non-raw string, so they become real newlines in the SQL text.
# A stray double quote that trailed the SQL text has been removed.
query = """WITH reg_users AS\n  (SELECT *\n   FROM\n     (SELECT a.*,\n             b.*,\n             c.*,\n             d.*\n      FROM\n        (SELECT DISTINCT anonymous_id,\n                         user_id\n         FROM mapbox_customer_data.segment_identifies\n         WHERE dt >= '2018-07-01'\n           AND anonymous_id IS NOT NULL\n           AND user_id IS NOT NULL ) a\n      LEFT JOIN\n        (SELECT id,\n                email,\n                created\n         FROM mapbox_customer_data.accounts\n         WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON a.user_id = b.id\n      LEFT JOIN\n        (SELECT anonymous_id AS anon_id_ad,\n                context_campaign_name,\n                min(TIMESTAMP) AS min_exposure\n         FROM mapbox_customer_data.segment_pages\n         WHERE dt >= '2018-07-01'\n           AND context_campaign_name IS NOT NULL\n         GROUP BY 1,\n                  2) c ON a.anonymous_id = c.anon_id_ad\n      LEFT JOIN\n        (SELECT DISTINCT anonymous_id AS anon_id_event,\n                         original_timestamp,\n                         event,\n                         context_traits_email\n         FROM mapbox_customer_data.segment_tracks\n         WHERE dt >= '2018-07-01'\n           AND event LIKE 'submitted_%form'\n           AND context_traits_email IS NOT NULL ) d ON a.anonymous_id = d.anon_id_event)\n   WHERE context_campaign_name IS NOT NULL ),\n\n     non_reg_users AS\n  (SELECT context_campaign_name,\n          min_exposure,\n          event,\n          original_timestamp AS event_timestamp,\n          context_traits_email AS event_email\n   FROM\n     (SELECT a.*,\n             b.*\n      FROM\n        (SELECT anonymous_id AS anon_id_ad,\n                context_campaign_name,\n                min(original_timestamp) AS min_exposure\n         FROM\n           (SELECT context_campaign_name,\n                   anonymous_id,\n                   original_timestamp\n            FROM 
mapbox_customer_data.segment_pages\n            WHERE dt >= '2018-07-01'\n              AND context_campaign_name IS NOT NULL )\n         GROUP BY 1,\n                  2) a\n      LEFT JOIN\n        (SELECT DISTINCT anonymous_id AS anon_id_event,\n                         original_timestamp,\n                         event,\n                         context_traits_email\n         FROM mapbox_customer_data.segment_tracks\n         WHERE dt >= '2018-07-01'\n           AND event LIKE 'submitted_%form'\n           AND context_traits_email IS NOT NULL ) b ON a.anon_id_ad = b.anon_id_event)\n   WHERE anon_id_event IS NOT NULL\n     AND to_unixtime(min_exposure) <= to_unixtime(original_timestamp)\n     AND cast(min_exposure AS DATE) >= cast(original_timestamp AS DATE) - INTERVAL '28' DAY ),\n\n     mql_flag AS\n  (SELECT email,\n          created_date,\n          last_mql_date_c,\n          mql_flag\n   FROM\n     (SELECT email,\n             min(created_date) created_date,\n             max(last_mql_date_c) last_mql_date_c,\n             CASE\n                 WHEN max(last_mql_date_c) IS NOT NULL THEN 1\n                 ELSE 0\n             END AS mql_flag,\n             sum(CASE\n                     WHEN is_deleted = TRUE THEN 1\n                     ELSE 0\n                 END) AS is_deleted\n      FROM sales.salesforce_leads\n      WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY\n      GROUP BY 1)\n   WHERE mql_flag = 1\n     AND is_deleted = 0 ),\n\n     cleaned_list AS\n  (SELECT DISTINCT *\n   FROM\n     (SELECT context_campaign_name,\n             min_exposure,\n             'created_an_account' AS event,\n             created AS event_timestamp,\n             email AS event_email\n      FROM reg_users\n      WHERE to_unixtime(min_exposure) <= to_unixtime(created)\n        AND cast(min_exposure AS DATE) >= cast(created AS DATE) - INTERVAL '28' DAY\n      UNION ALL SELECT context_campaign_name,\n                       min_exposure,\n                   
    event,\n                       original_timestamp AS event_timestamp,\n                       context_traits_email AS event_email\n      FROM reg_users\n      WHERE to_unixtime(min_exposure) <= to_unixtime(original_timestamp)\n        AND cast(min_exposure AS DATE) >= cast(original_timestamp AS DATE) - INTERVAL '28' DAY\n      UNION ALL SELECT *\n      FROM non_reg_users))\n\nSELECT a.*,\n       b.*\nFROM cleaned_list a\nLEFT JOIN mql_flag b ON a.event_email = b.email\n\n
"""

In [5]:
# Format the query only when it is a CTE, using the notebook's own is_cte()
# helper so this cell and the helper cannot drift apart.
# `formatter` and `formatted_query` are read by the cells that follow.
if is_cte(query):
    formatter = column_parser.Parser(query)
    formatted_query = formatter.format_query(query)
    # Formatted SQL split into individual lines.
    query_list = formatted_query.split('\n')

In [6]:
# Split the formatted query into its CTEs using the Parser created in the
# preceding cell; judging by the echoed output, the result maps each CTE
# name to its SQL text.
cte_dict = formatter.parse_cte(formatted_query)
cte_dict

{'reg_users': "SELECT *\n   FROM\n     (SELECT a.*,\n             b.*,\n             c.*,\n             d.*\n      FROM\n        (SELECT DISTINCT anonymous_id,\n                         user_id\n         FROM mapbox_customer_data.segment_identifies\n         WHERE dt >= '2018-07-01'\n           AND anonymous_id IS NOT NULL\n           AND user_id IS NOT NULL ) a\n      LEFT JOIN\n        (SELECT id,\n                email,\n                created\n         FROM mapbox_customer_data.accounts\n         WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON a.user_id = b.id\n      LEFT JOIN\n        (SELECT anonymous_id AS anon_id_ad,\n                context_campaign_name,\n                min(TIMESTAMP) AS min_exposure\n         FROM mapbox_customer_data.segment_pages\n         WHERE dt >= '2018-07-01'\n           AND context_campaign_name IS NOT NULL\n         GROUP BY 1,\n                  2) c ON a.anonymous_id = c.anon_id_ad\n      LEFT JOIN\n        (SELECT DISTINCT anony

In [12]:
# Build one {cte_name: ...} entry per CTE: CTEs that contain nested
# subqueries are replaced by their flattened list, the rest keep their SQL.
unbundled = unbundle.Unbundle(formatted_query)
final_list = []
for alias, cte_query in cte_dict.items():
    final_list.append(
        {alias: flatten_pure_nested(cte_query) if unbundled.has_child(cte_query) else cte_query}
    )
    

## type C: subquery ( CTE ( subqueries ) )

Shape: `SELECT ... FROM (SELECT ... FROM (WITH a AS (...) ...))` — a nested query whose innermost subquery is itself a CTE.

In [None]:
query = """SELECT * FROM (SELECT *\nFROM (\n  with reg_users as (\n  \n  \tselect * \n  \tfrom (\n  \tselect \n  \t        a.*\n  \t        , b.*\n  \t        , c.*\n  \t        , d.*\n  \tfrom \n  \t(\n  \tselect \n  \t        distinct \n  \t        anonymous_id\n  \t        , user_id\n  \tfrom mapbox_customer_data.segment_identifies\n  \twhere dt >= \'2018-07-01\'\n  \tand anonymous_id is not null\n  \tand user_id is not null\n  \t) a\n  \n  \tleft join \n  \n  \t(\n  \tselect \n  \t        id \n  \t        , email\n  \t        , created\n  \tfrom mapbox_customer_data.accounts\n  \twhere cast(dt as DATE) = CURRENT_DATE - INTERVAL \'1\' DAY \n  \t) b\n  \t        on a.user_id = b.id\n  \n  \tleft join \n  \n  \t(\n  \t    \n  \t        select        \n  \t                anonymous_id as anon_id_ad\n  \t                , context_campaign_name\n  \t                , min(timestamp) as min_exposure\n  \t        from mapbox_customer_data.segment_pages\n  \t        where dt >= \'2018-07-01\'\n  \t        and context_campaign_name is not null\n  \t        group by 1,2\n  \n  \t) c \n  \t        on a.anonymous_id = c.anon_id_ad\n  \t        \n  \tleft join \n  \n  \t(\n  \t        select \n  \t                distinct\n  \t                anonymous_id as anon_id_event\n  \t                , original_timestamp\n  \t                , event\n  \t                , context_traits_email\n  \t        from mapbox_customer_data.segment_tracks\n  \t        where dt >= \'2018-07-01\'\n  \t        and event like \'submitted_%form\'\n  \t        and context_traits_email is not null\n  \t) d\n  \t        on a.anonymous_id = d.anon_id_event\n  \n  \t) \n  \twhere context_campaign_name is not null\n  \n  ), \n  \n  non_reg_users as (\n  \n  \tselect \n  \t        context_campaign_name\n  \t        , min_exposure\n  \t        , event\n  \t        , original_timestamp as event_timestamp\n  \t        , context_traits_email as event_email\n  \tfrom (\n  \tselect a.*\n  \t        , b.*\n  
\tfrom \n  \t(\n  \t        select \n  \t                anonymous_id as anon_id_ad\n  \t                , context_campaign_name\n  \t                , min(original_timestamp) as min_exposure\n  \t        from (       \n  \t        select        \n  \t                context_campaign_name\n  \t                , anonymous_id\n  \t                , original_timestamp \n  \t        from mapbox_customer_data.segment_pages\n  \t        where dt >= \'2018-07-01\'\n  \t        and context_campaign_name is not null\n  \t        )\n  \t        group by 1,2\n  \t) a\n  \n  \tleft join \n  \t(\n  \t        select \n  \t                distinct\n  \t                anonymous_id as anon_id_event\n  \t                , original_timestamp\n  \t                , event\n  \t                , context_traits_email\n  \t        from mapbox_customer_data.segment_tracks\n  \t        where dt >= \'2018-07-01\'\n  \t        and event like \'submitted_%form\'\n  \t        and context_traits_email is not null\n  \t) b\n  \t        on a.anon_id_ad = b.anon_id_event\n  \n  \t)\n  \twhere anon_id_event is not null\n  \tand to_unixtime(min_exposure) <= to_unixtime(original_timestamp)\n  \tand cast(min_exposure as DATE) >= cast(original_timestamp as DATE) - INTERVAL \'28\' DAY\n  \n  \n  ), \n  \n  mql_flag as (\n  \n  \tselect \n  \t        email\n  \t        , created_date\n  \t        , last_mql_date_c\n  \t        , mql_flag\n  \tfrom (\n  \n  \tselect \n  \t        email\n  \t        , min(created_date) created_date\n  \t        , max(last_mql_date_c) last_mql_date_c\n  \t        , case when max(last_mql_date_c) is not null then 1 else 0 end as mql_flag\n  \t        , sum(case when is_deleted = true then 1 else 0 end) as is_deleted\n  \tfrom sales.salesforce_leads\n  \twhere cast(dt as DATE) = CURRENT_DATE - INTERVAL \'1\' DAY \n  \tgroup by 1\n  \t)\n  \twhere mql_flag = 1\n  \tand is_deleted = 0\n  \n  ),\n  \n  cleaned_list as (\n  \n  \n  \tselect \n  \t\tdistinct \n  \t\t\t*\n  \tfrom 
(\n  \t\tselect \n  \t\t        context_campaign_name\n  \t\t        , min_exposure\n  \t\t        , \'created_an_account\' as event\n  \t\t        , created as event_timestamp\n  \t\t        , email as event_email\n  \t\tfrom reg_users        \n  \t\twhere to_unixtime(min_exposure) <= to_unixtime(created)\n  \t\tand cast(min_exposure as DATE) >= cast(created as DATE) - INTERVAL \'28\' DAY\n  \n  \n  \t\tunion all\n  \n  \n  \t\tselect \n  \t\t        context_campaign_name\n  \t\t        , min_exposure\n  \t\t        , event\n  \t\t        , original_timestamp as event_timestamp\n  \t\t        , context_traits_email as event_email\n  \t\tfrom reg_users      \n  \t\twhere to_unixtime(min_exposure) <= to_unixtime(original_timestamp)\n  \t\tand cast(min_exposure as DATE) >= cast(original_timestamp as DATE) - INTERVAL \'28\' DAY\n  \n  \t\tunion all\n  \n  \t\tselect * \n  \t\tfrom non_reg_users\n  \t)\n  \n  )\n  \n  \n  \n  select \n  \ta.*\n  \t, b.*\n  from cleaned_list a\n  left join mql_flag b \n  \ton a.event_email = b.email\n) "custom_sql_query"\nLIMIT 0) T LIMIT 0
"""
