In [3]:
from sqlanalyzer import column_parser
import pandas as pd
import sqlparse
import re


In [74]:
def get_joins_pos(query_list):

    pos_delete, pos_where = [len(query_list)-1], len(query_list)
    pos_join = []
    for i, line in enumerate(query_list):
        if line.startswith('ORDER') or line.startswith('GROUP'):
            pos_delete.append(i)
        if line.startswith('FROM') and len(line.split(' ')) > 1:
            pos_join.append(i)
        elif line.startswith('FROM') and len(line.split(' ')) == 1:
            pos_join.append(i+1)
        if line.startswith('WHERE'):
            pos_where = i
        if line.startswith('LEFT JOIN') or line.startswith('INNER JOIN') or line.startswith('FULL OUTER JOIN'):
            pos_join.append(i+1)

    pos_join.append(min(pos_delete))
    return pos_join, pos_where


def get_alias_pos(query_list, pos_join, pos_where):
    pos_join_list = iter(pos_join)
    next(pos_join_list)
    alias_pos = []

    if query_list[pos_join[0]].startswith('FROM'):
        alias_pos.append(pos_join[0])

    for i in range(len(pos_join)-1):
        if i < len(pos_join)-2 and pos_join[i] < pos_where:
            end_pos = next(pos_join_list)-1
            alias_pos.append(end_pos-1)

        elif pos_join[-1] >= pos_where:
            end_pos = next(pos_join_list)-1
            alias_pos.append(pos_where - 1)
        
        elif pos_where == len(query_list):
            end_pos = pos_join[-1]
            alias_pos.append(pos_where-1)
        else:
            end_pos = pos_join[-1]
            alias_pos.append(end_pos)

    alias_pos = sorted(list(set(alias_pos)))
    return alias_pos


def parse_sub_query(query_list, sub_query_pos):
    sub_query = {}
    for _, sub_pos in enumerate(sub_query_pos):
        alias = query_list[sub_pos[1]]
        query = query_list[sub_pos[0]: sub_pos[1]]

        try:
            alias_list_rev = alias.split(' ')[::-1]
            if alias_list_rev[0][-1] != ')':
                alias_index = alias_list_rev.index('ON')
                alias = alias_list_rev[alias_index+1]

                if alias_list_rev[alias_index+2] == 'AS':
                    del alias_list_rev[:alias_index+3]

                else:
                    del alias_list_rev[:alias_index+2]

                query.append(' '.join(alias_list_rev[::-1]).rstrip(r'\)').lstrip(' '))

            else:
                alias_list_rev[0] = alias_list_rev[0].rstrip('\)')
                alias = 'no alias'
                query.append(' '.join(alias_list_rev[::-1]))

        except:
            query.append(' '.join(alias.split(' ')[:-1]).rstrip(r'\)').lstrip(' '))
            alias = alias.split(' ')[-1]

        trans_query = ' '.join(query).lstrip(' \(').lstrip(' FROM')
    
        if trans_query == '':
            sub_query = {}
        else:
            sub_query[alias] = trans_query
        
    return sub_query


def delevel(query_list):

    sub_query = {}
    pos_join, pos_where = get_joins_pos(query_list)
    alias_pos = get_alias_pos(query_list, pos_join, pos_where)
    sub_query_pos = list(zip(pos_join[:-1], alias_pos))
    sub_query = parse_sub_query(query_list, sub_query_pos)

    return sub_query


def has_child(formatted_query):
    
    count = 0
    for k,v in delevel(formatted_query.split('\n')).items():
        if v != {}: count += 1
            
    return count != 0


In [87]:
query = """WITH reg_users AS
  (SELECT *
   FROM
     (SELECT a.*,
             b.*,
             c.*,
             d.*
      FROM
        (SELECT DISTINCT anonymous_id,
                         user_id
         FROM mapbox_customer_data.segment_identifies
         WHERE dt >= '2018-07-01'
           AND anonymous_id IS NOT NULL
           AND user_id IS NOT NULL ) a
      LEFT JOIN
        (SELECT id,
                email,
                created
         FROM mapbox_customer_data.accounts
         WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON a.user_id = b.id
      LEFT JOIN
        (SELECT anonymous_id AS anon_id_ad,
                context_campaign_name,
                min(TIMESTAMP) AS min_exposure
         FROM mapbox_customer_data.segment_pages
         WHERE dt >= '2018-07-01'
           AND context_campaign_name IS NOT NULL
         GROUP BY 1,
                  2) c ON a.anonymous_id = c.anon_id_ad
      LEFT JOIN
        (SELECT DISTINCT anonymous_id AS anon_id_event,
                         original_timestamp,
                         event,
                         context_traits_email
         FROM mapbox_customer_data.segment_tracks
         WHERE dt >= '2018-07-01'
           AND event LIKE 'submitted_%form'
           AND context_traits_email IS NOT NULL ) d ON a.anonymous_id = d.anon_id_event)
   WHERE context_campaign_name IS NOT NULL ),

     non_reg_users AS
  (SELECT context_campaign_name,
          min_exposure,
          event,
          original_timestamp AS event_timestamp,
          context_traits_email AS event_email
   FROM
     (SELECT a.*,
             b.*
      FROM
        (SELECT anonymous_id AS anon_id_ad,
                context_campaign_name,
                min(original_timestamp) AS min_exposure
         FROM
           (SELECT context_campaign_name,
                   anonymous_id,
                   original_timestamp
            FROM mapbox_customer_data.segment_pages
            WHERE dt >= '2018-07-01'
              AND context_campaign_name IS NOT NULL )
         GROUP BY 1,
                  2) a
      LEFT JOIN
        (SELECT DISTINCT anonymous_id AS anon_id_event,
                         original_timestamp,
                         event,
                         context_traits_email
         FROM mapbox_customer_data.segment_tracks
         WHERE dt >= '2018-07-01'
           AND event LIKE 'submitted_%form'
           AND context_traits_email IS NOT NULL ) b ON a.anon_id_ad = b.anon_id_event)
   WHERE anon_id_event IS NOT NULL
     AND to_unixtime(min_exposure) <= to_unixtime(original_timestamp)
     AND cast(min_exposure AS DATE) >= cast(original_timestamp AS DATE) - INTERVAL '28' DAY ),

     mql_flag AS
  (SELECT email,
          created_date,
          last_mql_date_c,
          mql_flag
   FROM
     (SELECT email,
             min(created_date) created_date,
             max(last_mql_date_c) last_mql_date_c,
             CASE
                 WHEN max(last_mql_date_c) IS NOT NULL THEN 1
                 ELSE 0
             END AS mql_flag,
             sum(CASE
                     WHEN is_deleted = TRUE THEN 1
                     ELSE 0
                 END) AS is_deleted
      FROM sales.salesforce_leads
      WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY
      GROUP BY 1)
   WHERE mql_flag = 1
     AND is_deleted = 0 ),

     cleaned_list AS
  (SELECT DISTINCT *
   FROM
     (SELECT context_campaign_name,
             min_exposure,
             'created_an_account' AS event,
             created AS event_timestamp,
             email AS event_email
      FROM reg_users
      WHERE to_unixtime(min_exposure) <= to_unixtime(created)
        AND cast(min_exposure AS DATE) >= cast(created AS DATE) - INTERVAL '28' DAY
      UNION ALL SELECT context_campaign_name,
                       min_exposure,
                       event,
                       original_timestamp AS event_timestamp,
                       context_traits_email AS event_email
      FROM reg_users
      WHERE to_unixtime(min_exposure) <= to_unixtime(original_timestamp)
        AND cast(min_exposure AS DATE) >= cast(original_timestamp AS DATE) - INTERVAL '28' DAY
      UNION ALL SELECT *
      FROM non_reg_users))

SELECT a.*,
       b.*
FROM cleaned_list a
LEFT JOIN mql_flag b ON a.event_email = b.email
"""

In [88]:
formatter = column_parser.Parser(query)
formatted_query = formatter.format_query(query)
query_list = formatted_query.split('\n')

In [89]:
print(formatted_query)

WITH reg_users AS
  (SELECT *
   FROM
     (SELECT a.*,
             b.*,
             c.*,
             d.*
      FROM
        (SELECT DISTINCT anonymous_id,
                         user_id
         FROM mapbox_customer_data.segment_identifies
         WHERE dt >= '2018-07-01'
           AND anonymous_id IS NOT NULL
           AND user_id IS NOT NULL ) a
      LEFT JOIN
        (SELECT id,
                email,
                created
         FROM mapbox_customer_data.accounts
         WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON a.user_id = b.id
      LEFT JOIN
        (SELECT anonymous_id AS anon_id_ad,
                context_campaign_name,
                min(TIMESTAMP) AS min_exposure
         FROM mapbox_customer_data.segment_pages
         WHERE dt >= '2018-07-01'
           AND context_campaign_name IS NOT NULL
         GROUP BY 1,
                  2) c ON a.anonymous_id = c.anon_id_ad
      LEFT JOIN
        (SELECT DISTINCT anonymous_id AS anon_id_even

In [78]:
def is_cte(query):
    return query.startswith('WITH')

In [79]:
is_cte(formatted_query)

True

In [80]:
if is_cte(formatted_query):
    cte_dict = formatter.parse_cte(formatted_query)

In [81]:
cte_dict.keys()

dict_keys(['reg_users', 'non_reg_users', 'mql_flag', 'cleaned_list', 'main'])

In [84]:
def main(query):
    formatter = column_parser.Parser(query)
    formatted_query = formatter.format_query(query)
    query_list_0 = formatted_query.split('\n')
    query_dict = {}
    sub_query = delevel(query_list_0)
#     query_dict['derived_query'] = sub_query
    query_dict = sub_query

    for alias, query in sub_query.items():
        formatter = column_parser.Parser(query)
        formatted_query = formatter.format_query(query)
        query_list = formatted_query.split('\n')
        if has_child(formatted_query):
            sub_query_dict = delevel(query_list)
            query_dict[alias] = sub_query_dict
            
            for alias2, query2 in sub_query_dict.items():
                formatter2 = column_parser.Parser(query2)
                formatted_query2 = formatter2.format_query(query2)
                query_list2 = formatted_query2.split('\n')
                
                if has_child(formatted_query2):
                    sub_query_dict2 = delevel(query_list2)
                    sub_query_dict[alias2] = sub_query_dict2
                    
                    for alias3, query3 in sub_query_dict2.items():
                        formatter3 = column_parser.Parser(query3)
                        formatted_query3 = formatter3.format_query(query3)
                        query_list3 = formatted_query3.split('\n')
                        if has_child(formatted_query3):
                            sub_query_dict3 = delevel(query_list3)
                            sub_query_dict2[alias3] = sub_query_dict3
                            
                            for alias4, query4 in sub_query_dict3.items():
                                formatter4 = column_parser.Parser(query4)
                                formatted_query4 = formatter4.format_query(query4)
                                query_list4 = formatted_query4.split('\n')
                                if has_child(formatted_query4):
                                    sub_query_dict4 = delevel(query_list4)
                                    sub_query_dict3[alias4] = sub_query_dict4
                                else:
                                    pass
                        else:
                            pass
                else:
                    pass
        else:
            pass

    return query_dict

In [85]:
for alias, query in cte_dict.items():
    formatter = column_parser.Parser(query)
    formatted_query = formatter.format_query(query)
    try:
        print(alias, '\n', main(query), '\n\n\n')
    except:
        pass

reg_users 
 {'no alias': {'a': "SELECT DISTINCT anonymous_id,                    user_id    FROM mapbox_customer_data.segment_identifies    WHERE dt >= '2018-07-01'      AND anonymous_id IS NOT NULL AND user_id IS NOT NULL ", 'b': "SELECT id,           email,           created    FROM mapbox_customer_data.accounts WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ", 'c': "SELECT anonymous_id AS anon_id_ad,           context_campaign_name,           min(TIMESTAMP) AS min_exposure    FROM mapbox_customer_data.segment_pages    WHERE dt >= '2018-07-01'      AND context_campaign_name IS NOT NULL    GROUP BY 1, 2", 'd': "SELECT DISTINCT anonymous_id AS anon_id_event,                    original_timestamp,                    event,                    context_traits_email    FROM mapbox_customer_data.segment_tracks    WHERE dt >= '2018-07-01'      AND event LIKE 'submitted_%form' AND context_traits_email IS NOT NULL "}} 



non_reg_users 
 {'no alias': {'a': {'2': "SELECT context_campai

In [33]:
formatter = column_parser.Parser(cte_dict['main'])
formatted_query = formatter.format_query(cte_dict['main'])
has_child(formatted_query)


True

In [38]:
formatted_query

'SELECT a.*,\n       b.*\nFROM cleaned_list a\nLEFT JOIN mql_flag b ON a.event_email = b.email'

In [35]:
query_dict = {}
query_list = formatted_query.split('\n')
sub_query_dict = delevel(query_list)
query_dict = sub_query_dict


In [36]:
query_dict

{'a': 'cleaned_list', 'b': 'LEFT JOIN mql_flag'}

In [44]:
for _,v in delevel(formatted_query.split('\n')).items():
    print(v)

cleaned_list
LEFT JOIN mql_flag


In [45]:
has_child('LEFT JOIN mql_flag')

IndexError: list index out of range