In [1]:
from sqlanalyzer import column_parser, unbundle, query_analyzer
import re, json, time, sys
import pandas as pd

## given db metadata

In [7]:
def extract_subquery_fields(query, db_fields):
    """Return the fields of ``query`` matched against ``db_fields``.

    The query is first passed through ``Parser.format_query``; the formatted
    text is then handed to ``Parser.match_queried_fields`` together with
    ``db_fields``, and that call's result is returned unchanged.
    """
    parser = column_parser.Parser(query)
    return parser.match_queried_fields(parser.format_query(query), db_fields)


def compile_queried_cols(col_list, query_list, db_fields=None):
    """Collect queried columns from one nesting level of a parsed query.

    Parameters
    ----------
    col_list : list
        Accumulator of matched fields. It is extended in place and also returned.
    query_list : list of dict
        Entries of the form ``{alias: value}``. A ``str`` value is treated as a
        subquery and run through ``extract_subquery_fields``; a ``list`` value
        is treated as a deeper nesting level and handed back to the caller.
        Values of any other type are ignored.
    db_fields : optional
        Metadata passed to ``extract_subquery_fields``. When omitted, the
        notebook-level ``db_fields`` global is used, as the original code did.

    Returns
    -------
    tuple
        ``(col_list, nested)`` where ``nested`` holds every entry found inside
        list values of this level, ready for a follow-up call.

    Raises
    ------
    NameError
        If a string subquery is found and no ``db_fields`` is available.
    """
    # Accumulate nested entries instead of rebinding ``query_list`` inside the
    # loop: rebinding kept only the last value seen, so a nested list followed
    # by a string (or by another list) was silently lost.
    nested = []

    for entry in query_list:
        for _alias, sub_query in entry.items():
            if isinstance(sub_query, str):
                if db_fields is None:
                    # Backward-compatible fallback to the module-level name.
                    db_fields = globals().get('db_fields')
                    if db_fields is None:
                        raise NameError(
                            "db_fields is not defined; pass it explicitly or "
                            "define it before calling compile_queried_cols"
                        )
                col_list.extend(extract_subquery_fields(sub_query, db_fields))
            elif isinstance(sub_query, list):
                nested.extend(sub_query)

    return col_list, nested
 
    
def unnest_query_list(query_list):
    """Flatten a nested parse result into a flat list of subquery strings.

    Parameters
    ----------
    query_list : list of dict
        Entries of the form ``{alias: value}`` where ``value`` is either a
        subquery string or a list (or tuple) of further entries of the same
        shape.

    Returns
    -------
    list of str
        Every string value found, in depth-first order of appearance.

    Notes
    -----
    The previous implementation unrolled exactly three nesting levels by hand
    and silently dropped anything deeper. This version recurses, so results
    are identical for inputs up to three levels deep and complete for deeper
    ones. Values that are neither strings nor lists/tuples are ignored.
    """
    flattened = []

    def _collect(entries):
        # Walk one nesting level, descending into list values as they appear
        # so that the original left-to-right order is preserved.
        for entry in entries:
            for value in entry.values():
                if isinstance(value, str):
                    flattened.append(value)
                elif isinstance(value, (list, tuple)):
                    _collect(value)

    _collect(query_list)
    return flattened


In [8]:
# Sample query with four CTEs (reg_users, non_reg_users, mql_flag, cleaned_list)
# and a final SELECT joining cleaned_list to mql_flag. It contains several levels
# of nested subqueries and is the input for the Parser/Analyzer cells that follow.
query = """WITH reg_users AS
  (SELECT *
   FROM
     (SELECT a.*,
             b.*,
             c.*,
             d.*
      FROM
        (SELECT DISTINCT anonymous_id,
                         user_id
         FROM mapbox_customer_data.segment_identifies
         WHERE dt >= '2018-07-01'
           AND anonymous_id IS NOT NULL
           AND user_id IS NOT NULL ) a
      LEFT JOIN
        (SELECT id,
                email,
                created
         FROM mapbox_customer_data.accounts
         WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON a.user_id = b.id
      LEFT JOIN
        (SELECT anonymous_id AS anon_id_ad,
                context_campaign_name,
                min(TIMESTAMP) AS min_exposure
         FROM mapbox_customer_data.segment_pages
         WHERE dt >= '2018-07-01'
           AND context_campaign_name IS NOT NULL
         GROUP BY 1,
                  2) c ON a.anonymous_id = c.anon_id_ad
      LEFT JOIN
        (SELECT DISTINCT anonymous_id AS anon_id_event,
                         original_timestamp,
                         event,
                         context_traits_email
         FROM mapbox_customer_data.segment_tracks
         WHERE dt >= '2018-07-01'
           AND event LIKE 'submitted_%form'
           AND context_traits_email IS NOT NULL ) d ON a.anonymous_id = d.anon_id_event)
   WHERE context_campaign_name IS NOT NULL ),

     non_reg_users AS
  (SELECT context_campaign_name,
          min_exposure,
          event,
          original_timestamp AS event_timestamp,
          context_traits_email AS event_email
   FROM
     (SELECT a.*,
             b.*
      FROM
        (SELECT anonymous_id AS anon_id_ad,
                context_campaign_name,
                min(original_timestamp) AS min_exposure
         FROM
           (SELECT context_campaign_name,
                   anonymous_id,
                   original_timestamp
            FROM mapbox_customer_data.segment_pages
            WHERE dt >= '2018-07-01'
              AND context_campaign_name IS NOT NULL )
         GROUP BY 1,
                  2) a
      LEFT JOIN
        (SELECT DISTINCT anonymous_id AS anon_id_event,
                         original_timestamp,
                         event,
                         context_traits_email
         FROM mapbox_customer_data.segment_tracks
         WHERE dt >= '2018-07-01'
           AND event LIKE 'submitted_%form'
           AND context_traits_email IS NOT NULL ) b ON a.anon_id_ad = b.anon_id_event)
   WHERE anon_id_event IS NOT NULL
     AND to_unixtime(min_exposure) <= to_unixtime(original_timestamp)
     AND cast(min_exposure AS DATE) >= cast(original_timestamp AS DATE) - INTERVAL '28' DAY ),

     mql_flag AS
  (SELECT email,
          created_date,
          last_mql_date_c,
          mql_flag
   FROM
     (SELECT email,
             min(created_date) created_date,
             max(last_mql_date_c) last_mql_date_c,
             CASE
                 WHEN max(last_mql_date_c) IS NOT NULL THEN 1
                 ELSE 0
             END AS mql_flag,
             sum(CASE
                     WHEN is_deleted = TRUE THEN 1
                     ELSE 0
                 END) AS is_deleted
      FROM sales.salesforce_leads
      WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY
      GROUP BY 1)
   WHERE mql_flag = 1
     AND is_deleted = 0 ),

     cleaned_list AS
  (SELECT DISTINCT *
   FROM
     (SELECT context_campaign_name,
             min_exposure,
             'created_an_account' AS event,
             created AS event_timestamp,
             email AS event_email
      FROM reg_users
      WHERE to_unixtime(min_exposure) <= to_unixtime(created)
        AND cast(min_exposure AS DATE) >= cast(created AS DATE) - INTERVAL '28' DAY
      UNION ALL SELECT context_campaign_name,
                       min_exposure,
                       event,
                       original_timestamp AS event_timestamp,
                       context_traits_email AS event_email
      FROM reg_users
      WHERE to_unixtime(min_exposure) <= to_unixtime(original_timestamp)
        AND cast(min_exposure AS DATE) >= cast(original_timestamp AS DATE) - INTERVAL '28' DAY
      UNION ALL SELECT *
      FROM non_reg_users))

SELECT a.*,
       b.*
FROM cleaned_list a
LEFT JOIN mql_flag b ON a.event_email = b.email

"""

In [9]:
# Build a Parser for the sample query, format it, and print the formatted text
# for visual inspection.
formatter = column_parser.Parser(query)
formatted = formatter.format_query(query)
print(formatted)

WITH reg_users AS
  (SELECT *
   FROM
     (SELECT a.*,
             b.*,
             c.*,
             d.*
      FROM
        (SELECT DISTINCT anonymous_id,
                         user_id
         FROM mapbox_customer_data.segment_identifies
         WHERE dt >= '2018-07-01'
           AND anonymous_id IS NOT NULL
           AND user_id IS NOT NULL ) a
      LEFT JOIN
        (SELECT id,
                email,
                created
         FROM mapbox_customer_data.accounts
         WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON a.user_id = b.id
      LEFT JOIN
        (SELECT anonymous_id AS anon_id_ad,
                context_campaign_name,
                min(TIMESTAMP) AS min_exposure
         FROM mapbox_customer_data.segment_pages
         WHERE dt >= '2018-07-01'
           AND context_campaign_name IS NOT NULL
         GROUP BY 1,
                  2) c ON a.anonymous_id = c.anon_id_ad
      LEFT JOIN
        (SELECT DISTINCT anonymous_id AS anon_id_even

In [10]:
# Parse the sample query with the Analyzer. As the displayed result shows, the
# return value is a list of {alias: value} dicts whose values are either
# subquery strings or nested lists of the same shape.
analyzer = query_analyzer.Analyzer(query)
query_dict = analyzer.parse_query(query)

In [11]:
query_dict

[{'reg_users': [{'level_1_main': 'SELECT * WHERE context_campaign_name IS NOT NULL FROM no alias '},
   {'level_2_main': 'SELECT a.*,        b.*,        c.*,        d.* FROM a LEFT JOIN b ON a.user_id = b.id LEFT JOIN c ON a.anonymous_id = c.anon_id_ad LEFT JOIN d ON a.anonymous_id = d.anon_id_event '},
   {'a': "SELECT DISTINCT anonymous_id, user_id FROM mapbox_customer_data.segment_identifies WHERE dt >= '2018-07-01' AND anonymous_id IS NOT NULL AND user_id IS NOT NULL "},
   {'c': "SELECT anonymous_id AS anon_id_ad, context_campaign_name, min(TIMESTAMP) AS min_exposure FROM mapbox_customer_data.segment_pages WHERE dt >= '2018-07-01' AND context_campaign_name IS NOT NULL GROUP BY 1, 2"},
   {'b': "SELECT id, email, created FROM mapbox_customer_data.accounts WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY "},
   {'d': "SELECT DISTINCT anonymous_id AS anon_id_event, original_timestamp, event, context_traits_email FROM mapbox_customer_data.segment_tracks WHERE dt >= '2018-07-01

In [12]:
# Flatten the nested parse result into a flat list of subquery strings.
preprocess_list = unnest_query_list(query_dict)

In [13]:
preprocess_list

['SELECT * WHERE context_campaign_name IS NOT NULL FROM no alias ',
 'SELECT a.*,        b.*,        c.*,        d.* FROM a LEFT JOIN b ON a.user_id = b.id LEFT JOIN c ON a.anonymous_id = c.anon_id_ad LEFT JOIN d ON a.anonymous_id = d.anon_id_event ',
 "SELECT DISTINCT anonymous_id, user_id FROM mapbox_customer_data.segment_identifies WHERE dt >= '2018-07-01' AND anonymous_id IS NOT NULL AND user_id IS NOT NULL ",
 "SELECT anonymous_id AS anon_id_ad, context_campaign_name, min(TIMESTAMP) AS min_exposure FROM mapbox_customer_data.segment_pages WHERE dt >= '2018-07-01' AND context_campaign_name IS NOT NULL GROUP BY 1, 2",
 "SELECT id, email, created FROM mapbox_customer_data.accounts WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ",
 "SELECT DISTINCT anonymous_id AS anon_id_event, original_timestamp, event, context_traits_email FROM mapbox_customer_data.segment_tracks WHERE dt >= '2018-07-01' AND event LIKE 'submitted_%form' AND context_traits_email IS NOT NULL ",
 "SELECT cont

In [None]:
# Column metadata for wbr.map_requests_by_account: one row per column name,
# with the table name broadcast to every row.
db_fields_1 = pd.DataFrame(
    {
        'db_table': 'wbr.map_requests_by_account',
        'all_columns': [
            'platform',
            'mobile_os',
            'service',
            'service_metadata',
            'service_metadata_version',
            'account',
            'num_requests',
            'dt',
        ],
    }
)
db_fields_1


In [None]:
# Column metadata for mapbox_customer_data.styles: one row per column name,
# with the table name broadcast to every row.
db_fields_2 = pd.DataFrame(
    {
        'db_table': 'mapbox_customer_data.styles',
        'all_columns': [
            'id',
            'owner',
            'metadata',
            'sources',
        ],
    }
)
db_fields_2


In [None]:
# Column metadata for sfdc.accounts: one row per column name, with the table
# name broadcast to every row.
db_fields_3 = pd.DataFrame(
    {
        'db_table': 'sfdc.accounts',
        'all_columns': [
            'dt',
            'customer_tier_c',
            'csm_c',
            'name',
            'mapbox_username_c',
            'x18_digit_account_id_c',
        ],
    }
)
db_fields_3


In [None]:
# Column metadata for sfdc.users: one row per column name, with the table name
# broadcast to every row.
db_fields_4 = pd.DataFrame(
    {
        'db_table': 'sfdc.users',
        'all_columns': [
            'dt',
            'name',
            'id',
        ],
    }
)
db_fields_4


In [None]:
# Stack the first two metadata frames. DataFrame.append was removed in
# pandas 2.0, so pd.concat is used; ignore_index renumbers rows from 0.
df = pd.concat([db_fields_1, db_fields_2], ignore_index=True)

In [None]:
# Combine all four metadata frames in one step. Building from the source
# frames (rather than re-appending to df) keeps this cell idempotent on re-run,
# and pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
df = pd.concat(
    [db_fields_1, db_fields_2, db_fields_3, db_fields_4],
    ignore_index=True,
)

In [None]:
# Expose the combined metadata under the global name read by
# compile_queried_cols. This binds the same DataFrame object as `df`; it is
# not a copy.
db_fields = df

In [None]:
db_fields

## given query

In [None]:
query = """"SELECT u.name,\n       b.customer_tier_c,\n       b.name,\n       m.account,\n       b.x18_digit_account_id_c,\n       s.id,\n       m.platform,\n       m.mobile_os,\n       m.num_requests,\n       Row_number() OVER(PARTITION BY s.id) row_\nFROM wbr.map_requests_by_account m\nINNER JOIN\n  (SELECT DISTINCT id\n   FROM mapbox_customer_data.styles\n   WHERE cast(dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY\n     AND sources LIKE '%mapbox-streets-v7%' ) s ON m.service_metadata_version = s.id\nLEFT JOIN\n  (SELECT customer_tier_c,\n          csm_c,\n          name,\n          mapbox_username_c,\n          x18_digit_account_id_c\n   FROM sfdc.accounts\n   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON m.account = b.mapbox_username_c\nLEFT JOIN\n  (SELECT name,\n          id\n   FROM sfdc.users\n   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) u ON b.csm_c = u.id\nWHERE cast(m.dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY\n  AND m.service_metadata = 'custom'\n  AND m.service = 'styles'\n  AND b.customer_tier_c IN ('Tier 0',\n                            'Tier 1',\n                            'Tier 2',\n                            'Tier 3',\n                            'Tier 4')"
"""

## transformation 1: format query

In [None]:
# Unformatted two-table join (the JOIN clause is split across lines) used as
# the input for the formatting step below.
query = """SELECT api.name, acct.customer_tier_c, acct.name FROM api_requests_by_account api
LEFT JOIN accounts 
acct ON api.user_id = acct.customer_api_id
"""

In [None]:
# Build a Parser for the unformatted query defined above.
formatter = column_parser.Parser(query)

In [None]:
# Format the query and print the result so the layout can be compared with the
# raw input.
formatted = formatter.format_query(query)
print(formatted)

## transformation 2: separate CTEs

In [None]:
# Query with two CTEs (a and b) followed by a main SELECT that left-joins them;
# used as input for the CTE-separation step.
query = """WITH a AS
  (SELECT DISTINCT anonymous_id,
                   user_id
   FROM customer_data.segment_identifies
   WHERE dt >= '2018-07-01'),
     b AS
  (SELECT id,
          email,
          created
   FROM customer_data.accounts)
SELECT a.*,
       b.*
FROM a
LEFT JOIN b ON a.user_id = b.id
WHERE context_campaign_name IS NOT NULL
"""

In [None]:
# Format the CTE query and print it for inspection.
formatter = column_parser.Parser(query)
formatted = formatter.format_query(query)
print(formatted)

In [None]:
# Split the query into its CTE parts with parse_cte and display the result.
# The result supports .keys() and has a 'main' entry, both used in later cells.
formatter = column_parser.Parser(query)
cte_query = formatter.parse_cte(query)
cte_query

In [None]:
cte_query.keys()

In [None]:
# Format the 'main' part of the previous result and run parse_cte on it again.
# NOTE(review): cte_query is rebound from its own previous value, so this cell
# is not idempotent — re-running it operates on its own earlier output.
formatted = formatter.format_query(cte_query['main'])
cte_query = formatter.parse_cte(formatted)
cte_query

In [None]:
# Repeat the format + parse_cte step on the current cte_query['main'].
# NOTE(review): same statements as the preceding cell; the outcome depends on
# how many times these cells have been executed.
formatted = formatter.format_query(cte_query['main'])
cte_query = formatter.parse_cte(formatted)
cte_query

## transformation 3: match table aliases

In [None]:
# Three-table join in which every table carries an alias (m, s, u); used as
# input for the table-alias matching step.
query = """SELECT *
FROM api_requests.requests_by_account m
INNER JOIN mapbox_customer_data.styles s ON m.metadata_version = s.id
LEFT JOIN sfdc.users u ON m.csm = u.id
"""

In [None]:
# Build a Parser for the aliased-join query and keep its formatted text for the
# cells below.
formatter = column_parser.Parser(query)
formatted = formatter.format_query(query)

In [None]:
print(formatted)

In [None]:
# get_table_names is given the formatted query as a list of lines.
# NOTE(review): presumably returns a table-name/alias mapping, going by the
# variable name — confirm against the sqlanalyzer implementation.
table_alias_mapping = formatter.get_table_names(formatted.split('\n'))
table_alias_mapping

## transformation 4: find columns

In [None]:
# Match the queried columns against the table metadata. The formatted text is
# passed (not the raw query) to stay consistent with extract_subquery_fields,
# which also feeds format_query's output into match_queried_fields.
fields = formatter.match_queried_fields(formatted, db_fields)

In [None]:
# Tabulate the matched fields, ordered by database, table, and column name.
pd.DataFrame(fields).sort_values(by=['database_name', 'table_name', 'column_name'])

## bonus transformation: upload other query metadata (such as timestamp, user)