In [17]:
import json
import pandas as pd
from sqlanalyzer import column_parser, query_analyzer

In [15]:
def extract_subquery_fields(query, db_fields, **kwargs):
    """Match the fields used in ``query`` against ``db_fields``.

    A ``column_parser.Parser`` is built from ``query`` and asked to format it;
    the formatted text, ``db_fields`` and any extra keyword arguments are then
    handed to ``Parser.match_queried_fields``.

    Args:
        query: SQL text to analyze.
        db_fields: Table/column metadata forwarded unchanged to
            ``match_queried_fields``.
        **kwargs: Extra options forwarded unchanged to ``match_queried_fields``.

    Returns:
        Whatever ``match_queried_fields`` returns, unmodified.
    """
    parser = column_parser.Parser(query)
    pretty_sql = parser.format_query(query)
    return parser.match_queried_fields(pretty_sql, db_fields, **kwargs)


def unnest_query_list(query_list):
    """Flatten a nested query structure into a flat list of query strings.

    The expected input is a list of dicts whose values are either query
    strings or further lists of dicts of the same shape. Dict keys are
    ignored; string values are collected depth-first, in iteration order.
    Nesting of any depth is handled, so deeply nested sub-queries are not
    lost.

    Args:
        query_list: Nested structure of lists/tuples and dicts holding query
            strings as dict values.

    Returns:
        list[str]: Every string found in the structure, in depth-first order.
        An empty input yields an empty list. Values that are not strings,
        dicts, lists or tuples are skipped.
    """
    flattened = []

    def _collect(node):
        # Strings are the leaves we want to keep.
        if isinstance(node, str):
            flattened.append(node)
        # Dicts contribute their values only; keys are aliases/labels.
        elif isinstance(node, dict):
            for value in node.values():
                _collect(value)
        # Lists/tuples are containers of further dicts (or strings).
        elif isinstance(node, (list, tuple)):
            for item in node:
                _collect(item)
        # Anything else (None, numbers, ...) carries no query text.

    _collect(query_list)
    return flattened
    

In [3]:
# The SQL under analysis is inlined below; the commented-out line is the
# alternative of reading it from a .sql file instead.
# query = open('active_devs.sql').read()
query = """SELECT rrr.*,
       CASE
           WHEN entexc.sfdc_acct_id IS NOT NULL THEN TRUE
           ELSE FALSE
       END AS ent_exception,
       CASE
           WHEN pfree.mbx_acct_id IS NOT NULL THEN TRUE
           ELSE FALSE
       END AS paygo_free,
       meta.sku_type
FROM analytics.rack_rate_revenue rrr
LEFT JOIN enterprise_exception entexc ON rrr.sfdc_acct_id = entexc.sfdc_acct_id
AND rrr.mbx_acct_lvl = 'enterprise'
LEFT JOIN paygo_free pfree ON rrr.mbx_acct_id = pfree.mbx_acct_id
AND pfree.date_month BETWEEN DATE_ADD('month', -1, DATE(vdate)) AND DATE_ADD('day', -1, DATE(vdate))
LEFT JOIN analytics.sku_metadata meta ON rrr.sku_id = meta.sku_id
WHERE rrr.dt = '2020-06-30'
"""
# Format the raw SQL, then pass the formatted text to parse_cte. Later cells
# read the result via cte_queries['main_query'].
formatter = column_parser.Parser(query)
formatted_query = formatter.format_query(query)
cte_queries = formatter.parse_cte(formatted_query)


In [15]:
# Load ./data.json into query_dict. JSON text is UTF-8, so the encoding is
# named explicitly rather than left to the platform's locale default.
with open('./data.json', 'r', encoding='utf-8') as f:
    query_dict = json.load(f)

In [16]:
print(json.dumps(query_dict, indent=2), '\n\n\n')

{
  "analytics_service_endpoint_mapping": {
    "b": {
      "a": "analytics.service_endpoint_mapping",
      "analytics.service_endpoint_mapping": "INNER JOIN",
      "main": "SELECT MAX(dt) AS dt\nFROM analytics.service_endpoint_mapping) a\nINNER JOIN analytics.service_endpoint_mapping "
    },
    "main": "SELECT b.*\nFROM b ON a.dt = b.dt),"
  },
  "web_dev": {
    "s": "sku.daily_by_account",
    "main": "SELECT s.dt,\n       COALESCE(em.platform, 'unknown') AS platform,\n       COALESCE(service_org, 'other') AS service,\n       account\nFROM sku.daily_by_account s\nINNER JOIN mapbox_customer_data.accounts a ON s.account = a.id\nAND a.dt = '{run_date}'\nLEFT JOIN analytics_service_endpoint_mapping em ON s.sku_id = em.sku_id\nAND em.in_sku IS NOT NULL\nAND em.parent_sku IS NULL\nWHERE s.dt BETWEEN DATE_SUB('{run_date}', 29) AND '{run_date}'\nGROUP BY 1,\n         2,\n         3,\n         4), "
  },
  "mobile_dev": {
    "main": "SELECT dt,\n       'mobile' AS platform,\n       CAS

In [17]:
query_dict.keys()

dict_keys(['analytics_service_endpoint_mapping', 'web_dev', 'mobile_dev', 'studio_dev', 'web_mobile_studio', 'mau_cube', 'wau_cube', 'dau_cube', 'main'])

In [5]:
print(cte_queries['main_query'])

SELECT rrr.*,
       CASE
           WHEN entexc.sfdc_acct_id IS NOT NULL THEN TRUE
           ELSE FALSE
       END AS ent_exception,
       CASE
           WHEN pfree.mbx_acct_id IS NOT NULL THEN TRUE
           ELSE FALSE
       END AS paygo_free,
       meta.sku_type
FROM analytics.rack_rate_revenue rrr
LEFT JOIN enterprise_exception entexc ON rrr.sfdc_acct_id = entexc.sfdc_acct_id
AND rrr.mbx_acct_lvl = 'enterprise'
LEFT JOIN paygo_free pfree ON rrr.mbx_acct_id = pfree.mbx_acct_id
AND pfree.date_month BETWEEN DATE_ADD('month', -1, DATE(vdate)) AND DATE_ADD('day', -1, DATE(vdate))
LEFT JOIN analytics.sku_metadata meta ON rrr.sku_id = meta.sku_id
WHERE rrr.dt = '2020-06-30'


## metastore 

In [8]:
# Column listing for analytics.rack_rate_revenue: one row per column name,
# with the table name repeated in 'db_table'.
db_fields_1 = pd.DataFrame(
    {
        'db_table': 'analytics.rack_rate_revenue',
        'all_columns': [
            'vdate',
            'mbx_acct_id',
            'mbx_acct_lvl',
            'is_apa',
            'sfdc_acct_id',
            'sfdc_acct_name',
            'sfdc_acct_owner',
            'pod',
            'service_group',
            'sku_id',
            'subunits',
            'mtd_usage',
            'daily_revenue',
            'mtd_revenue',
            'discount_rate',
            'daily_revenue_w_discount',
            'mtd_revenue_w_discount',
            'dt',
        ],
    }
)


In [10]:
# Column listing for analytics.sku_metadata, in the same layout as db_fields_1.
# NOTE(review): 'sku_ty' / 'sku_na' look truncated; the query above selects
# meta.sku_type. Confirm whether these names are intentional.
db_fields_2 = pd.DataFrame(
    {
        'db_table': 'analytics.sku_metadata',
        'all_columns': [
            'sku_id',
            'sku_ty',
            'sku_na',
        ],
    }
)


In [11]:
# Stack both per-table column listings into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# is the supported equivalent and produces the same result.
db_fields = pd.concat([db_fields_1, db_fields_2], ignore_index=True)
db_fields

Unnamed: 0,db_table,all_columns
0,analytics.rack_rate_revenue,vdate
1,analytics.rack_rate_revenue,mbx_acct_id
2,analytics.rack_rate_revenue,mbx_acct_lvl
3,analytics.rack_rate_revenue,is_apa
4,analytics.rack_rate_revenue,sfdc_acct_id
5,analytics.rack_rate_revenue,sfdc_acct_name
6,analytics.rack_rate_revenue,sfdc_acct_owner
7,analytics.rack_rate_revenue,pod
8,analytics.rack_rate_revenue,service_group
9,analytics.rack_rate_revenue,sku_id


In [18]:
# Pipeline for the main query: format it with the column parser, parse the
# formatted text with the Analyzer, then flatten the parse result into a list
# of query strings.
raw_query = cte_queries['main_query']
formatter = column_parser.Parser(raw_query)
formatted = formatter.format_query(raw_query)
analyzer = query_analyzer.Analyzer(formatted)
# NOTE(review): this rebinds `query_dict`, which an earlier cell loaded from
# ./data.json. Here it holds the parse_query result, which unnest_query_list
# iterates as a sequence of dicts, so the name is misleading. Consider a
# distinct name once downstream usage is confirmed.
query_dict = analyzer.parse_query(formatted)
preprocess_list = unnest_query_list(query_dict)

In [19]:
preprocess_list

["SELECT rrr.*,\n       CASE\n           WHEN entexc.sfdc_acct_id IS NOT NULL THEN TRUE\n           ELSE FALSE\n       END AS ent_exception,\n       CASE\n           WHEN pfree.mbx_acct_id IS NOT NULL THEN TRUE\n           ELSE FALSE\n       END AS paygo_free,\n       meta.sku_type\nFROM analytics.rack_rate_revenue rrr\nLEFT JOIN enterprise_exception entexc ON rrr.sfdc_acct_id = entexc.sfdc_acct_id\nAND rrr.mbx_acct_lvl = 'enterprise'\nLEFT JOIN paygo_free pfree ON rrr.mbx_acct_id = pfree.mbx_acct_id\nAND pfree.date_month BETWEEN DATE_ADD('month', -1, DATE(vdate)) AND DATE_ADD('day', -1, DATE(vdate))\nLEFT JOIN analytics.sku_metadata meta ON rrr.sku_id = meta.sku_id\nWHERE rrr.dt = '2020-06-30'"]

In [20]:
# NOTE(review): the recorded run of this cell fails with
# "TypeError: 'set' object is not subscriptable". The traceback is not shown,
# so the failing statement cannot be pinned down from here. TODO: confirm
# whether the error comes from match_queried_fields or from the shape of db_fields.
extract_subquery_fields(preprocess_list[0], db_fields)

TypeError: 'set' object is not subscriptable