In [1]:
from sqlanalyzer import column_parser, unbundle
from sqlanalyzer.unbundle import *
import sqlparse
import re
import json
import pandas as pd
import time


def extract_subquery_fields(query, db_fields):
    """Return the fields of ``query`` matched against ``db_fields``.

    A ``column_parser.Parser`` is built for ``query``, the query is run
    through its ``format_query`` method, and the formatted text is handed to
    ``match_queried_fields`` together with ``db_fields``.

    Args:
        query: the query to analyse (passed straight to ``Parser``).
        db_fields: forwarded unchanged to ``match_queried_fields``.

    Returns:
        Whatever ``Parser.match_queried_fields`` returns.
    """
    parser = column_parser.Parser(query)
    return parser.match_queried_fields(parser.format_query(query), db_fields)


def compile_queried_cols(query_dict, df):
    """Gather the matched fields of every query stored in ``query_dict``.

    Each value of ``query_dict`` is either a single query or a dict whose
    values are queries (one level of nesting). Every query is passed to
    ``extract_subquery_fields`` along with ``df`` and the results are
    concatenated in iteration order.

    Args:
        query_dict: mapping whose values are queries or dicts of queries.
        df: forwarded unchanged as the second argument of
            ``extract_subquery_fields``.

    Returns:
        A flat list with the fields extracted from all queries.
    """
    collected = []
    for entry in query_dict.values():
        # Treat a plain query as a one-element group so both shapes share a loop.
        sub_queries = entry.values() if isinstance(entry, dict) else [entry]
        for sub_query in sub_queries:
            collected.extend(extract_subquery_fields(sub_query, df))
    return collected

In [2]:
# Load the SQL under analysis. The context manager closes the file handle
# as soon as the text has been read instead of leaving it to the GC.
with open('active_devs.sql') as sql_file:
    query = sql_file.read()

# Format the raw query and keep a line-by-line view of the formatted text.
formatter = column_parser.Parser(query)
formatted_query = formatter.format_query(query)
query_list = formatted_query.split('\n')

# Unbundle helper built from the raw (unformatted) query; reused by later cells.
unbundled = unbundle.Unbundle(query)


In [3]:
# Build `final_dict`. For a CTE query, each CTE alias maps to the result of
# `extract_query_dict` on its formatted text; otherwise the whole query is
# unbundled directly.
if is_cte(formatted_query):
    cte_dict = formatter.parse_cte(formatted_query)
    final_dict = {}
    # Loop-local names are used so the notebook-level `query`, `formatter`
    # and `formatted_query` are not overwritten; that keeps the `is_cte`
    # check above stable when this cell is re-run.
    for cte_alias, cte_query in cte_dict.items():
        cte_formatted = column_parser.Parser(cte_query).format_query(cte_query)
        try:
            final_dict[cte_alias] = unbundled.extract_query_dict(cte_formatted)
        except Exception:
            # Best effort: if a CTE cannot be unbundled, store its formatted
            # text instead. `Exception` (not a bare except) lets
            # KeyboardInterrupt / SystemExit propagate.
            final_dict[cte_alias] = cte_formatted

else:
    final_dict = unbundled.extract_query_dict(query)
        

In [4]:
# Show the aliases (keys) of the dict returned by `formatter.parse_cte`.
cte_dict.keys()

dict_keys(['analytics_service_endpoint_mapping', 'web_dev', 'mobile_dev', 'studio_dev', 'web_mobile_studio', 'mau_cube', 'wau_cube', 'dau_cube', 'main'])

In [5]:
# Repeat the per-CTE processing step for a single alias so that the
# intermediate values (`formatted_query`, `final_dict[alias]`) can be
# inspected in the following cells.
alias = 'web_dev'
query = cte_dict[alias]
formatter = column_parser.Parser(query)
formatted_query = formatter.format_query(query)
try:
    final_dict[alias] = unbundled.extract_query_dict(formatted_query)
except Exception:
    # Best effort: keep the formatted text when unbundling fails.
    # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
    final_dict[alias] = formatted_query
    

In [6]:
# Display the formatted query text currently bound to `formatted_query`.
formatted_query

"SELECT s.dt,\n       COALESCE(em.platform, 'unknown') AS platform,\n       COALESCE(service_org, 'other') AS service,\n       account\nFROM sku.daily_by_account s\nINNER JOIN mapbox_customer_data.accounts a ON s.account = a.id\nAND a.dt = '{run_date}'\nLEFT JOIN analytics_service_endpoint_mapping em ON s.sku_id = em.sku_id\nAND em.in_sku IS NOT NULL\nAND em.parent_sku IS NULL\nWHERE s.dt BETWEEN DATE_SUB('{run_date}', 29) AND '{run_date}'\nGROUP BY 1,\n         2,\n         3,\n         4),"

In [7]:
# Split the formatted query on newlines; later cells index into this list
# by line position.
query_list = formatted_query.split('\n')
query_list

['SELECT s.dt,',
 "       COALESCE(em.platform, 'unknown') AS platform,",
 "       COALESCE(service_org, 'other') AS service,",
 '       account',
 'FROM sku.daily_by_account s',
 'INNER JOIN mapbox_customer_data.accounts a ON s.account = a.id',
 "AND a.dt = '{run_date}'",
 'LEFT JOIN analytics_service_endpoint_mapping em ON s.sku_id = em.sku_id',
 'AND em.in_sku IS NOT NULL',
 'AND em.parent_sku IS NULL',
 "WHERE s.dt BETWEEN DATE_SUB('{run_date}', 29) AND '{run_date}'",
 'GROUP BY 1,',
 '         2,',
 '         3,',
 '         4),']

In [8]:
# Print every line of the query next to its list index, to make the
# positions computed in later cells easy to cross-check.
for line_no, line_text in enumerate(query_list):
    print(line_no, line_text)

0 SELECT s.dt,
1        COALESCE(em.platform, 'unknown') AS platform,
2        COALESCE(service_org, 'other') AS service,
3        account
4 FROM sku.daily_by_account s
5 INNER JOIN mapbox_customer_data.accounts a ON s.account = a.id
6 AND a.dt = '{run_date}'
7 LEFT JOIN analytics_service_endpoint_mapping em ON s.sku_id = em.sku_id
8 AND em.in_sku IS NOT NULL
9 AND em.parent_sku IS NULL
10 WHERE s.dt BETWEEN DATE_SUB('{run_date}', 29) AND '{run_date}'
11 GROUP BY 1,
12          2,
13          3,
14          4),


In [20]:
# Scan `query_list` and record the line indices of the clauses of interest:
#   pos_delete - index of the last line, plus every line starting with
#                ORDER or GROUP
#   pos_where  - index of the last line starting with WHERE
#                (defaults to len(query_list) when there is none)
#   pos_join   - FROM lines and LEFT / INNER / FULL OUTER / RIGHT JOIN lines
pos_delete = [len(query_list) - 1]
pos_where = len(query_list)
pos_join = []

for idx, text in enumerate(query_list):
    if text.startswith(('ORDER', 'GROUP')):
        pos_delete.append(idx)
    elif text.startswith('FROM'):
        if ' ' in text:
            # FROM with something after it on the same line.
            pos_join.append(idx)
        else:
            # Bare FROM: count the run of indented lines that follows it and
            # record the index of the last line in that run.
            indented = 0
            for following in query_list[idx + 1:]:
                if not following.startswith(' '):
                    break
                indented += 1
            pos_join.append(idx + indented)
    elif text.startswith('WHERE'):
        pos_where = idx
    elif text.startswith(('LEFT JOIN', 'INNER JOIN', 'FULL OUTER JOIN', 'RIGHT JOIN')):
        pos_join.append(idx)

# When no ORDER/GROUP line precedes the last line, the last line index is
# also recorded in pos_join.
if min(pos_delete) == len(query_list) - 1:
    pos_join.append(min(pos_delete))

In [21]:
# Display the recorded positions: FROM/JOIN line indices, the WHERE line
# index, and the last-line / ORDER / GROUP indices.
pos_join, pos_where, pos_delete

([4, 5, 7], 10, [14, 11])

In [13]:
# Pair each FROM/JOIN position with the alias position reported by
# `_get_alias_pos`, then let `unbundled` split the query on those pairs.
alias_pos = unbundled._get_alias_pos(query_list, pos_join, pos_where)
sub_query_pos = [(join_idx, alias_idx) for join_idx, alias_idx in zip(pos_join, alias_pos)]
sub_query, keep = unbundled._parse_sub_query(query_list, sub_query_pos)
main_query_pos = unbundled.main_query(query_list, sub_query_pos)

# Rebuild the 'main' entry from the reported line positions (when any are
# returned), then append the `keep` lines after a single space.
if main_query_pos != []:
    sub_query['main'] = '\n'.join(query_list[pos] for pos in main_query_pos)
sub_query['main'] += ' ' + '\n'.join(keep)


In [14]:
# Display the dict produced by `_parse_sub_query`, including the 'main' entry.
sub_query

{'s': 'sku.daily_by_account',
 'main': "SELECT s.dt,\n       COALESCE(em.platform, 'unknown') AS platform,\n       COALESCE(service_org, 'other') AS service,\n       account\nFROM sku.daily_by_account s\nINNER JOIN mapbox_customer_data.accounts a ON s.account = a.id\nAND a.dt = '{run_date}'\nLEFT JOIN analytics_service_endpoint_mapping em ON s.sku_id = em.sku_id\nAND em.in_sku IS NOT NULL\nAND em.parent_sku IS NULL\nWHERE s.dt BETWEEN DATE_SUB('{run_date}', 29) AND '{run_date}'\nGROUP BY 1,\n         2,\n         3,\n         4), "}