In [25]:
from sqlanalyzer import column_parser
import re
import json
import pandas as pd

### debug

In [389]:
# Sample SQL used to exercise the sub-query splitter below: a derived table
# aliased `a`, two joins against plain tables (`sfdc_accounts`, `b`) and a
# derived-table join aliased `c`, followed by WHERE / GROUP BY / ORDER BY.
# NOTE(review): alias `b` is referenced in the first LEFT JOIN's ON clause
# before `b` itself is joined; fine as a parser fixture, but confirm it is
# intentional.
query = """SELECT
      b.*
      FROM
      (
        SELECT
        MAX(dt) AS dt
        FROM
        analytics.service_endpoint_mapping
      ) a
      LEFT JOIN sfdc.accounts sfdc_accounts ON sfdc_accounts.id = b.id
      INNER JOIN
      analytics.service_endpoint_mapping b
      ON
      a.dt = b.dt
      AND a.id = b.id
      LEFT JOIN (
        SELECT
        account,
        CONCAT_WS(',', COLLECT_SET(LOWER(service))) AS service
        FROM
        web_mobile_studio
        GROUP BY
        account
      ) c
      ON
      a.account = c.account AND c.service = 'studio'
      WHERE a.id = '123'
      AND b.id = '234'
      GROUP BY 1,2,3
      ORDER BY 3,4
"""


In [390]:
print(query)

SELECT
      b.*
      FROM
      (
        SELECT
        MAX(dt) AS dt
        FROM
        analytics.service_endpoint_mapping
      ) a
      LEFT JOIN sfdc.accounts sfdc_accounts ON sfdc_accounts.id = b.id
      INNER JOIN
      analytics.service_endpoint_mapping b
      ON
      a.dt = b.dt
      AND a.id = b.id
      LEFT JOIN (
        SELECT
        account,
        CONCAT_WS(',', COLLECT_SET(LOWER(service))) AS service
        FROM
        web_mobile_studio
        GROUP BY
        account
      ) c
      ON
      a.account = c.account AND c.service = 'studio'
      WHERE a.id = '123'
      AND b.id = '234'
      GROUP BY 1,2,3
      ORDER BY 3,4



In [391]:
def within(num, rng):
    """Return 1 when ``num`` falls inside the span covered by ``rng``, else 0.

    The span runs from ``min(rng)`` to ``max(rng)``, both ends included.
    A degenerate span (minimum equal to maximum) contains nothing, so the
    result is 0 even when ``num`` equals that single value.
    """
    lower, upper = min(rng), max(rng)
    inside = lower <= num <= upper and lower < upper
    return 1 if inside else 0


def is_cte(query):
    """Tell whether ``query`` begins with the upper-case keyword ``WITH``.

    The check is a plain prefix test: it is case-sensitive and does not skip
    leading whitespace.
    """
    cte_keyword = 'WITH'
    return query.startswith(cte_keyword)


def clean_dict(query_dict):
    """Collapse ``{'main': x}`` values of ``query_dict`` down to ``x``.

    Any value that is a dict whose only key is ``'main'`` is replaced by the
    value stored under that key. The dict is modified in place and the same
    object is returned.
    """
    for key in query_dict:
        value = query_dict[key]
        # A dict with exactly one key, and that key is 'main'.
        if isinstance(value, dict) and list(value) == ['main']:
            query_dict[key] = value['main']

    return query_dict


In [392]:
def main_query(query_list, sub_query_pos):
    """Return the indices of ``query_list`` not covered by any sub-query span.

    ``sub_query_pos`` is an iterable of (start, end) pairs. An index is kept
    when ``within`` reports 0 for every pair, i.e. it lies in none of the
    spans.
    """
    uncovered = []
    for line_no, _ in enumerate(query_list):
        hits = sum(within(line_no, span) for span in sorted(sub_query_pos))
        if not hits:
            uncovered.append(line_no)

    return uncovered


def _get_joins_pos(query_list):

    pos_delete, pos_where = [len(query_list)-1], len(query_list)
    pos_join = []
    for i, line in enumerate(query_list):
        if line.startswith('ORDER') or line.startswith('GROUP'):
            pos_delete.append(i)
        elif line.startswith('FROM') and len(line.split(' ')) > 1:
            pos_join.append(i)

        elif line.startswith('FROM') and len(line.split(' ')) == 1:
            pos_join.append(i)
            for i2,line2 in enumerate(query_list[i+1:]):
                if line2.startswith(' '): pos_join.append(i2+i+1)
                else: break

        elif line.startswith('WHERE'):
            pos_where = i
        elif line.startswith('LEFT JOIN') or line.startswith('INNER JOIN') or line.startswith('FULL OUTER JOIN') or line.startswith('RIGHT JOIN'):
            pos_join.append(i)
            for i3,line3 in enumerate(query_list[i+1:]):
                if line3.startswith(' '): pos_join.append(i3+i+1)
                else: break

    if min(pos_delete) == len(query_list)-1:
        pos_join.append(min(pos_delete))
    else:
        pass

    return sorted(list(set(pos_join))), pos_where


def _get_alias_pos(query_list, pos_join, pos_where):
    """Derive candidate end-line indices for the entries of ``pos_join``.

    ``delevel`` zips the returned list against ``pos_join`` to form
    (start, end) spans.

    Args:
        query_list: lines of the formatted query.
        pos_join: join/from line indices, as returned by ``_get_joins_pos``.
        pos_where: index of the WHERE line, as returned by ``_get_joins_pos``.

    Returns:
        A sorted list of unique indices. Because duplicates are collapsed,
        its length can differ from ``len(pos_join)``.

    Raises:
        StopIteration: if ``pos_join`` is empty (the initial ``next`` call).
    """

    # Skip the first join position; each later next() yields the following
    # entry of pos_join (in step with ``i`` only while every earlier
    # iteration also advanced the iterator).
    pos_join_list = iter(pos_join)
    next(pos_join_list)
    alias_pos = []

    # When the first recorded position is a FROM line, it is itself recorded.
    if query_list[pos_join[0]].startswith('FROM'):
        alias_pos.append(pos_join[0])

    for i in range(len(pos_join)-1):
        if i < len(pos_join)-2 and pos_join[i] < pos_where:
            # Not the last iteration and still before WHERE: record the index
            # two lines before the next position taken from the iterator.
            end_pos = next(pos_join_list)-1
            alias_pos.append(end_pos-1)

        elif pos_join[-1] >= pos_where:
            # The iterator is advanced, but end_pos is not used afterwards;
            # the line just before WHERE is recorded.
            end_pos = next(pos_join_list)-1
            alias_pos.append(pos_where - 1)

        else:
            # end_pos is assigned but not used; the line just before WHERE is
            # recorded here as well.
            end_pos = pos_join[-1]
            alias_pos.append(pos_where-1)

    # De-duplicate and order the collected indices.
    alias_pos = sorted(list(set(alias_pos)))

    return alias_pos


def _parse_sub_query(query_list, sub_query_pos):

    sub_query = {}
    keep = []
    for _, sub_pos in enumerate(sub_query_pos):
        alias = query_list[sub_pos[1]]
        query = query_list[sub_pos[0]: sub_pos[1]]

        try:
            alias_list_rev = alias.split(' ')[::-1]
            if alias_list_rev[0][-1] != ')':
                alias_index = alias_list_rev.index('ON')
                alias = alias_list_rev[alias_index+1]

                if alias_list_rev[alias_index+2] == 'AS':
                    keep.append(' '.join(alias_list_rev[:alias_index+3][::-1]))
                    del alias_list_rev[:alias_index+3]

                else:
                    keep.append(' '.join(alias_list_rev[:alias_index+2][::-1]))
                    del alias_list_rev[:alias_index+2]

                query.append(' '.join(alias_list_rev[::-1]).rstrip(r'\)').lstrip(' '))

            else:
                alias_list_rev[0] = alias_list_rev[0].rstrip(r'\)')
                alias = 'no alias'
                query.append(' '.join(alias_list_rev[::-1]))

        except:
            query.append(' '.join(alias.split(' ')[:-1]).rstrip(r'\)').lstrip(' '))
            alias = alias.split(' ')[-1]

        trans_query = ' '.join(query).lstrip(r' \(').lstrip(' FROM')

        if trans_query == '':
            sub_query = {}
        else:
            sub_query[alias] = trans_query

    return sub_query, keep


def delevel(query_list):
    """Split one level of a formatted query into its sub-queries and main part.

    Args:
        query_list: lines of the formatted query.

    Returns:
        The dict produced by ``_parse_sub_query`` (alias -> sub-query text)
        with an extra ``'main'`` entry: the lines that fall outside every
        sub-query span joined by newlines, followed by a space and the
        ``alias ON ...`` fragments joined by newlines.
    """

    pos_join, pos_where = _get_joins_pos(query_list)
    alias_pos = _get_alias_pos(query_list, pos_join, pos_where)
    sub_query_pos = list(zip(pos_join, alias_pos))
    sub_query, keep = _parse_sub_query(query_list, sub_query_pos)

    main_query_pos = main_query(query_list, sub_query_pos)
    if main_query_pos:
        main_text = '\n'.join(query_list[p] for p in main_query_pos)
    else:
        # No line lies outside the spans. Fall back to any existing 'main'
        # entry (or an empty string) instead of raising KeyError on the
        # lookup below.
        main_text = sub_query.get('main', '')
    sub_query['main'] = main_text + ' ' + '\n'.join(keep)

    return sub_query


def has_child(formatted_query):
    """Report whether ``delevel`` finds anything besides a single entry.

    The query is split on newlines and passed to ``delevel``. The result is
    True when the returned dict has more than one key and at least one of its
    values is not an empty dict.
    """

    pieces = delevel(formatted_query.split('\n'))
    if len(pieces) <= 1:
        return False

    return any(part != {} for part in pieces.values())




In [393]:
# def extract_query_dict(query):
# (presumably the cells below are the body of this function, run step by step
# while debugging -- TODO confirm)

# Format the raw SQL with the sqlanalyzer column_parser and split the result
# into lines; the next cell prints the lines with their indices.
formatter = column_parser.Parser(query)
formatted_query = formatter.format_query(query)
query_list_0 = formatted_query.split('\n')
query_dict = {}


In [394]:
for i,line in enumerate(query_list_0): print(i, line)

0 SELECT b.*
1 FROM
2   (SELECT MAX(dt) AS dt
3    FROM analytics.service_endpoint_mapping) a
4 LEFT JOIN sfdc.accounts sfdc_accounts ON sfdc_accounts.id = b.id
5 INNER JOIN analytics.service_endpoint_mapping b ON a.dt = b.dt
6 AND a.id = b.id
7 LEFT JOIN
8   (SELECT account,
9           CONCAT_WS(',', COLLECT_SET(LOWER(service))) AS service
10    FROM web_mobile_studio
11    GROUP BY account) c ON a.account = c.account
12 AND c.service = 'studio'
13 WHERE a.id = '123'
14   AND b.id = '234'
15 GROUP BY 1,
16          2,
17          3
18 ORDER BY 3,
19          4


In [395]:
# sub_query = delevel(query_list_0)
# query_dict = sub_query
# query_dict

In [396]:
query_list = query_list_0

In [397]:
# delevel, step by step: first locate the FROM/JOIN lines and the WHERE line.
sub_query = {}
pos_join, pos_where = _get_joins_pos(query_list)
# Display both values as the cell output.
pos_join, pos_where

([1, 2, 3, 4, 5, 7, 8, 9, 10, 11], 13)

In [398]:
# alias_pos = _get_alias_pos(query_list, pos_join, pos_where)
# alias_pos

## new: start get_joins

In [410]:
for i, l in enumerate(query_list): print(i, l)

0 SELECT b.*
1 FROM
2   (SELECT MAX(dt) AS dt
3    FROM analytics.service_endpoint_mapping) a
4 LEFT JOIN sfdc.accounts sfdc_accounts ON sfdc_accounts.id = b.id
5 INNER JOIN analytics.service_endpoint_mapping b ON a.dt = b.dt
6 AND a.id = b.id
7 LEFT JOIN
8   (SELECT account,
9           CONCAT_WS(',', COLLECT_SET(LOWER(service))) AS service
10    FROM web_mobile_studio
11    GROUP BY account) c ON a.account = c.account
12 AND c.service = 'studio'
13 WHERE a.id = '123'
14   AND b.id = '234'
15 GROUP BY 1,
16          2,
17          3
18 ORDER BY 3,
19          4


In [413]:
# Single pass over the formatted lines: remember the WHERE line and every
# line that starts a GROUP/ORDER clause. The last line index is the fallback
# end marker when no GROUP/ORDER clause exists.
pos_where = len(query_list)
pos_delete = [len(query_list) - 1]

for i, line in enumerate(query_list):
    if line.startswith(('ORDER', 'GROUP')):
        pos_delete.append(i)
    elif line.startswith('WHERE'):
        pos_where = i

# The query body ends at the earliest of the recorded markers.
end_of_query = min(pos_delete)


In [443]:
# Split a copy of the formatted lines into the "main" part and the join part.
join_dict = {}
copy_query_list = query_list.copy()
# First line that does NOT start with FROM. If every line starts with FROM,
# the 'end of query' sentinel is returned and the .index() call below raises
# ValueError.
main = next((s for s in copy_query_list if not s.startswith('FROM')), 'end of query')
main_pos = copy_query_list.index(main)
# NOTE(review): this rebinds `main_query`, shadowing the main_query()
# function defined earlier in the notebook; later calls to main_query(...)
# fail once this cell has run.
main_query = copy_query_list[:main_pos+1]
# Append the WHERE lines (from pos_where up to, not including, end_of_query).
main_query.extend(copy_query_list[pos_where:end_of_query])
# Drop the leading main lines, then everything from WHERE onwards (the index
# is shifted by the number of lines just removed).
del copy_query_list[:main_pos+1]
del copy_query_list[(pos_where-main_pos-1):]
           

In [446]:
# Pull the first FROM source (the FROM line plus its indented continuation
# lines) out of copy_query_list and store it as join_dict['join_1'].
# NOTE(review): copy_query_list is modified while it is being iterated, so
# the loop indices no longer match the original positions after the first
# FROM is handled.
sub_join = []
for i, line in enumerate(copy_query_list): 
    
    if line.startswith('FROM'):
        sub_join.append(line)
        del copy_query_list[:i+1]
        # First remaining line that is not space-indented. If there is none,
        # the 'end of query' sentinel makes the .index() call raise
        # ValueError.
        first_join = next((s for s in copy_query_list if not s.startswith(' ')), 'end of query')
        first_join_pos = copy_query_list.index(first_join)
        sub_join.extend(copy_query_list[:first_join_pos])
        # NOTE(review): the +1 also removes the first non-indented line
        # itself from copy_query_list -- confirm that is intended.
        del copy_query_list[:first_join_pos+1]
        
join_dict['join_1'] = ' '.join(sub_join)


In [444]:
main_query

['SELECT b.*', "WHERE a.id = '123'", "  AND b.id = '234'"]

In [445]:
copy_query_list

['INNER JOIN analytics.service_endpoint_mapping b ON a.dt = b.dt',
 'AND a.id = b.id',
 'LEFT JOIN',
 '  (SELECT account,',
 "          CONCAT_WS(',', COLLECT_SET(LOWER(service))) AS service",
 '   FROM web_mobile_studio',
 '   GROUP BY account) c ON a.account = c.account',
 "AND c.service = 'studio'"]

In [409]:
join_dict

{'join_1': 'FROM   (SELECT MAX(dt) AS dt    FROM analytics.service_endpoint_mapping) a'}

## end get_joins

In [319]:
for i,line in enumerate(query_list_0): print(i, line)

0 SELECT b.*
1 FROM
2   (SELECT MAX(dt) AS dt
3    FROM analytics.service_endpoint_mapping) a
4 LEFT JOIN sfdc.accounts sfdc_accounts ON sfdc_accounts.id = b.id
5 INNER JOIN analytics.service_endpoint_mapping b ON a.dt = b.dt
6 AND a.id = b.id
7 LEFT JOIN
8   (SELECT account,
9           CONCAT_WS(',', COLLECT_SET(LOWER(service))) AS service
10    FROM web_mobile_studio
11    GROUP BY account) c ON a.account = c.account
12 AND c.service = 'studio'


In [320]:
sub_query_pos

[(1, 1), (2, 4), (3, 5)]

In [318]:
# Pair each join position with an alias position and parse the spans.
# NOTE(review): in the visible cells `alias_pos` is only assigned inside a
# commented-out cell, so this relies on a value left in the kernel by an
# earlier run.
sub_query_pos = list(zip(pos_join, alias_pos))
sub_query, keep = _parse_sub_query(query_list, sub_query_pos)
sub_query, keep, sub_query_pos


({'sfdc_accounts': 'SELECT MAX(dt) AS dt    FROM analytics.service_endpoint_mapping) a LEFT JOIN sfdc.accounts',
  'b': 'analytics.service_endpoint_mapping) a LEFT JOIN sfdc.accounts sfdc_accounts ON sfdc_accounts.id = b.id INNER JOIN analytics.service_endpoint_mapping'},
 ['sfdc_accounts ON sfdc_accounts.id = b.id', 'b ON a.dt = b.dt'],
 [(1, 1), (2, 4), (3, 5)])

In [203]:
# Indices of the lines that belong to no sub-query span.
# NOTE(review): another cell in this notebook rebinds `main_query` to a list;
# if that cell has run, this call fails because the function is shadowed.
main_query_pos = main_query(query_list, sub_query_pos)
main_query_pos

[0, 1, 4]

In [204]:
# Build the 'main' entry from the uncovered lines, then append the kept
# "alias ON ..." fragments.
# NOTE(review): when main_query_pos is empty and sub_query has no 'main' key
# yet, the last line raises KeyError.
if main_query_pos != []:
    sub_query['main'] = '\n'.join([query_list[p] for p in main_query_pos])
sub_query['main'] = sub_query['main'] + ' ' + '\n'.join(keep)

In [205]:
sub_query

{'a': 'SELECT MAX(dt) AS dt FROM analytics.service_endpoint_mapping',
 'b': 'SELECT *    FROM sfdc.accounts) AS sfdc_accounts ON sfdc_accounts.dt = a.dt AND sfdc_accounts.id = a.acct_id LEFT JOIN analytics.service_endpoint_mapping',
 'sfdc_accounts': 'sfdc.accounts',
 'main': 'SELECT b.*\nFROM\nINNER JOIN b ON a.dt = b.dt\nAS sfdc_accounts ON sfdc_accounts.dt = a.dt'}