In [2]:
from sqlanalyzer import column_parser
import pandas as pd

## given db metadata

In [4]:
# Metadata fixture for wbr.map_requests_by_account: one row per column name.
# 'db_table' is a scalar, so pandas repeats it on every row of the frame.
db_fields_1 = pd.DataFrame(
    {
        'db_table': 'wbr.map_requests_by_account',
        'all_columns': [
            'platform',
            'mobile_os',
            'service',
            'service_metadata',
            'service_metadata_version',
            'account',
            'num_requests',
            'dt',
        ],
    }
)
db_fields_1


Unnamed: 0,db_table,all_columns
0,wbr.map_requests_by_account,platform
1,wbr.map_requests_by_account,mobile_os
2,wbr.map_requests_by_account,service
3,wbr.map_requests_by_account,service_metadata
4,wbr.map_requests_by_account,service_metadata_version
5,wbr.map_requests_by_account,account
6,wbr.map_requests_by_account,num_requests
7,wbr.map_requests_by_account,dt


In [5]:
# Metadata fixture for mapbox_customer_data.styles: one row per column name.
# 'db_table' is a scalar, so pandas repeats it on every row of the frame.
db_fields_2 = pd.DataFrame(
    {
        'db_table': 'mapbox_customer_data.styles',
        'all_columns': [
            'id',
            'owner',
            'metadata',
            'sources',
        ],
    }
)
db_fields_2


Unnamed: 0,db_table,all_columns
0,mapbox_customer_data.styles,id
1,mapbox_customer_data.styles,owner
2,mapbox_customer_data.styles,metadata
3,mapbox_customer_data.styles,sources


In [9]:
# Metadata fixture for sfdc.accounts: one row per column name.
# 'db_table' is a scalar, so pandas repeats it on every row of the frame.
db_fields_3 = pd.DataFrame(
    {
        'db_table': 'sfdc.accounts',
        'all_columns': [
            'dt',
            'customer_tier_c',
            'csm_c',
            'name',
            'mapbox_username_c',
            'x18_digit_account_id_c',
        ],
    }
)
db_fields_3


Unnamed: 0,db_table,all_columns
0,sfdc.accounts,dt
1,sfdc.accounts,customer_tier_c
2,sfdc.accounts,csm_c
3,sfdc.accounts,name
4,sfdc.accounts,mapbox_username_c
5,sfdc.accounts,x18_digit_account_id_c


In [8]:
# Metadata fixture for sfdc.users: one row per column name.
# 'db_table' is a scalar, so pandas repeats it on every row of the frame.
db_fields_4 = pd.DataFrame(
    {
        'db_table': 'sfdc.users',
        'all_columns': [
            'dt',
            'name',
            'id',
        ],
    }
)
db_fields_4


Unnamed: 0,db_table,all_columns
0,sfdc.users,dt
1,sfdc.users,name
2,sfdc.users,id


In [12]:
# Stack the first two per-table metadata frames into one frame.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; ignore_index=True renumbers the rows 0..n-1 as before.
df = pd.concat([db_fields_1, db_fields_2], ignore_index=True)

In [13]:
# Add the two sfdc metadata frames to the running frame in a single step.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; one concat of all three frames yields the same rows, in the
# same order, as two successive appends, with the index renumbered 0..n-1.
df = pd.concat([df, db_fields_3, db_fields_4], ignore_index=True)

In [15]:
# Give the combined metadata frame its final name. This is a plain name
# binding (no copy): db_fields and df refer to the same DataFrame object.
db_fields = df

In [28]:
# Display the combined table/column metadata frame.
db_fields

Unnamed: 0,db_table,all_columns
0,wbr.map_requests_by_account,platform
1,wbr.map_requests_by_account,mobile_os
2,wbr.map_requests_by_account,service
3,wbr.map_requests_by_account,service_metadata
4,wbr.map_requests_by_account,service_metadata_version
5,wbr.map_requests_by_account,account
6,wbr.map_requests_by_account,num_requests
7,wbr.map_requests_by_account,dt
8,mapbox_customer_data.styles,id
9,mapbox_customer_data.styles,owner


## given query

In [16]:
# Sample SQL to analyze, kept as a multi-line literal so it reads as written.
# The text must begin directly at SELECT with no wrapping quote characters
# (a leading/trailing '"' would become part of the SQL itself), and it ends
# with a single trailing newline.
query = """SELECT u.name,
       b.customer_tier_c,
       b.name,
       m.account,
       b.x18_digit_account_id_c,
       s.id,
       m.platform,
       m.mobile_os,
       m.num_requests,
       Row_number() OVER(PARTITION BY s.id) row_
FROM wbr.map_requests_by_account m
INNER JOIN
  (SELECT DISTINCT id
   FROM mapbox_customer_data.styles
   WHERE cast(dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY
     AND sources LIKE '%mapbox-streets-v7%' ) s ON m.service_metadata_version = s.id
LEFT JOIN
  (SELECT customer_tier_c,
          csm_c,
          name,
          mapbox_username_c,
          x18_digit_account_id_c
   FROM sfdc.accounts
   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON m.account = b.mapbox_username_c
LEFT JOIN
  (SELECT name,
          id
   FROM sfdc.users
   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) u ON b.csm_c = u.id
WHERE cast(m.dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY
  AND m.service_metadata = 'custom'
  AND m.service = 'styles'
  AND b.customer_tier_c IN ('Tier 0',
                            'Tier 1',
                            'Tier 2',
                            'Tier 3',
                            'Tier 4')
"""

## transformation 1: format query

In [18]:
# Instantiate the project's Parser with the raw query text. The later cells
# call format_query, parse_cte, get_table_names and match_queried_fields on
# this one instance.
formatter = column_parser.Parser(query)

In [19]:
# Format the query text. Note the query is passed again here even though the
# Parser was already constructed with it.
formatted = formatter.format_query(query)
# print (rather than a bare expression) so newlines render as line breaks.
print(formatted)

SELECT u.name,
       b.customer_tier_c,
       b.name,
       m.account,
       b.x18_digit_account_id_c,
       s.id,
       m.platform,
       m.mobile_os,
       m.num_requests,
       Row_number() OVER(PARTITION BY s.id) row_
FROM wbr.map_requests_by_account m
INNER JOIN
  (SELECT DISTINCT id
   FROM mapbox_customer_data.styles
   WHERE cast(dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY
     AND sources LIKE '%mapbox-streets-v7%' ) s ON m.service_metadata_version = s.id
LEFT JOIN
  (SELECT customer_tier_c,
          csm_c,
          name,
          mapbox_username_c,
          x18_digit_account_id_c
   FROM sfdc.accounts
   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON m.account = b.mapbox_username_c
LEFT JOIN
  (SELECT name,
          id
   FROM sfdc.users
   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) u ON b.csm_c = u.id
WHERE cast(m.dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY
  AND m.service_metadata = 'custom'
  AND m.service = 'styles'
 

## transformation 2: separate CTEs

In [20]:
# Split the formatted query into its parts. For this query the displayed
# result is a dict holding a single 'main' entry (there is no WITH clause).
cte_query = formatter.parse_cte(formatted)
cte_query

{'main': "SELECT u.name,\n       b.customer_tier_c,\n       b.name,\n       m.account,\n       b.x18_digit_account_id_c,\n       s.id,\n       m.platform,\n       m.mobile_os,\n       m.num_requests,\n       Row_number() OVER(PARTITION BY s.id) row_\nFROM wbr.map_requests_by_account m\nINNER JOIN\n  (SELECT DISTINCT id\n   FROM mapbox_customer_data.styles\n   WHERE cast(dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY\n     AND sources LIKE '%mapbox-streets-v7%' ) s ON m.service_metadata_version = s.id\nLEFT JOIN\n  (SELECT customer_tier_c,\n          csm_c,\n          name,\n          mapbox_username_c,\n          x18_digit_account_id_c\n   FROM sfdc.accounts\n   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON m.account = b.mapbox_username_c\nLEFT JOIN\n  (SELECT name,\n          id\n   FROM sfdc.users\n   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) u ON b.csm_c = u.id\nWHERE cast(m.dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY\n  AND m.service_metadata

In [21]:
# List which query parts were found.
cte_query.keys()

dict_keys(['main'])

In [22]:
# Re-format the 'main' part and parse it again, rebinding both `formatted`
# and `cte_query` (so this cell depends on, and overwrites, earlier state).
# NOTE(review): presumably meant to unpeel nested CTEs one level per run —
# confirm; for this query the displayed result still has only 'main'.
formatted = formatter.format_query(cte_query['main'])
cte_query = formatter.parse_cte(formatted)
cte_query

{'main': "SELECT u.name,\n       b.customer_tier_c,\n       b.name,\n       m.account,\n       b.x18_digit_account_id_c,\n       s.id,\n       m.platform,\n       m.mobile_os,\n       m.num_requests,\n       Row_number() OVER(PARTITION BY s.id) row_\nFROM wbr.map_requests_by_account m\nINNER JOIN\n  (SELECT DISTINCT id\n   FROM mapbox_customer_data.styles\n   WHERE cast(dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY\n     AND sources LIKE '%mapbox-streets-v7%' ) s ON m.service_metadata_version = s.id\nLEFT JOIN\n  (SELECT customer_tier_c,\n          csm_c,\n          name,\n          mapbox_username_c,\n          x18_digit_account_id_c\n   FROM sfdc.accounts\n   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON m.account = b.mapbox_username_c\nLEFT JOIN\n  (SELECT name,\n          id\n   FROM sfdc.users\n   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) u ON b.csm_c = u.id\nWHERE cast(m.dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY\n  AND m.service_metadata

In [23]:
# Same format-then-parse step applied once more to the current 'main' part;
# `formatted` and `cte_query` are rebound again.
# NOTE(review): this cell repeats the statements of an earlier cell verbatim —
# confirm whether the extra pass is needed or is a leftover.
formatted = formatter.format_query(cte_query['main'])
cte_query = formatter.parse_cte(formatted)
cte_query

{'main': "SELECT u.name,\n       b.customer_tier_c,\n       b.name,\n       m.account,\n       b.x18_digit_account_id_c,\n       s.id,\n       m.platform,\n       m.mobile_os,\n       m.num_requests,\n       Row_number() OVER(PARTITION BY s.id) row_\nFROM wbr.map_requests_by_account m\nINNER JOIN\n  (SELECT DISTINCT id\n   FROM mapbox_customer_data.styles\n   WHERE cast(dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY\n     AND sources LIKE '%mapbox-streets-v7%' ) s ON m.service_metadata_version = s.id\nLEFT JOIN\n  (SELECT customer_tier_c,\n          csm_c,\n          name,\n          mapbox_username_c,\n          x18_digit_account_id_c\n   FROM sfdc.accounts\n   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON m.account = b.mapbox_username_c\nLEFT JOIN\n  (SELECT name,\n          id\n   FROM sfdc.users\n   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) u ON b.csm_c = u.id\nWHERE cast(m.dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY\n  AND m.service_metadata

## transformation 3: match table aliases

In [24]:
# get_table_names is given the formatted query as a list of lines. In the
# displayed result, the aliased table is keyed by its alias ('m'), while the
# tables read inside subqueries are keyed by their own full name.
table_alias_mapping = formatter.get_table_names(formatted.split('\n'))
table_alias_mapping

{'m': 'wbr.map_requests_by_account',
 'mapbox_customer_data.styles': 'mapbox_customer_data.styles',
 'sfdc.accounts': 'sfdc.accounts',
 'sfdc.users': 'sfdc.users'}

## transformation 4: find columns

In [29]:
# Match the columns used in the query against the known table metadata.
# Note this passes the raw `query`, not the `formatted` text built earlier.
fields = formatter.match_queried_fields(query, db_fields)

In [33]:
# Show the matched fields as a table, ordered by database, table, then column
# so the listing is easy to scan.
pd.DataFrame(fields).sort_values(by=['database_name', 'table_name', 'column_name'])

Unnamed: 0,database_name,table_name,column_name
17,mapbox_customer_data,styles,id
13,mapbox_customer_data,styles,sources
2,sfdc,accounts,csm_c
3,sfdc,accounts,customer_tier_c
6,sfdc,accounts,dt
15,sfdc,accounts,mapbox_username_c
4,sfdc,accounts,name
8,sfdc,users,dt
9,sfdc,users,id
16,sfdc,users,name


## bonus transformation: upload other query metadata (such as timestamp, user)