In [1]:
from sqlanalyzer import column_parser
import pandas as pd

## given db metadata

In [None]:
# Column inventory for wbr.map_requests_by_account: one row per column, with
# the scalar table name broadcast down the `db_table` column.
db_fields_1 = pd.DataFrame(
    {
        'db_table': 'wbr.map_requests_by_account',
        'all_columns': [
            'platform',
            'mobile_os',
            'service',
            'service_metadata',
            'service_metadata_version',
            'account',
            'num_requests',
            'dt',
        ],
    }
)
db_fields_1


In [None]:
# Column inventory for mapbox_customer_data.styles: one row per column, with
# the scalar table name broadcast down the `db_table` column.
db_fields_2 = pd.DataFrame(
    {
        'db_table': 'mapbox_customer_data.styles',
        'all_columns': [
            'id',
            'owner',
            'metadata',
            'sources',
        ],
    }
)
db_fields_2


In [None]:
# Column inventory for sfdc.accounts: one row per column, with the scalar
# table name broadcast down the `db_table` column.
db_fields_3 = pd.DataFrame(
    {
        'db_table': 'sfdc.accounts',
        'all_columns': [
            'dt',
            'customer_tier_c',
            'csm_c',
            'name',
            'mapbox_username_c',
            'x18_digit_account_id_c',
        ],
    }
)
db_fields_3


In [None]:
# Column inventory for sfdc.users: one row per column, with the scalar table
# name broadcast down the `db_table` column.
db_fields_4 = pd.DataFrame(
    {
        'db_table': 'sfdc.users',
        'all_columns': [
            'dt',
            'name',
            'id',
        ],
    }
)
db_fields_4


In [None]:
# Stack the first two per-table metadata frames into one frame with a fresh
# 0..n-1 index. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
df = pd.concat([db_fields_1, db_fields_2], ignore_index=True)

In [None]:
# Combine all four per-table metadata frames into one frame with a fresh
# 0..n-1 index. pd.concat replaces the chained DataFrame.append calls
# (deprecated in pandas 1.4, removed in 2.0). Building from the four source
# frames rather than from `df` itself yields the same rows in the same order
# and keeps this cell idempotent when it is re-run.
df = pd.concat(
    [db_fields_1, db_fields_2, db_fields_3, db_fields_4],
    ignore_index=True,
)

In [None]:
# Bind the combined metadata frame to the name that the column-matching step
# passes to the parser. This is an alias of `df`, not a copy.
db_fields = df

In [None]:
# Display the combined table/column metadata frame.
db_fields

## given query

In [14]:
# Sample production-style query: requests joined to styles, accounts and users
# through aliased subqueries. The SQL was previously wrapped in stray double
# quotes (a fourth `"` after the opening triple quote and one after the final
# parenthesis), which became part of the query text; they are removed here and
# the statement is laid out on real lines instead of inline "\n" escapes.
query = """SELECT u.name,
       b.customer_tier_c,
       b.name,
       m.account,
       b.x18_digit_account_id_c,
       s.id,
       m.platform,
       m.mobile_os,
       m.num_requests,
       Row_number() OVER(PARTITION BY s.id) row_
FROM wbr.map_requests_by_account m
INNER JOIN
  (SELECT DISTINCT id
   FROM mapbox_customer_data.styles
   WHERE cast(dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY
     AND sources LIKE '%mapbox-streets-v7%' ) s ON m.service_metadata_version = s.id
LEFT JOIN
  (SELECT customer_tier_c,
          csm_c,
          name,
          mapbox_username_c,
          x18_digit_account_id_c
   FROM sfdc.accounts
   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) b ON m.account = b.mapbox_username_c
LEFT JOIN
  (SELECT name,
          id
   FROM sfdc.users
   WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ) u ON b.csm_c = u.id
WHERE cast(m.dt AS DATE) >= CURRENT_DATE - INTERVAL '14' DAY
  AND m.service_metadata = 'custom'
  AND m.service = 'styles'
  AND b.customer_tier_c IN ('Tier 0',
                            'Tier 1',
                            'Tier 2',
                            'Tier 3',
                            'Tier 4')
"""

## transformation 1: format query

In [None]:
# Deliberately unformatted sample query (select list on one line, join split
# across lines) used as input for the formatting step. Written as explicit
# pieces so the trailing space after "accounts" stays visible.
query = (
    "SELECT api.name, acct.customer_tier_c, acct.name FROM api_requests_by_account api\n"
    "LEFT JOIN accounts \n"
    "acct ON api.user_id = acct.customer_api_id\n"
)

In [None]:
# Instantiate the sqlanalyzer parser with the raw query text.
formatter = column_parser.Parser(query)

In [None]:
# Run the raw query through format_query() and print the result; print() is
# used so that newlines in the returned text render as line breaks.
formatted = formatter.format_query(query)
print(formatted)

## transformation 2: separate CTEs

In [11]:
# Sample query with two CTEs (`a`, `b`) that the outer SELECT joins; it is the
# input for the CTE-separation step. The literal's whitespace is kept exactly
# as-is because the parser output recorded in this notebook reflects it.
query = """WITH a AS
  (SELECT DISTINCT anonymous_id,
                   user_id
   FROM customer_data.segment_identifies
   WHERE dt >= '2018-07-01'),
     b AS
  (SELECT id,
          email,
          created
   FROM customer_data.accounts)
SELECT a.*,
       b.*
FROM a
LEFT JOIN b ON a.user_id = b.id
WHERE context_campaign_name IS NOT NULL
"""

In [3]:
# Build a parser for the CTE query, pass the query through format_query(),
# and print the returned text so its newlines render as line breaks.
formatter = column_parser.Parser(query)
formatted = formatter.format_query(query)
print(formatted)

WITH a AS
  (SELECT DISTINCT anonymous_id,
                   user_id
   FROM customer_data.segment_identifies
   WHERE dt >= '2018-07-01'),
     b AS
  (SELECT id,
          email,
          created
   FROM customer_data.accounts)
SELECT a.*,
       b.*
FROM a
LEFT JOIN b ON a.user_id = b.id
WHERE context_campaign_name IS NOT NULL


In [6]:
# Split the query into its parts with parse_cte(). The recorded output of this
# cell shows a dict keyed by CTE alias ('a', 'b') plus 'main_query' for the
# outer SELECT.
formatter = column_parser.Parser(query)
cte_query = formatter.parse_cte(query)
cte_query

{'a': "SELECT DISTINCT anonymous_id,\n                   user_id\n   FROM customer_data.segment_identifies\n   WHERE dt >= '2018-07-01'",
 'b': 'SELECT id,\n          email,\n          created\n   FROM customer_data.accounts',
 'main_query': 'SELECT a.*,\n       b.*\nFROM a\nLEFT JOIN b ON a.user_id = b.id\nWHERE context_campaign_name IS NOT NULL\n'}

In [5]:
# Inspect which keys parse_cte() produced.
cte_query.keys()

dict_keys(['a', 'b', 'main_query'])

In [None]:
# Re-format the outer SELECT and parse it again. The dict returned by
# parse_cte() stores the outer SELECT under 'main_query' (keys recorded in
# this notebook: 'a', 'b', 'main_query'); the former 'main' lookup raised
# KeyError.
# NOTE(review): this cell overwrites `cte_query`, so re-running it operates on
# its own previous result — confirm that is intended.
formatted = formatter.format_query(cte_query['main_query'])
cte_query = formatter.parse_cte(formatted)
cte_query

In [None]:
# Apply the same format-then-parse step to the current `cte_query`. The outer
# SELECT is looked up as 'main_query', the key parse_cte() is recorded as
# producing in this notebook; the former 'main' lookup raised KeyError.
# NOTE(review): assumes parse_cte() also emits 'main_query' for a query that
# has no CTEs left — TODO confirm.
formatted = formatter.format_query(cte_query['main_query'])
cte_query = formatter.parse_cte(formatted)
cte_query

## transformation 3: match table aliases

In [27]:
# Sample query joining three schema-qualified tables, each given a one-letter
# alias (m, s, u); it is the input for the alias-matching step.
query = """SELECT *
FROM api_requests.requests_by_account m
INNER JOIN mapbox_customer_data.styles s ON m.metadata_version = s.id
LEFT JOIN sfdc.users u ON m.csm = u.id
"""

In [28]:
# Build a parser for the aliased-join query and keep the text returned by
# format_query() for the alias lookup.
formatter = column_parser.Parser(query)
formatted = formatter.format_query(query)

In [29]:
# print() so that newlines in the formatted SQL render as line breaks.
print(formatted)

SELECT *
FROM api_requests.requests_by_account m
INNER JOIN mapbox_customer_data.styles s ON m.metadata_version = s.id
LEFT JOIN sfdc.users u ON m.csm = u.id


In [30]:
# get_table_names() is given the formatted SQL as a list of lines; the
# recorded output of this cell is a dict mapping each alias to its table.
formatted_lines = formatted.split('\n')
table_alias_mapping = formatter.get_table_names(formatted_lines)
table_alias_mapping

{'m': 'api_requests.requests_by_account',
 's': 'mapbox_customer_data.styles',
 'u': 'sfdc.users'}

## transformation 4: find columns

In [None]:
# Pass the current query and the combined table/column metadata frame to
# match_queried_fields().
# NOTE(review): presumably this resolves the columns the query references
# against `db_fields`; the return shape is not visible here — confirm.
fields = formatter.match_queried_fields(query, db_fields)

In [None]:
# Tabulate the matched fields and order them by database, then table, then
# column for display.
sort_keys = ['database_name', 'table_name', 'column_name']
fields_frame = pd.DataFrame(fields)
fields_frame.sort_values(by=sort_keys)

## bonus transformation: upload other query metadata (such as timestamp, user)