In [31]:
import json
import pandas as pd
from sqlanalyzer import column_parser

In [61]:
# Read the SQL text inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open('active_devs.sql') as sql_file:
    query = sql_file.read()

# The parser is constructed from the raw query; the formatted text is then
# fed to the CTE parsing step.
formatter = column_parser.Parser(query)
formatted_query = formatter.format_query(query)
cte_queries = formatter.parse_cte(formatted_query)


In [64]:
# Use print() so the formatted SQL is shown with real line breaks rather than as an escaped string repr.
print(formatted_query)

WITH analytics_service_endpoint_mapping AS
  (SELECT b.*
   FROM
     (SELECT MAX(dt) AS dt
      FROM analytics.service_endpoint_mapping) a
   INNER JOIN analytics.service_endpoint_mapping b ON a.dt = b.dt),
     web_dev AS
  (SELECT s.dt,
          COALESCE(em.platform, 'unknown') AS platform,
          COALESCE(service_org, 'other') AS service,
          account
   FROM sku.daily_by_account s
   INNER JOIN mapbox_customer_data.accounts a ON s.account = a.id
   AND a.dt = '{run_date}'
   LEFT JOIN analytics_service_endpoint_mapping em ON s.sku_id = em.sku_id
   AND em.in_sku IS NOT NULL
   AND em.parent_sku IS NULL
   WHERE s.dt BETWEEN DATE_SUB('{run_date}', 29) AND '{run_date}'
   GROUP BY 1,
            2,
            3,
            4),
     mobile_dev AS
  (SELECT dt,
          'mobile' AS platform,
          CASE
              WHEN LOWER(useragent) RLIKE '(mapboxeventsnavigation|navigation-)' THEN 'navigation'
              WHEN LOWER(useragent) RLIKE '(mapboxeventsunity|mapbox-

In [15]:
# Load the nested query dictionary from disk.
# NOTE(review): data.json is presumably the saved result of the CTE parsing step — confirm its provenance.
with open('./data.json', 'r') as f:
    query_dict = json.load(f)

In [16]:
# Pretty-print the loaded structure (2-space indent), followed by blank lines to separate it from later output.
print(json.dumps(query_dict, indent=2), '\n\n\n')

{
  "analytics_service_endpoint_mapping": {
    "b": {
      "a": "analytics.service_endpoint_mapping",
      "analytics.service_endpoint_mapping": "INNER JOIN",
      "main": "SELECT MAX(dt) AS dt\nFROM analytics.service_endpoint_mapping) a\nINNER JOIN analytics.service_endpoint_mapping "
    },
    "main": "SELECT b.*\nFROM b ON a.dt = b.dt),"
  },
  "web_dev": {
    "s": "sku.daily_by_account",
    "main": "SELECT s.dt,\n       COALESCE(em.platform, 'unknown') AS platform,\n       COALESCE(service_org, 'other') AS service,\n       account\nFROM sku.daily_by_account s\nINNER JOIN mapbox_customer_data.accounts a ON s.account = a.id\nAND a.dt = '{run_date}'\nLEFT JOIN analytics_service_endpoint_mapping em ON s.sku_id = em.sku_id\nAND em.in_sku IS NOT NULL\nAND em.parent_sku IS NULL\nWHERE s.dt BETWEEN DATE_SUB('{run_date}', 29) AND '{run_date}'\nGROUP BY 1,\n         2,\n         3,\n         4), "
  },
  "mobile_dev": {
    "main": "SELECT dt,\n       'mobile' AS platform,\n       CAS

In [17]:
# List the top-level entries of the query dictionary.
query_dict.keys()

dict_keys(['analytics_service_endpoint_mapping', 'web_dev', 'mobile_dev', 'studio_dev', 'web_mobile_studio', 'mau_cube', 'wau_cube', 'dau_cube', 'main'])

## Metastore: column listings for the queried tables

In [18]:
# Metastore column listing for analytics.service_endpoint_mapping.
# The scalar table name is broadcast, so the frame has one row per column.
db_fields_1 = pd.DataFrame(
    {
        'db_table': 'analytics.service_endpoint_mapping',
        'all_columns': [
            'parent_sku',
            'sku_id',
            'service_org',
            'platform',
            'is_in_cf',
            'is_in_stats',
            'is_in_stripe',
            'in_sku',
            'is_exempt',
            'is_sfdc_mapped',
            'is_billable',
            'divisor_mapviews',
            'dt',
        ],
    }
)


In [19]:
# Metastore column listing for sku.daily_by_account.
# The scalar table name is broadcast, so the frame has one row per column.
db_fields_2 = pd.DataFrame(
    {
        'db_table': 'sku.daily_by_account',
        'all_columns': [
            'account',
            'sku_id',
            'subunits',
            'dynamo_hash',
            'dynamo_range',
            'received',
            'dt',
        ],
    }
)


In [20]:
# Metastore column listing for mapbox_customer_data.accounts.
# The scalar table name is broadcast, so the frame has one row per column.
db_fields_3 = pd.DataFrame(
    {
        'db_table': 'mapbox_customer_data.accounts',
        'all_columns': [
            'id',
            'accountlevel',
            'created',
            'flags',
            'collection',
            'resetmod',
            'modified',
            'passmod',
            'email',
            'lastlogin',
            'extrastorage',
            'extratm2z',
            'customerid',
            'website',
            'flags_v3allow',
            'description',
            'storage',
            'alerted',
            'name',
            'chargifycustomerid',
            'flags_patch',
            'flags_directions',
            'flags_surface',
            'flags_apigl',
            'flags_tm2z',
            'flags_datasets',
            'flags_geocoder_permanent',
            'flags_token_resources',
            'flags_order',
            'confirmed',
            'mfakey',
            'flags_styles',
            'flags_rawdata',
            'flags_extrastorage',
            'contact',
            'destroyed',
            'chargifysubscriptionid',
            'atlas',
            'group',
            'limits_stylecount',
            'billingemail',
            'limits_geocodingrate',
            'billingname',
            'limits',
            'url',
            'limits_stylesrate',
            'limits_gltilerate',
            'mfarecoverycode',
            'datasetstorage',
            'received_at',
            'limits_datasetsrate',
            'rownumber',
            'limits_mapsrate',
            'flags_hide_improve_map',
            'flags_whitelabel_map',
            'limits_uploadsrate',
            'limits_optimizedtripsrate',
            'limits_glstaticrate',
            'limits_tokensrate',
            'limits_directionsrate',
            'limits_mapmatchingrate',
            'limits_directionstrafficrate',
            'speedcheck_s',
            'limits_directionscoordinatescount',
            'limits_directionsmatrixcoordinatescount',
            'limits_mapmatchingcoordinatescount',
            'limits_mapmatchingtrafficrate',
            'limits_directionstrafficcoordinatescount',
            'limits_distancerate',
            'flags_enterprise_boundaries',
            'limits_optimizedtripscoordinatescount',
            'limits_matrixtrafficcoordinatescount',
            'limits_matrixtrafficrate',
            'limits_matrixrate',
            'limits_matrixcoordinatescount',
            'limits_customerswriterate',
            'limits_directionstilesrate',
            'flags_high_res_print',
            'limits_datasetswriterate',
            'contact_test',
            'contact_tset',
            'atlasv2_expires',
            'flags_atlas_beta',
            'atlasv2_features',
            'flags_sso_private_beta',
            'ssoissuer',
            'subscriptionskus_essentialsupport_quantity',
            'dt',
        ],
    }
)


In [21]:
# Metastore column listing for sdk_events.appuserturnstile.
# The scalar table name is broadcast, so the frame has one row per column.
db_fields_4 = pd.DataFrame(
    {
        'db_table': 'sdk_events.appuserturnstile',
        'all_columns': [
            'event',
            'created',
            'userid',
            'enabled_telemetry',
            'received',
            'token',
            'owner',
            'authorization',
            'useragent',
            'version',
            'createdoffset',
            'device',
            'sdkversion',
            'operatingsystem',
            'sdkidentifier',
            'receivedat',
            'devicetimestamp',
            'vendorid',
            'ip',
            'skuid',
            'locationauthorization',
            'locationenabled',
            'dt',
            'hr',
        ],
    }
)


In [22]:
# Metastore column listing for mapbox_customer_data.segment_tracks.
# The scalar table name is broadcast, so the frame has one row per column.
db_fields_5 = pd.DataFrame(
    {
        'db_table': 'mapbox_customer_data.segment_tracks',
        'all_columns': [
            'id',
            'run_id',
            'anonymous_id',
            'context_campaign_content',
            'context_campaign_expid',
            'context_campaign_medium',
            'context_campaign_name',
            'context_campaign_referrer',
            'context_campaign_source',
            'context_campaign_swu',
            'context_campaign_term',
            'context_ip',
            'context_library_name',
            'context_library_version',
            'context_page_path',
            'context_page_referrer',
            'context_page_search',
            'context_page_title',
            'context_page_url',
            'context_user_agent',
            'event',
            'event_text',
            'original_timestamp',
            'received_at',
            'sent_at',
            'user_id',
            'timestamp',
            'context_campaign_utm',
            'context_referrer_id',
            'context_referrer_type',
            'context_campaign_utm_campaign',
            'context_campaign_utm_content',
            'context_campaign_utm_medium',
            'context_campaign_id',
            'context_campaign_c',
            'uuid_ts',
            'context_integration_name',
            'context_integration_version',
            'context_traits_email',
            'context_traits_avatar',
            'context_traits_created_at',
            'context_traits_name',
            'context_traits_plan',
            'context_quantcast',
            'context_campaign_thumbnail',
            'context_campaign_title',
            'context_campaign_source2',
            'context_campaign_nooverride',
            'context_campaign_offer',
            'context_campaign_soure',
            'context_campaign_account',
            'context_campaign_email',
            'context_campaign_item',
            'context_campaign_keyword',
            'context_campaign_rec',
            'context_campaign_campfxaign',
            'context_campaign_admin',
            'context_campaign_jobid',
            'context_campaign_send',
            'context_campaign_targeting',
            'context_campaign_widget_id',
            'context_campaign_boost_id',
            'context_campaign_content_id',
            'context_campaign_member',
            'context_campaign_c1ampaign',
            'context_campaign_oi',
            'context_amp_id',
            'context_campaign_medum',
            'context_track_style_meta',
            'context_locale',
            'dt',
        ],
    }
)


In [23]:
# Metastore column listing for logs.cloudfront_logs_china_to_global_proxy_30_days.
# The scalar table name is broadcast, so the frame has one row per column.
db_fields_6 = pd.DataFrame(
    {
        'db_table': 'logs.cloudfront_logs_china_to_global_proxy_30_days',
        'all_columns': [
            'logdate',
            'logtime',
            'edge',
            'bytessent',
            'cip',
            'method',
            'host',
            'uri',
            'status',
            'creferrer',
            'useragent',
            'cs_uri_query',
            'cookie',
            'x_edge_result_type',
            'x_edge_request_id',
            'x_host_header',
            'protocol',
            'cs_bytes',
            'time_taken',
            'x_forwarded_for',
            'ssl_protocol',
            'ssl_cipher',
            'x_edge_response_result_type',
            'cs_protocol_version',
            'service',
            'api_version',
            'account',
            'token',
            'agent',
            'country',
            'resource',
            'query',
            'sku_token',
            'sku_id',
            'dt',
            'hr',
            'load_dt',
        ],
    }
)


In [24]:
# Stack the per-table column listings into a single metastore lookup frame.
# `DataFrame.append` was deprecated in pandas 1.4 and removed in 2.0, and
# chaining it re-copies the growing frame on every call; a single
# `pd.concat` yields the same rows with a fresh 0..n-1 index.
df = pd.concat(
    [
        db_fields_1,
        db_fields_2,
        db_fields_3,
        db_fields_4,
        db_fields_5,
        db_fields_6,
    ],
    ignore_index=True,
)
df

Unnamed: 0,db_table,all_columns
0,analytics.service_endpoint_mapping,parent_sku
1,analytics.service_endpoint_mapping,sku_id
2,analytics.service_endpoint_mapping,service_org
3,analytics.service_endpoint_mapping,platform
4,analytics.service_endpoint_mapping,is_in_cf
...,...,...
235,logs.cloudfront_logs_china_to_global_proxy_30_...,sku_token
236,logs.cloudfront_logs_china_to_global_proxy_30_...,sku_id
237,logs.cloudfront_logs_china_to_global_proxy_30_...,dt
238,logs.cloudfront_logs_china_to_global_proxy_30_...,hr


In [25]:
def extract_subquery_fields(query, db_fields):
    """Match the fields used by one SQL (sub)query against known table columns.

    Args:
        query: SQL text of a single (sub)query.
        db_fields: Table/column listing handed unchanged to
            ``Parser.match_queried_fields`` (in this notebook, a frame with
            ``db_table`` and ``all_columns`` columns).

    Returns:
        The result of ``Parser.match_queried_fields`` for the formatted query.
    """
    parser = column_parser.Parser(query)
    # Matching runs on the formatted text, not on the raw query.
    return parser.match_queried_fields(parser.format_query(query), db_fields)


In [26]:
def compile_queried_cols(query_dict, db_fields=None):
    """Collect the matched fields for every query string in a nested dict.

    Values that are dicts are walked recursively, so sub-queries nested at any
    depth are handled; every other value is treated as SQL text and passed to
    ``extract_subquery_fields``.

    Args:
        query_dict: Mapping whose values are either SQL strings or further
            mappings of the same shape.
        db_fields: Table/column listing to match against. Defaults to the
            notebook-level ``df`` so existing one-argument calls keep working.

    Returns:
        A flat list of all matched fields, in dictionary iteration order.
    """
    if db_fields is None:
        # Backward-compatible fallback to the metastore frame built earlier.
        db_fields = df

    all_cols = []
    for value in query_dict.values():
        if isinstance(value, dict):
            # Recurse instead of hard-coding a single extra nesting level.
            all_cols.extend(compile_queried_cols(value, db_fields))
        else:
            all_cols.extend(extract_subquery_fields(value, db_fields))
    return all_cols


In [27]:
# Report the matched metastore columns for each top-level entry: nested
# entries go through compile_queried_cols, plain SQL strings are matched directly.
for k, v in query_dict.items():
    matched = (
        compile_queried_cols(v)
        if isinstance(v, dict)
        else extract_subquery_fields(v, df)
    )
    print(k, '\n', matched, '\n\n')

analytics_service_endpoint_mapping 
 [{'database_name': 'analytics', 'table_name': 'service_endpoint_mapping', 'column_name': 'dt'}] 


web_dev 
 [{'database_name': 'sku', 'table_name': 'daily_by_account', 'column_name': 'dt'}, {'database_name': 'sku', 'table_name': 'daily_by_account', 'column_name': 'account'}, {'database_name': 'sku', 'table_name': 'daily_by_account', 'column_name': 'sku_id'}, {'database_name': 'mapbox_customer_data', 'table_name': 'accounts', 'column_name': 'dt'}, {'database_name': 'mapbox_customer_data', 'table_name': 'accounts', 'column_name': 'id'}] 


mobile_dev 
 [{'database_name': 'sdk_events', 'table_name': 'appuserturnstile', 'column_name': 'sdkidentifier'}, {'database_name': 'sdk_events', 'table_name': 'appuserturnstile', 'column_name': 'useragent'}, {'database_name': 'sdk_events', 'table_name': 'appuserturnstile', 'column_name': 'dt'}] 


studio_dev 
 [{'database_name': 'mapbox_customer_data', 'table_name': 'segment_tracks', 'column_name': 'dt'}, {'databas

In [29]:
# Pick a single top-level entry to inspect on its own.
v = query_dict['analytics_service_endpoint_mapping']

In [30]:
# Collect the matched metastore columns for the selected entry, including its nested sub-queries.
compile_queried_cols(v)

[{'database_name': 'analytics',
  'table_name': 'service_endpoint_mapping',
  'column_name': 'dt'}]