In [1]:
import json
import pandas as pd
from sqlanalyzer import column_parser

In [2]:
# JSON text is UTF-8 by specification; pin the encoding so the load does not
# depend on the platform's locale default.
with open('./sqlanalyzer/data.json', 'r', encoding='utf-8') as f:
    query_dict = json.load(f)

In [3]:
# Dump the loaded queries as 2-space-indented JSON; the extra '\n\n\n'
# argument pads the printed output with blank lines.
pretty_queries = json.dumps(query_dict, indent=2)
print(pretty_queries, '\n\n\n')

{
  "no alias": {
    "a": "SELECT DISTINCT anonymous_id,                    user_id    FROM mapbox_customer_data.segment_identifies    WHERE dt >= '2018-07-01'      AND anonymous_id IS NOT NULL AND user_id IS NOT NULL ",
    "b": "SELECT id,           email,           created    FROM mapbox_customer_data.accounts WHERE cast(dt AS DATE) = CURRENT_DATE - INTERVAL '1' DAY ",
    "c": "SELECT anonymous_id AS anon_id_ad,           context_campaign_name,           min(TIMESTAMP) AS min_exposure    FROM mapbox_customer_data.segment_pages    WHERE dt >= '2018-07-01'      AND context_campaign_name IS NOT NULL    GROUP BY 1, 2",
    "d": "SELECT DISTINCT anonymous_id AS anon_id_event,                    original_timestamp,                    event,                    context_traits_email    FROM mapbox_customer_data.segment_tracks    WHERE dt >= '2018-07-01'      AND event LIKE 'submitted_%form' AND context_traits_email IS NOT NULL ",
    "e": "SELECT sfdc_accounts.platform,           sfdc_acco

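## Metastore

The cells below build one DataFrame per table, pairing the table name (`db_table`) with each of its columns (`all_columns`).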

In [4]:
# Column listing for one table: the scalar table name is broadcast across
# every row of 'all_columns'.
db_fields_1 = pd.DataFrame({
    'db_table': 'mapbox_customer_data.segment_identifies',
    'all_columns': [
        'anonymous_id',
        'user_id',
        'service',
        'service_metadata',
        'service_metadata_version',
        'account',
        'num_requests',
        'dt',
    ],
})


In [5]:
# Column listing for one table: the scalar table name is broadcast across
# every row of 'all_columns'.
db_fields_2 = pd.DataFrame({
    'db_table': 'mapbox_customer_data.accounts',
    'all_columns': [
        'id',
        'user_id',
        'email',
        'created',
        'service_metadata_version',
        'account',
        'num_requests',
        'dt',
    ],
})


In [6]:
# Column listing for one table: the scalar table name is broadcast across
# every row of 'all_columns'.
db_fields_3 = pd.DataFrame({
    'db_table': 'mapbox_customer_data.segment_pages',
    'all_columns': [
        'anonymous_id',
        'context_campaign_name',
        'service',
        'service_metadata',
        'service_metadata_version',
        'account',
        'num_requests',
        'dt',
    ],
})


In [7]:
# Column listing for one table: the scalar table name is broadcast across
# every row of 'all_columns'.
db_fields_4 = pd.DataFrame({
    'db_table': 'mapbox_customer_data.segment_tracks',
    'all_columns': [
        'anonymous_id',
        'original_timestamp',
        'event',
        'context_traits_email',
        'service_metadata_version',
        'account',
        'num_requests',
        'dt',
    ],
})


In [8]:
# Column listing for one table: the scalar table name is broadcast across
# every row of 'all_columns'.
db_fields_5 = pd.DataFrame({
    'db_table': 'sfdc.cases',
    'all_columns': [
        'account',
        'num_requests',
        'owner',
        'anonymous_id',
        'id',
        'original_timestamp',
        'event',
        'context_traits_email',
        'service_metadata_version',
        'dt',
    ],
})


In [9]:
# Column listing for one table: the scalar table name is broadcast across
# every row of 'all_columns'.
db_fields_6 = pd.DataFrame({
    'db_table': 'sfdc.owner',
    'all_columns': [
        'dt',
        'first_name',
        'last_name',
    ],
})


In [10]:
# Column listing for one table: the scalar table name is broadcast across
# every row of 'all_columns'.
db_fields_7 = pd.DataFrame({
    'db_table': 'sfdc.accounts',
    'all_columns': [
        'platform',
        'case_id',
        'mobile_os',
        'service_metadata',
        'user_id',
        'first_name',
        'last_name',
    ],
})


In [11]:
# Stack the per-table column listings into one frame with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; a single
# pd.concat yields the same rows in the same order and avoids re-copying the
# accumulated frame on every step.
df = pd.concat(
    [
        db_fields_1,
        db_fields_2,
        db_fields_3,
        db_fields_4,
        db_fields_5,
        db_fields_6,
        db_fields_7,
    ],
    ignore_index=True,
)
df

Unnamed: 0,db_table,all_columns
0,mapbox_customer_data.segment_identifies,anonymous_id
1,mapbox_customer_data.segment_identifies,user_id
2,mapbox_customer_data.segment_identifies,service
3,mapbox_customer_data.segment_identifies,service_metadata
4,mapbox_customer_data.segment_identifies,service_metadata_version
5,mapbox_customer_data.segment_identifies,account
6,mapbox_customer_data.segment_identifies,num_requests
7,mapbox_customer_data.segment_identifies,dt
8,mapbox_customer_data.accounts,id
9,mapbox_customer_data.accounts,user_id


In [12]:
def extract_subquery_fields(query, db_fields):
    """Return the fields that ``column_parser`` matches for a single query.

    A ``column_parser.Parser`` is built from ``query``; the query is passed
    through its ``format_query`` and the formatted result is handed, together
    with ``db_fields``, to ``match_queried_fields``.

    Args:
        query: The query to analyse.
        db_fields: Forwarded unchanged to ``Parser.match_queried_fields``.

    Returns:
        Whatever ``Parser.match_queried_fields`` returns.
    """
    parser = column_parser.Parser(query)
    return parser.match_queried_fields(parser.format_query(query), db_fields)


In [13]:
def compile_queried_cols(query_dict, db_fields=None):
    """Collect the fields extracted from every query in ``query_dict``.

    Values of ``query_dict`` may be either a query or a dict of queries (one
    level of nesting); dict keys are ignored. Each query is passed to
    ``extract_subquery_fields`` and the results are concatenated in iteration
    order.

    Args:
        query_dict: Mapping whose values are queries or dicts of queries.
        db_fields: Table/column listing forwarded to
            ``extract_subquery_fields``. Defaults to the notebook-level ``df``
            so existing one-argument calls behave as before.

    Returns:
        A flat list containing every item returned by
        ``extract_subquery_fields`` for each query.
    """
    if db_fields is None:
        # Backward-compatible fallback to the frame built earlier in the notebook.
        db_fields = df

    all_cols = []
    for entry in query_dict.values():
        # Normalise both shapes (single query / dict of queries) to one iterable.
        queries = entry.values() if isinstance(entry, dict) else [entry]
        for query in queries:
            all_cols.extend(extract_subquery_fields(query, db_fields))
    return all_cols


In [14]:
# Run the extraction over every loaded query; as the cell's last expression,
# the returned list is displayed as the cell output.
compile_queried_cols(query_dict)

[{'database_name': 'mapbox_customer_data',
  'table_name': 'segment_identifies',
  'column_name': 'anonymous_id'},
 {'database_name': 'mapbox_customer_data',
  'table_name': 'segment_identifies',
  'column_name': 'dt'},
 {'database_name': 'mapbox_customer_data',
  'table_name': 'segment_identifies',
  'column_name': 'user_id'},
 {'database_name': 'mapbox_customer_data',
  'table_name': 'accounts',
  'column_name': 'dt'},
 {'database_name': 'mapbox_customer_data',
  'table_name': 'accounts',
  'column_name': 'created'},
 {'database_name': 'mapbox_customer_data',
  'table_name': 'accounts',
  'column_name': 'email'},
 {'database_name': 'mapbox_customer_data',
  'table_name': 'accounts',
  'column_name': 'id'},
 {'database_name': 'mapbox_customer_data',
  'table_name': 'segment_pages',
  'column_name': 'dt'},
 {'database_name': 'mapbox_customer_data',
  'table_name': 'segment_pages',
  'column_name': 'context_campaign_name'},
 {'database_name': 'mapbox_customer_data',
  'table_name': 'seg