# using sqlparse to create meta data

### instructions
- git pull latest

### limitations
- [solevd] when using spaces in tables names with reserved keyworkds, e.g. case in "tableau enc case view"
Have solved this by changing sq_metadata
- COPY TRUNCATE INSERTED not supported yet example (will only pick up CREATE and DROP): /csaa-aws-data-capability/nbn_dde_23/load/batches/d_location-rs_full_load.sql

### how does this work?
- sql files is made up of one to many sql statements
- each sql statments is made up of tokens
- tokens seperate out keyworkds, DML, DDL, names and puncutation ect.

### output
- parent child relationships of every single sql statement in every sql file (can be multiple)
- example:

In [576]:
# example output 
[{'fileName': 'Tableau MTM Glide Path Target View.sql',
  'name': 'Tableau MTM Glide Path Target View',
  'path': '/home/davidgriffiths/csaa-aws-data-capability/nbn_dde_23/tableau_views/Tableau MTM Glide Path Target View.sql',
  'depnd_tables': [],
  'parent': 'nbn_dde_23."Tableau MTM Glide Path Target View"',
  'ddl_type': 'DROP'}]

[{'fileName': 'Tableau MTM Glide Path Target View.sql',
  'name': 'Tableau MTM Glide Path Target View',
  'path': '/home/davidgriffiths/csaa-aws-data-capability/nbn_dde_23/tableau_views/Tableau MTM Glide Path Target View.sql',
  'depnd_tables': [],
  'parent': 'nbn_dde_23."Tableau MTM Glide Path Target View"',
  'ddl_type': 'DROP'}]

# setup

In [577]:
!conda info --envs

# conda environments:
#
base                     /home/davidgriffiths/miniconda3
sqlparse              *  /home/davidgriffiths/miniconda3/envs/sqlparse



In [578]:
import sql_metadata

In [579]:
import re
from typing import List, Tuple, Optional, Dict
import sqlparse

from sqlparse.sql import TokenList
from sqlparse.tokens import Name, Whitespace, Wildcard, Number, Punctuation, DDL 

# re built get_query_tokens
need to account for multiple queries in a file

In [580]:
def get_query_tokens(query: str) -> List[sqlparse.sql.Token]:
    """
    daves changes: sqlparse.parse(query) returns a tuple if multiple queries ";" need to handle this
    :type query str
    :rtype: list[{queryNoWithinFile str,  tokens: list[sqlparse.sql.Token]}
    """
    query = sql_metadata.preprocess_query(query)
    parsed = sqlparse.parse(query)

    # handle empty queries (#12)
    if not parsed:
        return []

    queries_list_and_tokens =[]
    n=0
    for qry in parsed:
        tokens = TokenList(qry.tokens).flatten()
        # print([(token.value, token.ttype) for token in tokens])

        dic = {"queryNoWithinFile":n,"tokens":[token for token in tokens if token.ttype is not Whitespace]}
        queries_list_and_tokens.append(dic)
        n = n+1
    return queries_list_and_tokens

# re built get_query_tables
i remvoed the quote removal

In [581]:
def get_query_tables(query: str, dev=False) -> List[str]:
    """
    :type query str
    :rtype: list[str]
    """
    

    table_syntax_keywords = [
        # SELECT queries
        'FROM', 'WHERE', 'JOIN', 'INNER JOIN', 'FULL JOIN', 'FULL OUTER JOIN',
        'LEFT OUTER JOIN', 'RIGHT OUTER JOIN',
        'LEFT JOIN', 'RIGHT JOIN', 'ON',
        # INSERT queries
        'INTO', 'VALUES',
        # UPDATE queries
        'UPDATE', 'SET',
        # Hive queries
        'TABLE',  # INSERT TABLE
        
    ]
    
#     print(sql_metadata.get_query_tokens(query))
#     query = query.replace('"', '')
    query = sqlparse.format(query, strip_comments=True).strip()
#     print(query)
    queries_list_and_tokens = get_query_tokens(query)
    
    queries_list_and_tokens_parsed =[]
    
    for query_and_tokens in queries_list_and_tokens:
        parent_table_list = []
        parent_table_counter = 0 # increments to 2 to get schema and table
        tables = []
        last_keyword = None
        last_ddl =None
        keep_last_ddl = None
        
        for index, token in enumerate(query_and_tokens['tokens']):
            if dev==True: print([token, token.ttype, last_keyword,last_ddl, token.value.upper(),parent_table_counter, last_ddl])
            if token.is_keyword and token.value.upper() in table_syntax_keywords:
                # keep the name of the last keyword, the next one can be a table name
                last_keyword = token.value.upper()
#                 print('keyword', last_keyword)
            elif str(token.ttype) == 'Token.Keyword.DDL':
                # keep the name of the last create, the next one can be annother table to create
                last_ddl = token.value.upper()
                keep_last_ddl= token.value.upper()
#                 print('keyword.lld', token.value.upper())
            elif str(token) == '(':
                # reset the last_keyword for INSERT `foo` VALUES(id, bar) ...
                last_keyword = None
            elif token.is_keyword and str(token) in ['FORCE', 'ORDER', 'GROUP BY']:
                # reset the last_keyword for queries like:
                # "SELECT x FORCE INDEX"
                # "SELECT x ORDER BY"
                # "SELECT x FROM y GROUP BY x"
                last_keyword = None
            elif token.is_keyword and str(token) == 'SELECT' and last_keyword in ['INTO', 'TABLE']:
                # reset the last_keyword for "INSERT INTO SELECT" and "INSERT TABLE SELECT" queries
                last_keyword = None
            elif (token.ttype is Name or str(token.ttype) == 'Token.Literal.String.Symbol') and last_ddl in ['CREATE','DROP']:
#                 print("found a CREATE or DROP")
                parent_table_list.append(str(token))
                parent_table_counter = parent_table_counter + 1
                if parent_table_counter > 1:
                    last_ddl = None
                    parent_table_counter = 0
#                 parent_table = sql_metadata._update_table_names(tables, query_and_tokens['tokens'], index, last_keyword)
            elif token.ttype is Name or token.ttype or token.is_keyword:
                tables = sql_metadata._update_table_names(tables, query_and_tokens['tokens'], index, last_keyword)
        unique_tables = sql_metadata.unique(tables)
        dic = {
            "queryNoWithinFile": query_and_tokens['queryNoWithinFile'],
            "tables": unique_tables,
            "parent_table": '.'.join(parent_table_list),
            "ddl_type": keep_last_ddl
        }
        queries_list_and_tokens_parsed.append(dic)

    return queries_list_and_tokens_parsed

# working with multiple files

In [582]:
import os
from os import path
from pathlib import Path

# get all sql files

In [583]:
files = os.listdir()

In [584]:
files_filtered = list(filter(lambda x: re.search(r"^.*\.(sql)$", x), files))

# get all sql view depedncies

### get all files in RAPID

In [585]:
rapid_dirs = ['nbn_dde_23/load/batches'] # ['nbn_dde_23/tableau_views'] # ['tableau_dde/transform/views'] # ['nbn_dde_23/tableau_views','tableau_dde/transform/views']

In [586]:
rapid_files = []
for rapid_dir in rapid_dirs:
    for dirpath, dirname, filenames in os.walk(os.path.join('/home/davidgriffiths/csaa-aws-data-capability',rapid_dir)):
    #     print('Current path:', dirpath)
    #     print('Directories:', dirname)
    #     print('Files:', filenames)
        for file in filenames:
            file_dict = {
                "fileName": file,
                "path": os.path.join(dirpath,file),
            }
            rapid_files.append(file_dict)
rapid_files[0:3]

[{'fileName': 'd_location-rs_full_load.sql',
  'path': '/home/davidgriffiths/csaa-aws-data-capability/nbn_dde_23/load/batches/d_location-rs_full_load.sql'},
 {'fileName': 'ful_nhur_v-rs_delta_load.sql',
  'path': '/home/davidgriffiths/csaa-aws-data-capability/nbn_dde_23/load/batches/ful_nhur_v-rs_delta_load.sql'},
 {'fileName': 'td_initial_aged_incident_snapshot-rs_full_load.sql',
  'path': '/home/davidgriffiths/csaa-aws-data-capability/nbn_dde_23/load/batches/td_initial_aged_incident_snapshot-rs_full_load.sql'}]

### keep only .sql files

In [587]:
rapid_files_filtered = list(filter(lambda x: re.search(r"^.*\.(sql)$", x['fileName']), rapid_files))
rapid_files_filtered[0:5]

[{'fileName': 'd_location-rs_full_load.sql',
  'path': '/home/davidgriffiths/csaa-aws-data-capability/nbn_dde_23/load/batches/d_location-rs_full_load.sql'},
 {'fileName': 'ful_nhur_v-rs_delta_load.sql',
  'path': '/home/davidgriffiths/csaa-aws-data-capability/nbn_dde_23/load/batches/ful_nhur_v-rs_delta_load.sql'},
 {'fileName': 'td_initial_aged_incident_snapshot-rs_full_load.sql',
  'path': '/home/davidgriffiths/csaa-aws-data-capability/nbn_dde_23/load/batches/td_initial_aged_incident_snapshot-rs_full_load.sql'},
 {'fileName': 'cc_hwm_max_nbnquotacapview_v-rs_delta_load.sql',
  'path': '/home/davidgriffiths/csaa-aws-data-capability/nbn_dde_23/load/batches/cc_hwm_max_nbnquotacapview_v-rs_delta_load.sql'},
 {'fileName': 'a_ful_order_item_v-rs_delta_load.sql',
  'path': '/home/davidgriffiths/csaa-aws-data-capability/nbn_dde_23/load/batches/a_ful_order_item_v-rs_delta_load.sql'}]

## loop through files

In [588]:
list_of_rapid_sql_views = []

for file in rapid_files_filtered:
    with open(file['path'], 'r') as f:
        query = f.read()
        queries_and_tables = get_query_tables(query,False)
    
        for qry in queries_and_tables:
            
            dic = {
                'fileName': file['fileName'],
                'name': os.path.splitext(file['fileName'])[0],
                'path': file['path'].replace("/home/davidgriffiths", ""),
                'depnd_tables': qry['tables'],
                'parent': qry["parent_table"],
                'ddl_type': qry['ddl_type']
            }

            list_of_rapid_sql_views.append(dic) 
        

list_of_rapid_sql_views[0:5]

[{'fileName': 'd_location-rs_full_load.sql',
  'name': 'd_location-rs_full_load',
  'path': '/csaa-aws-data-capability/nbn_dde_23/load/batches/d_location-rs_full_load.sql',
  'depnd_tables': ['nbn_dde_23.IMPT_D_LOCATION_T'],
  'parent': '',
  'ddl_type': None},
 {'fileName': 'd_location-rs_full_load.sql',
  'name': 'd_location-rs_full_load',
  'path': '/csaa-aws-data-capability/nbn_dde_23/load/batches/d_location-rs_full_load.sql',
  'depnd_tables': ["'s3://csaa-redshift-ingest-pr/historical/d_location.manifest'"],
  'parent': '',
  'ddl_type': None},
 {'fileName': 'd_location-rs_full_load.sql',
  'name': 'd_location-rs_full_load',
  'path': '/csaa-aws-data-capability/nbn_dde_23/load/batches/d_location-rs_full_load.sql',
  'depnd_tables': ['nbn_dde_23.IMPT_D_LOCATION_T'],
  'parent': '',
  'ddl_type': None},
 {'fileName': 'd_location-rs_full_load.sql',
  'name': 'd_location-rs_full_load',
  'path': '/csaa-aws-data-capability/nbn_dde_23/load/batches/d_location-rs_full_load.sql',
  'depnd

# write out as JSON

In [589]:
import json

In [590]:
# # loads json from string
# datajsonstr=json.loads('{"str":"str"}')
# datajsonstr

In [591]:
# stringify = json.dumps(list_of_rapid_sql_views, indent=2)
# stringify

In [592]:
# with open('jsonfile.json') as f:
#     data = json.load(f)

In [593]:
with open('list_of_rapid_sql_views.json','w') as f:
    json.dump(list_of_rapid_sql_views, f)

# test out funny files

In [594]:
filenamedave = '/home/davidgriffiths/csaa-aws-data-capability/nbn_dde_23/tableau_views/Tableau Incident WO Links View.sql'

In [595]:
with open(filenamedave, 'r') as f:
        query = f.read()
#         print(query)
        tables = get_query_tables(query)
        print(tables)

[{'queryNoWithinFile': 0, 'tables': [], 'parent_table': 'nbn_dde_23."Tableau Incident WO Links View"', 'ddl_type': 'DROP'}, {'queryNoWithinFile': 1, 'tables': ['nbn_dde_23.CSA_INCIDENT_LINKS_WO_T'], 'parent_table': '', 'ddl_type': 'CREATE  OR REPLACE'}]


# problem sqlparse.parse doesnt do multiple queries

In [596]:
with open(filenamedave, 'r') as f:
        query = f.read()
#         print(query)
#         tables = get_query_tables(query)
#         print(tables)

        # is parsing within sql_metadata.get_query_tokens working
#         res = sqlparse.parse(query)
#         print(res[1]) ##  this second query isnt getting picked up
        
        print(get_query_tokens(query))
        
        

[{'queryNoWithinFile': 0, 'tokens': [<DDL 'drop' at 0x7F7FDDAFCF48>, <Keyword 'view' at 0x7F7FDDAF2108>, <Keyword 'if' at 0x7F7FDDAF21C8>, <Keyword 'exists' at 0x7F7FDDAF2288>, <Name 'nbn_dd...' at 0x7F7FDDAF2348>, <Punctuation '.' at 0x7F7FDDAF23A8>, <Symbol '"Table...' at 0x7F7FDDAF2408>, <Punctuation ';' at 0x7F7FDDAF2468>]}, {'queryNoWithinFile': 1, 'tokens': [<DDL 'CREATE...' at 0x7F7FDDAF2588>, <Keyword 'VIEW' at 0x7F7FDDAF2648>, <Name 'nbn_dd...' at 0x7F7FDDAF2708>, <Punctuation '.' at 0x7F7FDDAF2768>, <Symbol '"Table...' at 0x7F7FDDAF27C8>, <Keyword 'AS' at 0x7F7FDDAF2888>, <DML 'SELECT' at 0x7F7FDDAF2948>, <Name 'INC_WO...' at 0x7F7FDDAF2A08>, <Punctuation ',' at 0x7F7FDDAF2D68>, <Name 'INCIDE...' at 0x7F7FDDAF2E28>, <Punctuation ',' at 0x7F7FDDAF41C8>, <Name 'APPOIN...' at 0x7F7FDDAF4288>, <Punctuation ',' at 0x7F7FDDAF45E8>, <Name 'WORK_R...' at 0x7F7FDDAF46A8>, <Punctuation ',' at 0x7F7FDDAF4A08>, <Name 'WORK_O...' at 0x7F7FDDAF4AC8>, <Punctuation ',' at 0x7F7FDDAF4E28>, <N