In [1]:
!pip install sql-metadata



0) sql_metadata
https://pypi.org/project/sql-metadata/
https://github.com/macbre/sql-metadata

1) re
https://grisha.org/blog/2016/11/14/table-names-from-sql/

2) sqlparse
https://github.com/andialbrecht/sqlparse/blob/master/examples/extract_table_names.py

3) stackoverflow
https://stackoverflow.com/questions/60822203/how-to-parse-any-sql-get-columns-names-and-table-name-using-sql-parser-in-python

In [20]:
# This allows multiple outputs from a single jupyter notebook cell:
# with "all", the value of every top-level expression statement in a cell is
# displayed, not only the value of the last one.
# NOTE(review): the recorded execution count of this cell (20) is out of
# sequence; cells that show several outputs rely on it having been run first.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [4]:
from sql_metadata import Parser

# extract raw sql-metadata tokens
tokens = Parser("SELECT * FROM foo").tokens
# tokens are SQLToken objects (not plain strings); their values are:
# ['SELECT', '*', 'FROM', 'foo']

In [5]:
# number of tokens produced for the query
len(tokens)

4

In [6]:
# inspect the first token (a SQLToken) and the attributes shown in its repr
tokens[0]

SQLToken(position=0,value=SELECT,is_keyword=True,is_name=False,is_punctuation=False,is_dot=False,is_wildcard=False,is_integer=False,is_float=False,is_comment=False,is_as_keyword=False,is_left_parenthesis=False,is_right_parenthesis=False,last_keyword=None,next_token=*,previous_token=,subquery_level=0,token_type=None,is_in_nested_function=False,parenthesis_level=0,is_subquery_start=False,is_subquery_end=False,is_with_query_start=False,is_with_query_end=False,is_with_columns_start=False,is_with_columns_end=False,is_nested_function_start=False,is_nested_function_end=False,is_column_definition_start=False,is_column_definition_end=False,is_create_table_columns_declaration_start=False,is_create_table_columns_declaration_end=False,is_partition_clause_start=False,is_partition_clause_end=False)

In [7]:
# pair each token's text with its keyword flag
[(token.value, token.is_keyword) for token in tokens]

[('SELECT', True), ('*', False), ('FROM', True), ('foo', False)]

## Extracting columns/alias from query

In [8]:
# get the columns referenced by a query - for more examples see
# `tests/test_getting_columns.py` (presumably in the sql-metadata repository)
Parser("SELECT test, id FROM foo, bar").columns
# ['test', 'id']

['test', 'id']

In [9]:
# columns named in an INSERT statement (the SQL comment and backticks are ignored)
Parser(
    "INSERT /* VoteHelper::addVote xxx */  INTO `page_vote` (article_id,user_id,`time`) "
    "VALUES ('442001','27574631','20180228130846')"
).columns
# ['article_id', 'user_id', 'time']

['article_id', 'user_id', 'time']

In [10]:
parser = Parser(
    "SELECT a.* FROM product_a.users AS a "
    "JOIN product_b.users AS b ON a.ip_address = b.ip_address"
)

# table aliases (a, b) are automatically resolved to the full table names
parser.columns
# ['product_a.users.*', 'product_a.users.ip_address', 'product_b.users.ip_address']

['product_a.users.*',
 'product_a.users.ip_address',
 'product_b.users.ip_address']

In [11]:
# columns can also be extracted together with their place in the query:
# the result is a dict of lists divided into select, where, order_by, group_by,
# join, insert and update (only the sections present in the query appear)
parser.columns_dict
# {'select': ['product_a.users.*'], 'join': ['product_a.users.ip_address', 'product_b.users.ip_address']}

{'select': ['product_a.users.*'],
 'join': ['product_a.users.ip_address', 'product_b.users.ip_address']}

In [12]:
parser = Parser("SELECT a, (b + c - u) as alias1, custome_func(d) alias2 from aa, bb order by alias1")

# note that the columns list does not contain the aliases of the columns,
# only the real columns they are built from
parser.columns
# ["a", "b", "c", "u", "d"]

['a', 'b', 'c', 'u', 'd']

In [13]:
# the alias names themselves are still available separately
parser.columns_aliases_names
# ["alias1", "alias2"]

['alias1', 'alias2']

In [14]:
# aliases are resolved to the columns which they refer to:
# a list when the alias covers several columns, a plain string for a single one
parser.columns_aliases
# {"alias1": ["b", "c", "u"], "alias2": "d"}

{'alias1': ['b', 'c', 'u'], 'alias2': 'd'}

In [15]:
# aliases can also be extracted grouped by the section of the query in which they are used
parser.columns_aliases_dict
# {"select": ["alias1", "alias2"], "order_by": ["alias1"]}

{'select': ['alias1', 'alias2'], 'order_by': ['alias1']}

In [16]:
# alias resolution also applies per query section in columns_dict:
# ORDER BY mentions only alias1, yet it is reported as the columns behind that alias
assert parser.columns_dict == {
    'select': ['a', 'b', 'c', 'u', 'd'],
    'order_by': ['b', 'c', 'u'],
}

## Extracting tables from query

In [17]:
# list the tables a query reads from - more examples in `tests/test_getting_tables.py`
Parser(
    "SELECT a.* FROM product_a.users AS a "
    "JOIN product_b.users AS b ON a.ip_address = b.ip_address"
).tables
# ['product_a.users', 'product_b.users']

['product_a.users', 'product_b.users']

In [18]:
# comma-separated tables in FROM are each reported
Parser("SELECT test, id FROM foo, bar").tables
# ['foo', 'bar']

['foo', 'bar']

In [21]:
# you can also extract aliases of the tables as a dictionary
parser = Parser("SELECT f.test FROM foo AS f")

# get table aliases (alias -> real table name)
parser.tables_aliases
# {'f': 'foo'}

# note that aliases are auto-resolved for columns
parser.columns
# ["foo.test"]

{'f': 'foo'}

['foo.test']

## Extracting values from insert query

In [24]:
# INSERT statement with explicit column names
parser = Parser(
    "INSERT /* VoteHelper::addVote xxx */  "
    "INTO `db_test.page_vote` (article_id,user_id,`time`) "
    "VALUES ('442001','27574631','20180228130846')"
)

In [25]:
# target table of the INSERT, reported without the backticks
parser.tables

['db_test.page_vote']

In [26]:
# extract values from query
parser.values
# ["442001", "27574631", "20180228130846"]

# extract a dictionary with column-value pairs
parser.values_dict
# {"article_id": "442001", "user_id": "27574631", "time": "20180228130846"}

# when the INSERT does not list column names, placeholder names are generated
# (column_1, column_2, ...); note that `parser` is rebound to this new query
parser = Parser(
    "INSERT IGNORE INTO `table` VALUES (9, 2.15, '123', '2017-01-01');"
)

# target table of the INSERT
parser.tables
# ['table']

# numeric literals come back as numbers, quoted literals as strings
parser.values
# [9, 2.15, "123", "2017-01-01"]

parser.values_dict
# {"column_1": 9, "column_2": 2.15, "column_3": "123", "column_4": "2017-01-01"}

['442001', '27574631', '20180228130846']

{'article_id': '442001', 'user_id': '27574631', 'time': '20180228130846'}

['table']

[9, 2.15, '123', '2017-01-01']

{'column_1': 9, 'column_2': 2.15, 'column_3': '123', 'column_4': '2017-01-01'}

In [27]:
# INSERT ... SELECT: both the insert target and the table read by the SELECT are reported
parser = Parser(
    "INSERT INTO `db.table` select (9, 2.15, '123', '2017-01-01') from dual;"
)

parser.tables
# ['db.table', 'dual']

['db.table', 'dual']

## Extracting limit and offset

In [28]:
# the result is a (limit, offset) tuple
Parser('SELECT foo_limit FROM bar_offset LIMIT 50 OFFSET 1000').limit_and_offset
# (50, 1000)

# in the comma form `limit 2000,50` the first number is the offset and the second the limit
Parser('SELECT foo_limit FROM bar_offset limit 2000,50').limit_and_offset
# (50, 2000)

(50, 1000)

(50, 2000)

## Extracting `WITH` statement names and queries

In [29]:
parser = Parser(
    """
WITH
    database1.tableFromWith AS (SELECT aa.* FROM db2.table3 as aa 
                                left join table4 on aa.col1=table4.col2),
    test as (SELECT * from db3.table3)
SELECT
  "xxxxx"
FROM
  catalog.database1.tableFromWith alias
LEFT JOIN database2.table2 ON ("tt"."ttt"."fff" = "xx"."xxx")
"""
)

# get names/ aliases of with statements
parser.with_names
# ["database1.tableFromWith", "test"]

# get definition of with queries (whitespace is normalized in the returned text)
parser.with_queries
# {"database1.tableFromWith": "SELECT aa.* FROM db2.table3 as aa left join table4 on aa.col1 = table4.col2",
#  "test": "SELECT * from db3.table3"}

# tables read inside the with queries are reported with their database prefix.
# NOTE(review): the recorded output also lists "catalog.database1.tableFromWith" -
# presumably the extra catalog prefix stops it from matching the with name
# "database1.tableFromWith"; confirm against the library before relying on
# with names being excluded from tables.
parser.tables
# ["db2.table3", "table4", "db3.table3", "catalog.database1.tableFromWith", "database2.table2"]

['database1.tableFromWith', 'test']

{'database1.tableFromWith': 'SELECT aa.* FROM db2.table3 as aa left join table4 on aa.col1 = table4.col2',
 'test': 'SELECT * from db3.table3'}

['db2.table3',
 'table4',
 'db3.table3',
 'catalog.database1.tableFromWith',
 'database2.table2']

## Extracting sub-queries

In [30]:
parser = Parser(
"""
SELECT COUNT(1) FROM
(SELECT std.task_id FROM some_task_detail std WHERE std.STATUS = 1) a
JOIN (SELECT st.task_id FROM some_task st WHERE task_type_id = 80) b
ON a.task_id = b.task_id;
"""
)
# real tables read by the sub-queries (the sub-query aliases a and b are not listed)
parser.tables
# ["some_task_detail", "some_task"]

# get sub-queries dictionary (alias -> sub-query text)
parser.subqueries
# {"a": "SELECT std.task_id FROM some_task_detail std WHERE std.STATUS = 1",
#  "b": "SELECT st.task_id FROM some_task st WHERE task_type_id = 80"}


# get names/ aliases of sub-queries / derived tables
parser.subqueries_names
# ["a", "b"]

# note that columns coming from sub-queries are resolved to real columns
parser.columns
# ["some_task_detail.task_id", "some_task_detail.STATUS", "some_task.task_id",
#  "task_type_id"]

# same applies for columns_dict, note the join columns are resolved
parser.columns_dict
# {'select': ['some_task_detail.task_id', 'some_task.task_id'],
#  'where': ['some_task_detail.STATUS', 'task_type_id'],
#  'join': ['some_task_detail.task_id', 'some_task.task_id']}

['some_task_detail', 'some_task']

{'a': 'SELECT std.task_id FROM some_task_detail std WHERE std.STATUS = 1',
 'b': 'SELECT st.task_id FROM some_task st WHERE task_type_id = 80'}

['a', 'b']

['some_task_detail.task_id',
 'some_task_detail.STATUS',
 'some_task.task_id',
 'task_type_id']

{'select': ['some_task_detail.task_id', 'some_task.task_id'],
 'where': ['some_task_detail.STATUS', 'task_type_id'],
 'join': ['some_task_detail.task_id', 'some_task.task_id']}

## Queries normalization and comments extraction

In [31]:
parser = Parser('SELECT /* Test */ foo FROM bar WHERE id in (1, 2, 56)')

# table read by the query
parser.tables
# ['bar']

# generalize query: the comment is dropped and the list of literal ids becomes XYZ
parser.generalize
# 'SELECT foo FROM bar WHERE id in (XYZ)'

# remove comments (literal values are kept)
parser.without_comments
# 'SELECT foo FROM bar WHERE id in (1, 2, 56)'

# extract comments
parser.comments
# ['/* Test */']

['bar']

'SELECT foo FROM bar WHERE id in (XYZ)'

'SELECT foo FROM bar WHERE id in (1, 2, 56)'

['/* Test */']