In [None]:
import sqlparse
import pandas as pd
# Smoke test: have sqlparse split a string that holds a statement with a
# nested subselect.
sqlparse.split('select * from (select * from bar);')

In [36]:
import sqlparse
from sqlparse.sql import IdentifierList, Identifier
from sqlparse.tokens import Keyword, DML


def is_subselect(parsed):
    """Report whether ``parsed`` is a token group that directly holds a SELECT.

    Only the immediate children in ``parsed.tokens`` are examined; nested
    groups are not searched.

    :param parsed: token object exposing ``is_group`` and, for groups,
        ``tokens`` whose items carry ``ttype`` and ``value``.
    :returns: ``True`` when ``parsed`` is a group and one of its direct
        children is a DML token spelling ``SELECT`` (case-insensitive),
        ``False`` otherwise.
    """
    if parsed.is_group:
        return any(
            child.ttype is DML and child.value.upper() == 'SELECT'
            for child in parsed.tokens
        )
    return False


def extract_from_part(parsed):
    """Yield the tokens that follow the first FROM keyword in ``parsed``.

    Tokens before FROM (and the FROM token itself) are skipped. After FROM,
    every token is yielded as-is until a token whose ``ttype`` is exactly
    ``Keyword`` is met, at which point the generator finishes. A token for
    which ``is_subselect`` is true is not yielded itself; instead this
    function recurses into it and yields the tokens of its own FROM part.

    :param parsed: token group exposing ``tokens``; each token carries
        ``ttype`` and ``value``.
    :returns: generator of tokens (whitespace and punctuation included).
    """
    from_seen = False
    for item in parsed.tokens:
        if not from_seen:
            # Still in the part before FROM: only look for the keyword.
            if item.ttype is Keyword and item.value.upper() == 'FROM':
                from_seen = True
            continue
        if is_subselect(item):
            # Kept as an explicit loop (not ``yield from``) so the cell
            # still runs on Python 2.
            for x in extract_from_part(item):
                yield x
        elif item.ttype is Keyword:
            # End the generator with ``return``. Raising StopIteration
            # inside a generator is turned into RuntimeError by PEP 479
            # (Python 3.7+); ``return`` behaves the same on Python 2 and 3.
            return
        else:
            yield item


def extract_table_identifiers(token_stream):
    """Yield a name for each identifier-like token in ``token_stream``.

    * ``IdentifierList`` tokens contribute ``get_name()`` of every entry
      returned by ``get_identifiers()``.
    * ``Identifier`` tokens contribute their own ``get_name()``.
    * Tokens whose ``ttype`` is ``Keyword`` contribute their raw ``value``.
    * Anything else is ignored.

    :param token_stream: iterable of tokens.
    :returns: generator of names, in stream order.
    """
    for token in token_stream:
        if isinstance(token, IdentifierList):
            names = (entry.get_name() for entry in token.get_identifiers())
        elif isinstance(token, Identifier):
            names = (token.get_name(),)
        elif token.ttype is Keyword:
            # Accepting Keyword here is really a bug, but in the example
            # above some table names are identified as keywords...
            names = (token.value,)
        else:
            continue
        for name in names:
            yield name


def extract_tables(sql):
    """Return the names found in the FROM part of ``sql``.

    Only the first statement produced by ``sqlparse.parse`` is inspected.

    :param sql: SQL text to analyse.
    :returns: list of names as produced by ``extract_table_identifiers``
        over ``extract_from_part``; an empty list when ``sqlparse.parse``
        yields no statement at all (e.g. for an empty string).
    """
    statements = sqlparse.parse(sql)
    if not statements:
        # Nothing was parsed; indexing [0] would raise IndexError.
        return []
    stream = extract_from_part(statements[0])
    return list(extract_table_identifiers(stream))


In [4]:
# Load the slow-log export. error_bad_lines=False makes pandas skip rows with
# an unexpected number of fields (see the "Skipping line ..." messages in the
# output) instead of raising.
# NOTE(review): error_bad_lines was deprecated in pandas 1.3 in favour of
# on_bad_lines -- revisit this call when upgrading pandas.
sql_text = pd.read_csv('slow_log_sql_text.csv',  error_bad_lines=False)

Skipping line 464: expected 12 fields, saw 62
Skipping line 679: expected 12 fields, saw 42
Skipping line 2621: expected 12 fields, saw 22
Skipping line 5351: expected 12 fields, saw 32
Skipping line 10460: expected 12 fields, saw 22
Skipping line 10461: expected 12 fields, saw 32



In [67]:
# Pick the query text stored under index label 10 for manual inspection.
x = sql_text.sql_text.loc[10]

In [68]:
# Show the selected query text.
x

"SELECT  num_trials, setup_email, round(setup_email / num_trials * 100, 2) as setup_email_perc,   invited_agents, round(invited_agents / num_trials * 100, 2) as invited_agents_perc,   num_trials_gsuite, setup_email_gsuite, round(setup_email_gsuite / num_trials_gsuite * 100, 2) as setup_email_gsuite_perc,   invited_agents_gsuite, round(invited_agents_gsuite / num_trials_gsuite * 100, 2) as invited_agents_gsuite_perc FROM   (SELECT COUNT(*) as num_trials   FROM redshift_account r   JOIN brand_routes b USING (run_at, account_id)   JOIN accounts_settings s USING (run_at, account_id)   WHERE r.data_collection_status = 'Current'     AND r.derived_account_type = 'Trial'     AND b.gam_domain IS NULL     AND s.has_google_apps_admin = 0   ) AS t,    (SELECT COUNT(*) as setup_email   FROM redshift_account r   JOIN brand_routes b USING (run_at, account_id)   JOIN accounts_settings s USING (run_at, account_id)   JOIN support_addresses e USING (run_at, account_id)   WHERE r.data_collection_status = 

In [62]:
sql_text['parsed'] = ""
# Parse an explicit sample instead of looping over `x`: in notebook order `x`
# is a single SQL string (row 10), so `for ... in x` would walk it character
# by character. The recorded output lists 20 statements, i.e. the first 20
# rows of the sql_text column.
for query in sql_text.sql_text.head(20):
    # print(...) with one argument prints the same on Python 2 and Python 3.
    print(sqlparse.parse(query))


(<Statement 'select...' at 0x106FC45D0>,)
(<Statement 'select...' at 0x106FC46D0>,)
(<Statement 'SELECT...' at 0x1090593D0>,)
(<Statement 'SELECT...' at 0x109069850>,)
(<Statement 'SELECT...' at 0x109069950>,)
(<Statement 'select...' at 0x109069A50>,)
(<Statement 'select...' at 0x10905F750>,)
(<Statement 'SELECT...' at 0x109069A50>,)
(<Statement 'SELECT...' at 0x109059650>,)
(<Statement 'SELECT...' at 0x109059250>,)
(<Statement 'SELECT...' at 0x10905EBD0>,)
(<Statement 'select...' at 0x109061250>,)
(<Statement 'SELECT...' at 0x10A553650>,)
(<Statement 'Select...' at 0x10A55AAD0>,)
(<Statement 'Select...' at 0x10A55ACD0>,)
(<Statement 'Select...' at 0x10A55AED0>,)
(<Statement 'select...' at 0x10A561150>,)
(<Statement 'Select...' at 0x10A56A1D0>,)
(<Statement 'select...' at 0x10A56A3D0>,)
(<Statement 'Select...' at 0x10A571250>,)


In [63]:
# Run the table extraction over an explicit sample instead of `x`: in notebook
# order `x` is a single SQL string (row 10), so iterating it would yield
# characters. The recorded output has 20 result lines, i.e. the first 20 rows
# of the sql_text column.
# The per-iteration `if __name__ == '__main__'` guard was dropped: it is
# always true inside a notebook kernel.
for sql in sql_text.sql_text.head(20):
    tables = ', '.join(extract_tables(sql))
    print('Tables: {0}'.format(tables))

Tables: active_agents
Tables: accounts
Tables: hc_search_metrics
Tables: `redshift_account`
Tables: `daily_push_to_sfdc`
Tables: ai
Tables: jig_metrics
Tables: r
Tables: r
Tables: r
Tables: t, s, i, t_gsuite, s_gsuite, i_gsuite
Tables: ai
Tables: r
Tables: tickets
Tables: accounts
Tables: accounts
Tables: 
Tables: tickets
Tables: r
Tables: roles
