In [None]:
import pandas as pd

!pip install sqlparse --user --upgrade
!pip install sqlglot --user --upgrade

import sqlparse

import sqlglot
import sqlglot.expressions as exp
from sqlglot import parse_one
from sqlglot.optimizer import optimize
from sqlglot import optimizer
from sqlglot.errors import OptimizeError
from sqlglot import lineage

In [None]:
#helper functions
#find all alias from cte and related table and schema
def obtain_table_alias_type(sql):
    
    for alias in parse_one(sql, dialect="redshift").find_all(exp.TableAlias):
        
        print(f"alias => {alias.this.this} | alias_table_type => {type(alias.parent_select)}" )
        
        break
        
#find schema by name
def find_schema_by_table_name(table_name):
    
    for table in parse_one(sql, dialect="redshift").find_all(exp.Table):
        
        if (table_name == table.name):
            return str(table.args['db'])
            
def obtain_list_column_table(sql):
    
    column_l = []
    table_l = []
    alias_l = []
    
    for column in parse_one(sql, dialect="redshift").find_all(exp.Column):
        
        column_l.append(column.name)
        table_l.append(column.table)
        #print(column.key)
        #print(f"Column => {column.name} | DB => {column.table}" )
    
    #in case none of the field has table information, the only table in the sql will be the source table
    if(all(elem == '' for elem in table_l)):
        
        tablename = ''
        
        for table in parse_one(sql, dialect="redshift").find_all(exp.Table):
            
            if(table.name != ''):
                tablename = table.name
    
        for n in range(len(table_l)):
        
            table_l[n] = tablename
    
    return column_l, table_l

#find all alias from cte and related table and schema
def obtain_list_table_alias(sql):
    
    unalias_name = []
    alias_l = []
    related_table_l = []
    alias_type_ = []
    schema_l = []
    
    for alias in parse_one(sql, dialect="redshift").find_all(exp.TableAlias):
        
        table_t = []
        #if it is cte
        if (alias.parent.name == ''):
            alias_type_.append(type(alias.parent.args['this']))
            alias_l.append(alias.this.this)
            column_t, table_t = obtain_list_column_table(str(alias.parent))
            related_table_l.append(list(dict.fromkeys(table_t)))
            unalias_name.append(alias.parent.name)
            
            schema = []
            
            for table in list(dict.fromkeys(table_t)):
                
                schema.append(find_schema_by_table_name(table))
                
            schema_l.append(schema)
            
        #if it is normal table
        else:
            alias_type_.append('')
            related_table_l.append('')
            alias_l.append(alias.this.this)
            unalias_name.append(alias.parent.name)
            
            schema_l.append(find_schema_by_table_name(alias.parent.name))
        
        #print(f"Column => {alias_l} | DB => {related_table_l}" )
        
    return alias_l, related_table_l, alias_type_, unalias_name, schema_l

def find_column_originated(sql):

    column_l = []
    table_l = []
    originated_l = []
    
    group_column_l = []
    group_table_l = []
    group_originated_l = []
    #print(sql)
    
    #need to use parse instead of parse_one
    for expression in sqlglot.parse(sql):
        #print(expression.args)
        
        keysList = list(expression.args.keys())
        #print(keysList)
    
        for key in keysList:
            if expression.args[key] != None:
                
                #find all expression/join key
                if type(expression.args[key]) is list:
                    for objecto in expression.args[key]:
                        for column in objecto.find_all(exp.Column):
                            #print(f"Column => {column.name} | type => {key}" )
                            column_l.append(column.name)
                            table_l.append(column.table)
                            originated_l.append(key)
                else:
                    #list all group by items
                    if key == 'group':
                        for groupby_clause in expression.args[key].expressions:
                            if (type(groupby_clause)==sqlglot.expressions.Column):
                                group_column_l.append(groupby_clause.this)
                                group_table_l.append(groupby_clause.table)
                            else:
                                group_column_l.append(groupby_clause)
                                group_table_l.append(None)
                            group_originated_l.append('group by')
    
    #combine into one single dataframe
    dict = {'output_column': column_l, 'table': table_l, 'action': originated_l} 
    df_relationship = pd.DataFrame(dict)
    dict = {'output_column': group_column_l, 'table': group_table_l, 'action': group_originated_l}
    df_group = pd.DataFrame(dict)
    df_relationship = df_relationship.append(df_group)
    
    return df_relationship

#just in case if there is union
def deep_find_column_originated(sql, table_name):
    
    sqlglot_ = parse_one(sql)
    if (type(sqlglot_)==sqlglot.expressions.Union):
        df_union_1 = find_column_originated(str(sqlglot_.args['this']))
        df_union_2 = find_column_originated(str(sqlglot_.args['expression']))
        
        df_union_1['union'] = ' left union'
        df_union_2['union'] = ' right union'
        
        df_ = pd.concat([df_union_1, df_union_2], ignore_index=True)
        
    else:
        df_ = find_column_originated(sql)
        df_['union'] = None
        
    df_['destined_table'] = table_name
    return df_ 


In [3]:
with open('stored procedure/sp_otp_po_cut_level.txt', 'r') as f:
    #remove illegal words
    text = f.read().replace('~', '!=')
    text = text.replace('#', '')
    text = text.strip()
    lines = text.split(';')

In [4]:
sql_2_b_processed = []
fail_2_processed = []
stored_procedure_called = []

In [5]:
for sql in lines:
    
    if ('insert' in sql.lower() or 'update' in sql.lower()): #grab all building block to build the database
        
        sql_2_b_processed.append(sql)
        
    elif ('call' in sql.lower()): #to grap all procedure called within procedure
        
        stored_procedure_called.append(sql)
        
    else: 
    
        fail_2_processed.append(sql)
        
len(sql_2_b_processed)

53

In [6]:
sql = '''

insert into qma_datamart.otp_po_cut_level_wow_snapshots(snapshot_created_date,
                                                        snapshot_name,
                                                        country_of_origin,
                                                        po_cut,
                                                        style,
                                                        color,
                                                        style_description,
                                                        goods_description,
                                                        po_issue_date,
                                                        original_crd_at_origin,
                                                        revised_crd_at_origin,
                                                        actual_crd_at_origin,
                                                        local_currency,
                                                        unit_price,
                                                        season,
                                                        sourcing_office,
                                                        source_system,
                                                        dc_code,
                                                        po_type,
                                                        purchasing_group,
                                                        sbu,
                                                        sub_sbu,
                                                        product_line,
                                                        purchasing_company,
                                                        vendor_ffc,
                                                        vendor_group_name,
                                                        factory_ffc,
                                                        delay_reason,
                                                        shipment_terms,
                                                        po_location,
                                                        destination_country,
                                                        vendor_name,
                                                        factory_name,
                                                        report_order_qty_lum,
                                                        order_amount_local_currency,
                                                        shipped_qty_lum,
                                                        shipped_amt_local_currency,
                                                        booked_qty_lum,
                                                        booked_amt_local_currency,
                                                        effective_crd,
                                                        effective_qty,
                                                        order_type_calculated,
                                                        qty_per_pack,
                                                        issample,
                                                        iscustomorder,
                                                        managing_office,
                                                        brand,
                                                        major_product_category_name,
                                                        business_unit,
                                                        brand2,
                                                        market,
                                                        order_amount_usd,
                                                        shipped_amt_usd,
                                                        booked_amt_usd,
                                                        effective_amt_usd,
                                                        exchange_rate,
                                                        product_supply_group,
                                                        current_date,
                                                        src_sys,
                                                        table_names,
                                                        effective_crd_year,
                                                        effective_crd_month,
                                                        misc3,
                                                        misc26,
                                                        misc33,
                                                        purchasing_company_code,
                                                        days_late,
                                                        exch_rate_date,
                                                        od_misc_flag,
                                                        tbr_defect_flag,
                                                        po_creation_date,
                                                        balance_qty,
                                                        shipment_id,
                                                        fr_release,
                                                        factory_designation,
                                                        hts_code,
                                                        dc_name,
                                                        erp_factory_code,
                                                        erp_vendor_code,
                                                        revised_in_dc_date,
                                                        ship_mode,
                                                        po_acknowledgement_date,
                                                        po_complete_status,
                                                        po_season,
                                                        shipment_id_closing_date,
                                                        actual_ship_date,
                                                        first_actual_crd,
                                                        first_shipment_id,
                                                        first_actual_ship_date,
                                                        shipment_status,
                                                        greenlight_date,
                                                        hts_product_category,
                                                        hts_product_type,
                                                        crc_code,
                                                        crc_description,
                                                        costing_season,
                                                        master_po,
                                                        orders_to_be_produced_qty,
                                                        orders_to_be_produced_amt_usd,
                                                        actual_produced_qty,
                                                        actual_produced_amt_usd,
                                                        freight_paid_by,
                                                        exit_cnty_port,
                                                        ship_to_name,
                                                        original_requested_production_end_date,
                                                        latest_confirmed_production_end_date,
                                                        vf_est_dc_arrival_date,
                                                        brand_requested_in_dc_date,
                                                        buy_month,
                                                        unit_price_average_in_usd,
                                                        container_number,
                                                        shipment_closed_by,
                                                        requested_delivery_date,
                                                        po_closure_eligibility,
                                                        discharge_at_port_of_destination,
                                                        crc_owner,
                                                        asn_released_to_3pl,
                                                        revised_production_end_date,
                                                        original_brands_requested_crd,
                                                        certified_pocut_flag,
                                                        fob_price,
                                                        fob_duties_rate,
                                                        material_in_house_planned_date,
                                                        material_in_house_revised_plan_date,
                                                        material_in_house_actual_date,
                                                        cutting_planned_date,
                                                        cutting_revised_planned_date,
                                                        cutting_actual_date,
                                                        sewing_stitching_planned_date,
                                                        sewing_stitching_revised_plan_date,
                                                        sewing_stitching_actual_date,
                                                        finishing_assembly_planned_date,
                                                        finishing_assembly_revised_plan_date,
                                                        finishing_assembly_actual_date,
                                                        production_priority_flag,
                                                        order_collaboration_line_status_latest,
                                                        plan_to_ship_qty_lum,
                                                        packing_list_qty,
                                                        split_from_po
                                                        ,material_in_house_qty
                                                        ,cutting_qty
                                                        ,sewing_stitching_qty
                                                        ,finishing_assembly_qty
														,payment_Terms
														,Capacity_Group
														,Marketing_Program
														,certprintdate
														,SpeedProgram
														,ProductDevelopmentType
														,GlobalBrandArchitecture
														,VF_Fiscal_Year_Original_crd
														,VF_Fiscal_Year_Month_Original_crd
														,season_cleaned
														,fob_price_usd
														,pl_uom
                                                        ,asn_uom
                                                        ,preferential_duty_rate
                                                        ,standard_duty_rate
														,freight_cost
														,applied_duty_rate
														,calculated_duty_amt
														,fully_landed_cost
														,cbm_pc_brand_cat
														,standard_duty_specific_rate
														,mdg_vendor_created_on
														,mdg_vendor_purchasing_block
														,mdg_factory_created_on
														,mdg_factory_purchasing_block
														,first_confirmed_crc
														)
                                                 select cast(convert_timezone('Asia/Shanghai', getdate()) as date) as snapshot_created_date,
                                                        replace(cast(cast(convert_timezone('Asia/Shanghai', getdate()) as date) as varchar(20)),'-','')||' Snapshot' as snapshot_name,
                                                        otp.country_of_origin,
                                                        otp.po_cut,
                                                        otp.style,
                                                        otp.color,
                                                        otp.style_description,
                                                        otp.goods_description,
                                                        otp.po_issue_date,
                                                        otp.original_crd_at_origin,
                                                        otp.revised_crd_at_origin,
                                                        otp.actual_crd_at_origin,
                                                        otp.local_currency,
                                                        otp.unit_price,
                                                        otp.season,
                                                        otp.sourcing_office,
                                                        otp.source_system,
                                                        otp.dc_code,
                                                        otp.po_type,
                                                        otp.purchasing_group,
                                                        otp.sbu,
                                                        otp.sub_sbu,
                                                        otp.product_line,
                                                        otp.purchasing_company,
                                                        otp.vendor_ffc,
                                                        otp.vendor_group_name,
                                                        otp.factory_ffc,
                                                        otp.delay_reason,
                                                        otp.shipment_terms,
                                                        otp.po_location,
                                                        otp.destination_country,
                                                        otp.vendor_name,
                                                        otp.factory_name,
                                                        otp.report_order_qty_lum,
                                                        otp.order_amount_local_currency,
                                                        otp.shipped_qty_lum,
                                                        otp.shipped_amt_local_currency,
                                                        otp.booked_qty_lum,
                                                        otp.booked_amt_local_currency,
                                                        otp.effective_crd,
                                                        otp.effective_qty,
                                                        otp.order_type_calculated,
                                                        otp.qty_per_pack,
                                                        otp.issample,
                                                        otp.iscustomorder,
                                                        otp.managing_office,
                                                        otp.brand,
                                                        otp.major_product_category_name,
                                                        otp.business_unit,
                                                        otp.brand2,
                                                        otp.market,
                                                        otp.order_amount_usd,
                                                        otp.shipped_amt_usd,
                                                        otp.booked_amt_usd,
                                                        otp.effective_amt_usd,
                                                        otp.exchange_rate,
                                                        otp.product_supply_group,
                                                        otp.current_date,
                                                        otp.src_sys,
                                                        otp.table_names,
                                                        otp.effective_crd_year,
                                                        otp.effective_crd_month,
                                                        otp.misc3,
                                                        otp.misc26,
                                                        otp.misc33,
                                                        otp.purchasing_company_code,
                                                        otp.days_late,
                                                        otp.exch_rate_date,
                                                        otp.od_misc_flag,
                                                        otp.tbr_defect_flag,
                                                        otp.po_creation_date,
                                                        otp.balance_qty,
                                                        otp.shipment_id,
                                                        otp.fr_release,
                                                        otp.factory_designation,
                                                        otp.hts_code,
                                                        otp.dc_name,
                                                        otp.erp_factory_code,
                                                        otp.erp_vendor_code,
                                                        otp.revised_in_dc_date,
                                                        otp.ship_mode,
                                                        otp.po_acknowledgement_date,
                                                        otp.po_complete_status,
                                                        otp.po_season,
                                                        otp.shipment_id_closing_date,
                                                        otp.actual_ship_date,
                                                        otp.first_actual_crd,
                                                        otp.first_shipment_id,
                                                        otp.first_actual_ship_date,
                                                        otp.shipment_status,
                                                        otp.greenlight_date,
                                                        otp.hts_product_category,
                                                        otp.hts_product_type,
                                                        otp.crc_code,
                                                        otp.crc_description,
                                                        otp.costing_season,
                                                        otp.master_po,
                                                        otp.orders_to_be_produced_qty,
                                                        otp.orders_to_be_produced_amt_usd,
                                                        otp.actual_produced_qty,
                                                        otp.actual_produced_amt_usd,
                                                        otp.freight_paid_by,
                                                        otp.exit_cnty_port,
                                                        otp.ship_to_name,
                                                        otp.original_requested_production_end_date,
                                                        otp.latest_confirmed_production_end_date,
                                                        otp.vf_est_dc_arrival_date,
                                                        otp.brand_requested_in_dc_date,
                                                        otp.buy_month,
                                                        otp.unit_price_average_in_usd,
                                                        otp.container_number,
                                                        otp.shipment_closed_by,
                                                        otp.requested_delivery_date,
                                                        otp.po_closure_eligibility,
                                                        otp.discharge_at_port_of_destination,
                                                        otp.crc_owner,
                                                        otp.asn_released_to_3pl,
                                                        otp.revised_production_end_date,
                                                        otp.original_brands_requested_crd,
                                                        otp.certified_pocut_flag,
                                                        otp.fob_price,
                                                        otp.fob_duties_rate,
                                                        otp.material_in_house_planned_date,
                                                        otp.material_in_house_revised_plan_date,
                                                        otp.material_in_house_actual_date,
                                                        otp.cutting_planned_date,
                                                        otp.cutting_revised_planned_date,
                                                        otp.cutting_actual_date,
                                                        otp.sewing_stitching_planned_date,
                                                        otp.sewing_stitching_revised_plan_date,
                                                        otp.sewing_stitching_actual_date,
                                                        otp.finishing_assembly_planned_date,
                                                        otp.finishing_assembly_revised_plan_date,
                                                        otp.finishing_assembly_actual_date,
                                                        otp.production_priority_flag,
                                                        otp.order_collaboration_line_status_latest,
                                                        otp.plan_to_ship_qty_lum,
                                                        otp.packing_list_qty,
                                                        otp.split_from_po,
                                                        otp.material_in_house_qty,
                                                        otp.cutting_qty,
                                                        otp.sewing_stitching_qty,
                                                        otp.finishing_assembly_qty,
														otp.payment_Terms,
														otp.Capacity_Group,
														otp.Marketing_Program,
														otp.certprintdate
														,otp.SpeedProgram
														,otp.ProductDevelopmentType
														,otp.GlobalBrandArchitecture
														,otp.VF_Fiscal_Year_Original_crd
														,otp.VF_Fiscal_Year_Month_Original_crd
														,otp.season_cleaned
														,otp.fob_price_usd
														,otp.pl_uom
                                                        ,otp.asn_uom
                                                        ,otp.preferential_duty_rate
                                                        ,otp.standard_duty_rate
														,otp.freight_cost
														,otp.applied_duty_rate
														,otp.calculated_duty_amt
														,otp.fully_landed_cost
														,otp.cbm_pc_brand_cat
														,otp.standard_duty_specific_rate
														,otp.mdg_vendor_created_on
														,otp.mdg_vendor_purchasing_block
														,otp.mdg_factory_created_on
														,otp.mdg_factory_purchasing_block
														,otp.first_confirmed_crc
														from qma_datamart.otp_po_cut_Level otp
                                                   where
                                                   case when cast(convert_timezone('Asia/Shanghai', getdate()) as date) >= cast(date_part_year(cast(convert_timezone('Asia/Shanghai', getdate()) as date))||'-04-01' as date)
                                                        then original_crd_at_origin >= cast(date_part_year(cast(convert_timezone('Asia/Shanghai', getdate()) as date))-1||'-10-01' as date) and original_crd_at_origin <= cast(date_part_year(cast(convert_timezone('Asia/Shanghai', getdate()) as date))+1||'-09-30' as date)
                                                        when cast(convert_timezone('Asia/Shanghai', getdate()) as date) < cast(date_part_year(cast(convert_timezone('Asia/Shanghai', getdate()) as date))||'-04-01' as date)
                                                        then original_crd_at_origin >= cast(date_part_year(cast(convert_timezone('Asia/Shanghai', getdate()) as date))-2||'-10-01' as date) and original_crd_at_origin <= cast(date_part_year(cast(convert_timezone('Asia/Shanghai', getdate()) as date))||'-09-30' as date)
                                                   end;
'''

In [7]:
sql = sqlparse.format(sql, strip_comments=True).strip()

print(sql)

insert into qma_datamart.otp_po_cut_level_wow_snapshots(snapshot_created_date,
                                                        snapshot_name,
                                                        country_of_origin,
                                                        po_cut,
                                                        style,
                                                        color,
                                                        style_description,
                                                        goods_description,
                                                        po_issue_date,
                                                        original_crd_at_origin,
                                                        revised_crd_at_origin,
                                                        actual_crd_at_origin,
                                                        local_currency,
                                                       

In [8]:
#prepocessing
#change update and insert to respective select statement
#addtionally if it is insert: prepare list of output table columns

#sql = sql_2_b_processed[0]

statement = sqlparse.parse(sql)[0]
print(f"statement: {statement}")


token = ''

if(statement.get_type() == "INSERT"):
    
    token = 'insert'
    
    for expression in sqlglot.parse(sql):
        print(expression)
        table_name = str(expression.args['this'].this)
        columns = []
        for column in expression.args['this'].expressions:
            print(column)
            columns.append(str(column))
        expression_ = str(expression.expression)
    
    #recompose the list of column into a single string: REAL Columns for the datamart
    print(columns)
    insert_columns = ', '.join(columns)[:-2]
    
    sql = expression_
    #print(sql)
    
elif(statement.get_type() == "UPDATE"):
    
    token = 'update'
    
    for expression in sqlglot.parse(sql):
        table_name = str(expression.args['this'])
        field_value_name = expression.expressions
    
        fieldname = []
        value = []
    
        for pair in field_value_name:
            fieldname.append(str(pair.this))
            value.append(str(pair.expression))
        
        from_ = str(expression.args['from'])
        where = str(expression.args['where'])

        print(from_)
        
        sql_ = ''
    
        for value_, fieldname_ in zip(value, fieldname):
    
            sql_ = sql_ + ' ' + value_ + ' AS ' + fieldname_ + ','
    
        sql_ = sql_[:-1]

        sql_ = 'SELECT ' + sql_ + ' FROM ' + table_name + ' JOIN ' + from_.replace('FROM', ' ', 1) + ' ON (' + where.replace('WHERE', ' ', 1) + ')' 
        sql = sql_
    
        update_column = fieldname
        all_column = update_column

elif(statement.get_type() == "SELECT"):
        
        token = 'select'
        
        all_column = []
        
        for expression in sqlglot.parse(sql):
            table_name = str(expression.args['from'].this)
        
        for expression in sqlglot.parse_one(sql):
            print(expression)
            all_column.append(expression.args['this'].this)
        
        
        sql = sql
        
elif(statement.get_type() == "CREATE"):
    
    token = 'create'
    
    for expression in sqlglot.parse(sql):
        table_name = str(expression.args['this'].this)
        print(table_name)
        columns = []
        for column in expression.args['this'].expressions:
            columns.append(column.this)
        expression_ = str(expression.expression)
        
        print(expression_)
        
else:
    print('no action.')
    
#print(sql)

statement: insert into qma_datamart.otp_po_cut_level_wow_snapshots(snapshot_created_date,
                                                        snapshot_name,
                                                        country_of_origin,
                                                        po_cut,
                                                        style,
                                                        color,
                                                        style_description,
                                                        goods_description,
                                                        po_issue_date,
                                                        original_crd_at_origin,
                                                        revised_crd_at_origin,
                                                        actual_crd_at_origin,
                                                        local_currency,
                                            

In [9]:
for expression in sqlglot.parse(sql):
        print(expression.args)
        table_name = str(expression.args['this'])
        field_value_name = expression.expressions
        
        print(f"table name :{table_name}")
        print(f"field_value_name : {field_value_name}")
        
        fieldname = []
        value = []
    
        for pair in field_value_name:
            fieldname.append(str(pair.this))
            value.append(str(pair.expression))
        
        from_ = str(expression.args['from'])
        where = str(expression.args['where'])

        print(f"from :{from_}")
        print(f"where :{where}")
        
        sql_ = ''
        
        for value_, fieldname_ in zip(value, fieldname):
    
            sql_ = sql_ + ' ' + value_ + ' AS ' + fieldname_ + ','
    
        print(f"sql : {sql_}")
        
        sql_ = sql_[:-1]

        sql_ = 'SELECT ' + sql_ + ' FROM ' + table_name + ' JOIN ' + from_.replace('FROM', ' ', 1) + ' ON (' + where.replace('WHERE', ' ', 1) + ')' 
        sql = sql_
    
        update_column = fieldname
        all_column = update_column


{'kind': None, 'hint': None, 'distinct': None, 'expressions': [Alias(
  this=Cast(
    this=AtTimeZone(
      this=Anonymous(
        this=GETDATE),
      zone=Literal(this=Asia/Shanghai, is_string=True)),
    to=DataType(this=Type.DATE, nested=False)),
  alias=Identifier(this=snapshot_created_date, quoted=False)), Alias(
  this=DPipe(
    this=Anonymous(
      this=REPLACE,
      expressions=[
        Cast(
          this=Cast(
            this=AtTimeZone(
              this=Anonymous(
                this=GETDATE),
              zone=Literal(this=Asia/Shanghai, is_string=True)),
            to=DataType(this=Type.DATE, nested=False)),
          to=DataType(
            this=Type.VARCHAR,
            expressions=[
              DataTypeParam(
                this=Literal(this=20, is_string=False))],
            nested=False)),
        Literal(this=-, is_string=True),
        Literal(this=, is_string=True)]),
    expression=Literal(this=Snapshot, is_string=True),
    safe=True),
  alias

KeyError: 'this'

In [None]:
all_column

In [None]:
#find the position of the column in the sql, whether it is in select/update/insert or join clause or group by
df_relation = deep_find_column_originated(sql, table_name)

for subquery in parse_one(sql).find_all(exp.Subquery):
    
    df_subquery = deep_find_column_originated(str(subquery.args['this']), subquery.alias)
    df_relation = pd.concat([df_relation, df_subquery], ignore_index=True)

df_relation

groupby_df = df_relation.loc[df_relation['action']=='group by']
column_df = df_relation.loc[df_relation['action']=='expressions']
join_df = df_relation.loc[df_relation['action']=='joins']

join_df

In [None]:
df_relation

In [None]:
column = 'current_date'

print(sql)

node_list = []
source_list = []
expression_list = []
column_list = []
alias_list = []
reference_list = []

for node in lineage.lineage(column, sql).walk():
    
    node_ = str(node.name)
    
    #print(node_)
    source = str(node.source)
    alias = str(node.expression.alias)
    expression = str(node.expression)
    #depth = expression.depth
    
    #full source
    node_list.append(column)
    source_list.append(source)
    alias_list.append(alias)
    expression_list.append(expression)
    column_list.append(node_)
    reference_list.append(node.reference_node_name)
    
# Convert to DataFrame
dict = {'node': node_list, 'output_node': column_list, 'alias': alias_list, 'reference node': reference_list, 'logic': expression_list, 'full source': source_list} 
   
df_i = pd.DataFrame(dict)
df_i.iloc[0, 3] = 'main'
df_i

In [None]:
logics = []

if (token == 'insert'):
    logics.append(df_i.iloc[0]['logic'])
    list_1 = df_i.loc[df_i['output_node'].str.isnumeric()]['logic'].to_list()
else:
    logics.append(df_i.iloc[0]['full source'])
    list_1 = df_i.loc[df_i['output_node'].str.isnumeric()]['full source'].to_list()

logics = logics + list_1
logics

In [None]:
column_f = []
table_f = []

for logic in logics:
    
    column_, table_ = obtain_list_column_table(logic)
    column_f = column_f + column_
    table_f = table_f + table_
    
dict = {'field_name': column_f, 'table_name': table_f} 
df_temp = pd.DataFrame(dict)
df_temp

table_list, table_component, logic_type, original_table, table_schema = obtain_list_table_alias(sql)

# Convert to DataFrame
dict = {'table_name': table_list, 'table component': table_component, 'logic': logic_type, 'original table': original_table, 'schema': table_schema} 
   
table_alias = pd.DataFrame(dict)
table_mapping = pd.merge(df_temp, table_alias, how='left', on='table_name')
table_mapping = table_mapping[['table_name', 'table component', 'logic', 'original table', 'schema']]
table_mapping = table_mapping[~table_mapping.astype(str).duplicated()]
table_mapping

In [None]:
#clean up
if (token=='insert'):
    df_i['column_l'], df_i['table_l'] = zip(*df_i['logic'].map(obtain_list_column_table))
    df_i.loc[df_i['logic']==df_i['full source'], 'column_l'] = ''
    df_i.loc[df_i['logic']==df_i['full source'], 'table_l'] = ''
elif (token=='update' or token=='select'):
    df_i['column_l'], df_i['table_l'] = zip(*df_i['full source'].map(obtain_list_column_table))
    df_i.loc[df_i['logic']==df_i['full source'], 'column_l'] = ''
    df_i.loc[df_i['logic']==df_i['full source'], 'table_l'] = ''
else:
    print('no action.')

#make a list of all component of a node 
main_df = df_i.loc[df_i['reference node']!='']
main_df = main_df.explode(['column_l', 'table_l'])
main_df['subnode'] = main_df['table_l'] + '.' + main_df['column_l']

#produce field dictionary
df_i.loc[df_i['output_node'].str.isnumeric(), 'output_node'] = df_i['reference node'] + '.' + df_i['alias']
df_field = df_i[['output_node', 'logic']].drop_duplicates(ignore_index=True)
df_field = df_field.rename(columns={"output_node": "field", "logic": "original field",}, errors="raise")

In [None]:
main_df

In [None]:
#final table cleaning
final_df = pd.merge(main_df, df_field, how='left', left_on='subnode', right_on='field')
final_df = final_df[['node', 'output_node', 'alias', 'reference node', 'logic', 'full source', 'table_l', 'column_l', 'original field']]

final_df = pd.merge(final_df, table_mapping, how='left', left_on='table_l', right_on='table_name')

final_df['original field'] = final_df['original field'].fillna('noSchema')
final_df.loc[:, 'temp_schema'] = final_df['original field'].map(lambda x: x.split('.')[0])
final_df.loc[final_df['table component'].isna(), 'schema'] = final_df['temp_schema']
final_df.loc[final_df['table component'].isna(), 'original table'] = final_df['table_name']

final_df = final_df[['node', 'reference node', 'alias', 'logic_x', 
          'full source', 'schema', 'table_l', 'table component', 'original table', 'column_l']].fillna('')

In [None]:
sql

In [None]:
def obtain_list_table_alias(sql):
    
    n = 0
    print(n)
    unalias_name = []
    alias_l = []
    related_table_l = []
    alias_type_ = []
    schema_l = []
    
    for alias in parse_one(sql, dialect="redshift").find_all(exp.TableAlias):
        
        print(alias)
        
        table_t = []
        #if it is cte
        if (alias.parent.name == ''):
            print('here')
            alias_type_.append(type(alias.parent.args['this']))
            print(alias_type_)
            alias_l.append(alias.this.this)
            print(alias_l)
            column_t, table_t = obtain_list_column_table(str(alias.parent))
            print(column_t)
            print(table_t)
            related_table_l.append(list(dict.fromkeys(table_t)))
            print(related_table_l)
            unalias_name.append(alias.parent.name)
            
            schema = []
            
            for table in list(dict.fromkeys(table_t)):
                
                schema.append(find_schema_by_table_name(table))
                
            schema_l.append(schema)
            
        #if it is normal table
        else:
            alias_type_.append('')
            related_table_l.append('')
            alias_l.append(alias.this.this)
            unalias_name.append(alias.parent.name)
            
            schema_l.append(find_schema_by_table_name(alias.parent.name))
        
        #print(f"Column => {alias_l} | DB => {related_table_l}" )
    
    n = n+1
    return alias_l, related_table_l, alias_type_, unalias_name, schema_l

def obtain_list_column_table(sql):
    
    column_l = []
    table_l = []
    alias_l = []
    
    temp_table = ''
        
    for column in parse_one(sql, dialect="redshift").find_all(exp.Column):
        
        column_l.append(column.name)
        table_l.append(column.table)
        
    if(all(elem == '' for elem in table_l)):
        
        for table in parse_one(str(alias.parent), dialect="redshift").find_all(exp.Table):
            if(table.name != ''):
                tablename = table.name
    
    for n in range(len(table_l)):
        
        table_l[n] = tablename
        #print(column.key)
        #print(f"Column => {column.name} | DB => {column.table}" )

    return column_l, table_l

In [None]:
table_list, table_component, logic_type, original_table, table_schema = obtain_list_table_alias(sql)

In [None]:
def get_query_columns(sql):
    stmt = sqlparse.parse(sql)[0]
    columns = []
    column_identifiers = []

    # get column_identifieres
    in_select = False
    for token in stmt.tokens:
        if isinstance(token, sqlparse.sql.Comment):
            continue
        if str(token).lower() == 'select':
            in_select = True
        elif in_select and token.ttype is None:
            for identifier in token.get_identifiers():
                column_identifiers.append(identifier)
            break

    # get column names
    for column_identifier in column_identifiers:
        columns.append(column_identifier.get_name())

    return columns

In [None]:
get_query_columns(sql)

In [None]:
for alias in parse_one(sql, dialect="redshift").find_all(exp.TableAlias):
    
    alias_type_ = []
    alias_l = []
    
    alias_type_.append(type(alias.parent.args['this']))
    print(alias_type_)
    alias_l.append(alias.this.this)
    print(alias_l)
    
    print(alias.parent)
    
    column_l = []
    table_l = []
    alias_l = []
    
    for column in parse_one(str(alias.parent), dialect="redshift").find_all(exp.Column):
        
        column_l.append(column.name)
        table_l.append(column.table)
        
    if(all(elem == '' for elem in table_l)):
        
        for table in parse_one(str(alias.parent), dialect="redshift").find_all(exp.Table):
            if(table.name != ''):
                tablename = table.name
    
    for n in range(table_l):
        
        table_l[n] = tablename
        