In [1]:
import pandas as pd

## Connect ot Graph Data Science

In [2]:
from graphdatascience import GraphDataScience

# Use Neo4j URI and credentials according to your setup
gds = GraphDataScience('neo4j://localhost', auth=('neo4j', 'neopharm'))

## Staging for ETL
1. Clear the graph of any existing data and indexes
2. read the source data
3. create helper functions
4. Set Neo4j Indexes


In [31]:
# Clear last graph - All data and schema attributes
gds.run_cypher('MATCH(n) DETACH DELETE n')
gds.run_cypher('CALL apoc.schema.assert({},{})')

Unnamed: 0,label,key,keys,unique,action
0,Table,tableId,[tableId],True,DROPPED
1,Schema,schema_Id,[schema_Id],True,DROPPED
2,Subquery,subqueryId,[subqueryId],True,DROPPED
3,StoredProcedure,SP_Id,[SP_Id],True,DROPPED
4,Destination_Field,destId,[destId],True,DROPPED
5,Intermediate_Field,intId,[intId],True,DROPPED
6,DestinationField,destfieldId,[destfieldId],True,DROPPED
7,IntermediateField,intfieldId,[intfieldId],True,DROPPED


In [4]:
# Create Indexes for table
gds.run_cypher('CREATE CONSTRAINT table_unique IF NOT EXISTS FOR (n:Table) REQUIRE n.tableId  IS UNIQUE')
gds.run_cypher('CREATE CONSTRAINT destination_field_unique IF NOT EXISTS FOR (n:Destination_Field) REQUIRE n.destId  IS UNIQUE')
gds.run_cypher('CREATE CONSTRAINT intermediate_field_unique IF NOT EXISTS FOR (n:Intermediate_Field) REQUIRE n.intId  IS UNIQUE')
gds.run_cypher('CREATE CONSTRAINT schema_unique IF NOT EXISTS FOR (n:Schema) REQUIRE n.schema_Id  IS UNIQUE')

gds.run_cypher('CREATE CONSTRAINT dest_field_unique IF NOT EXISTS FOR (n:DestinationField) REQUIRE n.destfieldId  IS UNIQUE')
gds.run_cypher('CREATE CONSTRAINT int_field_unique IF NOT EXISTS FOR (n:IntermediateField) REQUIRE n.intfieldId  IS UNIQUE')
gds.run_cypher('CREATE CONSTRAINT subquery_unique IF NOT EXISTS FOR (n:Subquery) REQUIRE n.subqueryId  IS UNIQUE')
gds.run_cypher('CREATE CONSTRAINT sp_unique IF NOT EXISTS FOR (n:StoredProcedure) REQUIRE n.SP_Id  IS UNIQUE')

In [29]:
df = pd.read_csv('destination_field_2_0.csv', dtype=str)
df

Unnamed: 0,table,fieldID,field_name,table.1,table_id,schema,Storedprocedured
0,otp_po_cut_level,df_001,country_of_origin,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
1,otp_po_cut_level,df_002,po_cut,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
2,otp_po_cut_level,df_003,style,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
3,otp_po_cut_level,df_004,color,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
4,otp_po_cut_level,df_005,style_description,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
...,...,...,...,...,...,...,...
130,otp_po_cut_level,df_131,mdg_vendor_purchasing_block,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
131,otp_po_cut_level,df_132,mdg_factory_created_on,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
132,otp_po_cut_level,df_133,mdg_factory_purchasing_block,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
133,otp_po_cut_level,df_134,fob_price,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level


In [30]:
sub_dict = df.loc[(df[f'fieldID'] != '?') & (df[f'field_name'] != '?') & (df[f'fieldID'].notna()),
                  ['fieldID', 'field_name', 'table', 'table_id', 'schema', 'Storedprocedured']].to_dict('records')
sub_dict

[{'fieldID': 'df_001',
  'field_name': 'country_of_origin',
  'table': 'otp_po_cut_level',
  'table_id': 'Table_01',
  'schema': 'qma_datamart',
  'Storedprocedured': 'sp_otp_po_cut_level'},
 {'fieldID': 'df_002',
  'field_name': 'po_cut',
  'table': 'otp_po_cut_level',
  'table_id': 'Table_01',
  'schema': 'qma_datamart',
  'Storedprocedured': 'sp_otp_po_cut_level'},
 {'fieldID': 'df_003',
  'field_name': 'style',
  'table': 'otp_po_cut_level',
  'table_id': 'Table_01',
  'schema': 'qma_datamart',
  'Storedprocedured': 'sp_otp_po_cut_level'},
 {'fieldID': 'df_004',
  'field_name': 'color',
  'table': 'otp_po_cut_level',
  'table_id': 'Table_01',
  'schema': 'qma_datamart',
  'Storedprocedured': 'sp_otp_po_cut_level'},
 {'fieldID': 'df_005',
  'field_name': 'style_description',
  'table': 'otp_po_cut_level',
  'table_id': 'Table_01',
  'schema': 'qma_datamart',
  'Storedprocedured': 'sp_otp_po_cut_level'},
 {'fieldID': 'df_006',
  'field_name': 'goods_description',
  'table': 'otp_po_c

In [7]:
# Load nodes and draw location at relationships
gds.run_cypher('''
    UNWIND $dest_fields AS a
    WITH a.fieldID AS Dest_fieldID, 
         a.field_name AS Dest_field_name, 
         a.table AS table_name,
         a.table_id AS table_ID, 
         a.schema AS schema,
         a.Storedprocedured AS stored_procedure
    MERGE(n0:Table {tableId: table_ID}) SET n0.table_name=table_name
    MERGE(n1:DestinationField {destfieldId: Dest_fieldID}) SET n1.Dest_field_name=Dest_field_name
    MERGE(n2:StoredProcedure {SP_Id: stored_procedure}) SET n2.stored_procedure=stored_procedure
    MERGE(n3:Schema {schema_Id: schema}) SET n3.schema=schema
    
    MERGE(n0)<-[:LOCATED_AT]-(n1)
    MERGE(n2)<-[:CREATED_IN]-(n1)
    MERGE(n3)<-[:INSIDE_OF]-(n0)
   
    RETURN count(n0), count(n1), count(n2), count(n3)
    ''', params={'dest_fields':sub_dict})

Unnamed: 0,count(n0),count(n1),count(n2),count(n3)
0,135,135,135,135


In [8]:
df = pd.read_csv('intermediate_field.csv', dtype=str)
df

Unnamed: 0,fieldID,field_name,table,tableID,schema
0,if_01,misc2,dim_descode,Table_05,spectrum_datalake
1,if_02,misc30,dim_descode,Table_05,spectrum_datalake
2,if_03,misc4,dim_descode,Table_05,spectrum_datalake
3,if_04,iso_cnty,dim_drbn_ctr,Table_06,qma
4,if_05,dc_name,dim_drbn_ctr,Table_06,qma
...,...,...,...,...,...
288,if_289,brand,cbb_mapping,Table_03,qma_datamart
289,if_290,product_supply_group,cbb_mapping,Table_03,qma_datamart
290,if_291,brand2,cbb_mapping,Table_03,qma_datamart
291,if_292,bu,cbb_mapping,Table_03,qma_datamart


In [9]:
sub_dict = df.loc[(df[f'fieldID'] != '?') & (df[f'field_name'] != '?') & (df[f'fieldID'].notna()),
                  ['fieldID', 'field_name', 'table', 'tableID', 'schema']].to_dict('records')
sub_dict

[{'fieldID': 'if_01',
  'field_name': 'misc2',
  'table': 'dim_descode',
  'tableID': 'Table_05',
  'schema': 'spectrum_datalake'},
 {'fieldID': 'if_02',
  'field_name': 'misc30',
  'table': 'dim_descode',
  'tableID': 'Table_05',
  'schema': 'spectrum_datalake'},
 {'fieldID': 'if_03',
  'field_name': 'misc4',
  'table': 'dim_descode',
  'tableID': 'Table_05',
  'schema': 'spectrum_datalake'},
 {'fieldID': 'if_04',
  'field_name': 'iso_cnty',
  'table': 'dim_drbn_ctr',
  'tableID': 'Table_06',
  'schema': 'qma'},
 {'fieldID': 'if_05',
  'field_name': 'dc_name',
  'table': 'dim_drbn_ctr',
  'tableID': 'Table_06',
  'schema': 'qma'},
 {'fieldID': 'if_06',
  'field_name': 'src_sys',
  'table': 'dim_fty_supr ',
  'tableID': 'Table_07',
  'schema': 'qma'},
 {'fieldID': 'if_07',
  'field_name': 'vendor_ffc',
  'table': 'dim_fty_supr ',
  'tableID': 'Table_07',
  'schema': 'qma'},
 {'fieldID': 'if_08',
  'field_name': 'vendor_grp_name',
  'table': 'dim_fty_supr ',
  'tableID': 'Table_07',
  '

In [10]:
# Load nodes and draw location at relationships
gds.run_cypher('''
    UNWIND $int_fields AS a
    WITH a.fieldID AS int_fieldID, 
         a.field_name AS int_field_name, 
         a.table AS table_name,
         a.tableID AS table_ID, 
         a.schema AS schema
    MERGE(n0:Table {tableId: table_ID}) SET n0.table_name=table_name
    MERGE(n1:IntermediateField {intfieldId: int_fieldID}) SET n1.int_field_name=int_field_name
    MERGE(n2:Schema {schema_Id: schema}) SET n2.schema=schema
    
    MERGE(n0)<-[:LOCATED_AT]-(n1)
    MERGE(n2)<-[:INSIDE_OF]-(n0)
   
    RETURN count(n0), count(n1), count(n2)
    ''', params={'int_fields':sub_dict})

Unnamed: 0,count(n0),count(n1),count(n2)
0,293,293,293


## Ingest Relationships
Relationships will consist of the freight forwarding steps as well as links representing connection between transportation segments and transfer between the inbound and output stage of the shipment

In [11]:
df = pd.read_csv('relationship.csv', dtype=str)
df

Unnamed: 0,start_id,end_id,type,stored_procedure,cte_level
0,if_186,df_96,EXTRACT,sp_otp_po_cut_level,first
1,if_228,df_38,EXTRACT,sp_otp_po_cut_level,first
2,if_256,df_75,EXTRACT,sp_otp_po_cut_level,first
3,df_75,df_75,CALCULATED,sp_otp_po_cut_level,first
4,df_93,df_75,CALCULATED,sp_otp_po_cut_level,first
...,...,...,...,...,...
521,if_160,if_92,JOIN,sp_otp_po_cut_level,third
522,if_156,if_92,JOIN,sp_otp_po_cut_level,third
523,if_179,if_92,JOIN,sp_otp_po_cut_level,third
524,if_181,if_92,JOIN,sp_otp_po_cut_level,third


In [12]:
#ingest extract field (case Intermediate -> extract -> Destination) 
sub_dict = df.loc[(df[f'start_id'].str.contains('if')) & (df[f'end_id'].str.contains('df')) & (df[f'type'] == 'EXTRACT'),
                  ['start_id', 'end_id','type', 'stored_procedure','cte_level']].to_dict('records')
sub_dict

[{'start_id': 'if_186',
  'end_id': 'df_96',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_228',
  'end_id': 'df_38',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_256',
  'end_id': 'df_75',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_261',
  'end_id': 'df_93',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_257',
  'end_id': 'df_76',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_273',
  'end_id': 'df_134',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_229',
  'end_id': 'df_39',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_227',
  'end_id': 'df_30',
  

In [13]:
# (n:FIELD)-[r:EXTRACT]-(n:FIELD) directly extract
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:DestinationField {destfieldId: end_Id})
    MERGE(n1)-[r:EXTRACT_TO {stored_procedure: stored_procedure, cte_level: cte_level}]->(n2)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,69


In [14]:
#ingest extract field (case Intermediate -> extract -> Intermediate) 
sub_dict = df.loc[(df[f'start_id'].str.contains('if')) & (df[f'end_id'].str.contains('if')) & (df[f'type'] == 'EXTRACT'),
                  ['start_id', 'end_id','type', 'stored_procedure','cte_level']].to_dict('records')
sub_dict

[{'start_id': 'if_42',
  'end_id': 'if_228',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_75',
  'end_id': 'if_256',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_35',
  'end_id': 'if_261',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_83',
  'end_id': 'if_257',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_89',
  'end_id': 'if_273',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_39',
  'end_id': 'if_229',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_47',
  'end_id': 'if_227',
  'type': 'EXTRACT',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_59',
  'end_id': 'if_24

In [15]:
# (n:FIELD)-[r:EXTRACT]-(n:FIELD) directly extract
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:IntermediateField {intfieldId: end_Id})
    MERGE(n1)-[r:EXTRACT_TO {stored_procedure: stored_procedure, cte_level: cte_level}]->(n2)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,74


In [16]:
#ingest extract field Case inter -> calculated -> destination
sub_dict = df.loc[(df[f'start_id'].str.contains('if')) & (df[f'end_id'].str.contains('df')) & (df[f'type'] == 'CALCULATED'),
                  ['start_id', 'end_id','type', 'stored_procedure','cte_level']].to_dict('records')
sub_dict

[{'start_id': 'if_260',
  'end_id': 'df_92',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_255',
  'end_id': 'df_83',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_207',
  'end_id': 'df_83',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_146',
  'end_id': 'df_83',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_209',
  'end_id': 'df_84',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_208',
  'end_id': 'df_84',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_255',
  'end_id': 'df_121',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_121',
  

In [17]:
# (n:FIELD)-[r:CALCULATED_TO]-(n:FIELD) involved calculation
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:DestinationField {destfieldId: end_Id})
    MERGE(n1)-[r:CALCULATED_TO {stored_procedure: stored_procedure, cte_level: cte_level}]->(n2)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,111


In [18]:
#ingest extract field Case inter -> calculated -> inter
sub_dict = df.loc[(df[f'start_id'].str.contains('if')) & (df[f'end_id'].str.contains('if')) & (df[f'type'] == 'CALCULATED'),
                  ['start_id', 'end_id','type', 'stored_procedure','cte_level']].to_dict('records')
sub_dict

[{'start_id': 'if_98',
  'end_id': 'if_260',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_82',
  'end_id': 'if_218',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_71',
  'end_id': 'if_254',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_72',
  'end_id': 'if_253',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_74',
  'end_id': 'if_264',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_53',
  'end_id': 'if_232',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_55',
  'end_id': 'if_236',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_62

In [19]:
# (n:FIELD)-[r:CALCULATED_TO]-(n:FIELD) involved calculation
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:IntermediateField {intfieldId: end_Id})
    MERGE(n1)-[r:CALCULATED_TO {stored_procedure: stored_procedure, cte_level: cte_level}]->(n2)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,82


In [20]:
#ingest extract field Case destination-> calculated -> destination
sub_dict = df.loc[(df[f'start_id'].str.contains('df')) & (df[f'end_id'].str.contains('df')) & (df[f'type'] == 'CALCULATED'),
                  ['start_id', 'end_id','type', 'stored_procedure','cte_level']].to_dict('records')
sub_dict

[{'start_id': 'df_75',
  'end_id': 'df_75',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'df_93',
  'end_id': 'df_75',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'df_72',
  'end_id': 'df_75',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'df_56',
  'end_id': 'df_56',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'df_72',
  'end_id': 'df_56',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'df_50',
  'end_id': 'df_50',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'df_57',
  'end_id': 'df_57',
  'type': 'CALCULATED',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'df_72',
  'end_id':

In [21]:
# (n:FIELD)-[r:CALCULATED_TO]-(n:FIELD) involved calculation
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level
    MATCH(n1:DestinationField {destfieldId: start_Id})
    MATCH(n2:DestinationField {destfieldId: end_Id})
    MERGE(n1)-[r:CALCULATED_TO {stored_procedure: stored_procedure, cte_level: cte_level}]->(n2)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,49


In [22]:
#ingest extract field inter -> join -> dest
sub_dict = df.loc[(df[f'start_id'].str.contains('if')) & (df[f'end_id'].str.contains('df')) & (df[f'type'] == 'JOIN'),
                  ['start_id', 'end_id','type', 'stored_procedure','cte_level']].to_dict('records')
sub_dict

[{'start_id': 'if_168',
  'end_id': 'df_75',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_42',
  'end_id': 'df_75',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_168',
  'end_id': 'df_56',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_42',
  'end_id': 'df_56',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_213',
  'end_id': 'df_84',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_210',
  'end_id': 'df_84',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_213',
  'end_id': 'df_85',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first'},
 {'start_id': 'if_201',
  'end_id': 'df_85',
  'type': 'JOIN',
  'store

In [23]:
# (n:FIELD)-[r:CALCULATED_TO]-(n:FIELD) join from
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:DestinationField {destfieldId: end_Id})
    MERGE(n2)-[r:JOIN_FROM {stored_procedure: stored_procedure, cte_level: cte_level}]->(n1)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,72


In [24]:
#ingest extract field destination -> join -> inter
sub_dict = df.loc[(df[f'start_id'].str.contains('df')) & (df[f'end_id'].str.contains('if')) & (df[f'type'] == 'JOIN'),
                  ['start_id', 'end_id','type', 'stored_procedure','cte_level']].to_dict('records')
sub_dict

[{'start_id': 'df_38',
  'end_id': 'if_196',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'df_38',
  'end_id': 'if_196',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'}]

In [25]:
# (n:FIELD)-[r:CALCULATED_TO]-(n:FIELD) join from
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level
    MATCH(n1:DestinationField {destfieldId: start_Id})
    MATCH(n2:IntermediateField {intfieldId: end_Id})
    MERGE(n2)-[r:JOIN_FROM {stored_procedure: stored_procedure, cte_level: cte_level}]->(n1)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,2


In [26]:
#ingest extract field destination -> join -> inter
sub_dict = df.loc[(df[f'start_id'].str.contains('if')) & (df[f'end_id'].str.contains('if')) & (df[f'type'] == 'JOIN'),
                  ['start_id', 'end_id','type', 'stored_procedure','cte_level']].to_dict('records')
sub_dict

[{'start_id': 'if_237',
  'end_id': 'if_207',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_10',
  'end_id': 'if_207',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_255',
  'end_id': 'if_207',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_06',
  'end_id': 'if_207',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_09',
  'end_id': 'if_207',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_237',
  'end_id': 'if_209',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_10',
  'end_id': 'if_209',
  'type': 'JOIN',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second'},
 {'start_id': 'if_255',
  'end_id': 'if_209',
  'type': 'JO

In [27]:
# (n:FIELD)-[r:CALCULATED_TO]-(n:FIELD) join from
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:IntermediateField {intfieldId: end_Id})
    MERGE(n2)-[r:JOIN_FROM {stored_procedure: stored_procedure, cte_level: cte_level}]->(n1)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,48


In [28]:
#ingest extract field dest -> join -> dest
sub_dict = df.loc[(df[f'start_id'].str.contains('df')) & (df[f'end_id'].str.contains('df')) & (df[f'type'] == 'JOIN'),
                  ['start_id', 'end_id','type', 'stored_procedure','cte_level']].to_dict('records')

# (n:FIELD)-[r:CALCULATED_TO]-(n:FIELD) join from
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level
    MATCH(n1:DestinationField {destfieldId: start_Id})
    MATCH(n2:DestinationField {destfieldId: end_Id})
    MERGE(n2)-[r:JOIN_FROM {stored_procedure: stored_procedure, cte_level: cte_level}]->(n1)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,18
