In [1]:
import getpass
import os

import pandas as pd

## Connect to Graph Data Science

In [2]:
from graphdatascience import GraphDataScience

# Connection settings are read from the environment so that credentials are not
# stored in the notebook. NEO4J_URI and NEO4J_USER fall back to the local
# defaults; the password is prompted for when NEO4J_PASSWORD is not set.
neo4j_uri = os.environ.get('NEO4J_URI', 'neo4j://localhost')
neo4j_user = os.environ.get('NEO4J_USER', 'neo4j')
neo4j_password = os.environ.get('NEO4J_PASSWORD') or getpass.getpass('Neo4j password: ')
gds = GraphDataScience(neo4j_uri, auth=(neo4j_user, neo4j_password))

## Staging for ETL
1. Clear the graph of any existing data and indexes
2. Read the source data
3. Create helper functions
4. Set Neo4j Indexes


In [3]:
# Reset the database before loading: first delete every node together with its
# relationships, then let APOC drop all indexes and constraints (asserting an
# empty schema). The APOC result is left as the cell value so the dropped
# schema items are displayed.
delete_all_nodes = 'MATCH(n) DETACH DELETE n'
drop_all_schema = 'CALL apoc.schema.assert({},{})'
gds.run_cypher(delete_all_nodes)
gds.run_cypher(drop_all_schema)

Unnamed: 0,label,key,keys,unique,action
0,EXTRACT_TO,pathIndex,[pathIndex],False,DROPPED
1,CALCULATED_TO,pathIndex,[pathIndex],False,DROPPED
2,JOIN_FROM,pathIndex,[pathIndex],False,DROPPED
3,Table,tableId,[tableId],True,DROPPED
4,Schema,schema_Id,[schema_Id],True,DROPPED
5,Subquery,subqueryId,[subqueryId],True,DROPPED
6,StoredProcedure,SP_Id,[SP_Id],True,DROPPED
7,Intermediate_Field,intId,[intId],True,DROPPED
8,DestinationField,destId,[destId],True,DROPPED
9,DestinationField,destfieldId,[destfieldId],True,DROPPED


In [4]:
# Schema setup: uniqueness constraints on the node id properties, followed by
# relationship indexes on pathIndex. Statements run in the order listed.
# NOTE(review): the load cells in this notebook merge DestinationField on
# destfieldId and IntermediateField on intfieldId; destId and intId are not set
# by any visible cell - confirm those two constraints are still needed.
node_constraints = [
    'CREATE CONSTRAINT table_unique IF NOT EXISTS FOR (n:Table) REQUIRE n.tableId  IS UNIQUE',
    'CREATE CONSTRAINT destination_field_unique IF NOT EXISTS FOR (n:DestinationField) REQUIRE n.destId  IS UNIQUE',
    'CREATE CONSTRAINT intermediate_field_unique IF NOT EXISTS FOR (n:IntermediateField) REQUIRE n.intId  IS UNIQUE',
    'CREATE CONSTRAINT schema_unique IF NOT EXISTS FOR (n:Schema) REQUIRE n.schema_Id  IS UNIQUE',
    'CREATE CONSTRAINT dest_field_unique IF NOT EXISTS FOR (n:DestinationField) REQUIRE n.destfieldId  IS UNIQUE',
    'CREATE CONSTRAINT int_field_unique IF NOT EXISTS FOR (n:IntermediateField) REQUIRE n.intfieldId  IS UNIQUE',
    'CREATE CONSTRAINT subquery_unique IF NOT EXISTS FOR (n:Subquery) REQUIRE n.subqueryId  IS UNIQUE',
    'CREATE CONSTRAINT sp_unique IF NOT EXISTS FOR (n:StoredProcedure) REQUIRE n.SP_Id  IS UNIQUE',
]
relationship_indexes = [
    'CREATE INDEX extract_path IF NOT EXISTS FOR ()-[r:EXTRACT_TO]-() ON (r.pathIndex)',
    'CREATE INDEX calculated_path IF NOT EXISTS FOR ()-[r:CALCULATED_TO ]-() ON (r.pathIndex)',
    'CREATE INDEX join_path IF NOT EXISTS FOR ()-[r:JOIN_FROM ]-() ON (r.pathIndex)',
]
schema_results = [gds.run_cypher(statement) for statement in node_constraints + relationship_indexes]
# Keep the result of the final statement as the cell value.
schema_results[-1]

In [5]:
# Destination-field listing; every column is read as text (dtype=str).
df = pd.read_csv(filepath_or_buffer='destination_field_2_0.csv', dtype=str)
df

Unnamed: 0,table,fieldID,field_name,table.1,table_id,schema,Storedprocedured
0,otp_po_cut_level,df_001,country_of_origin,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
1,otp_po_cut_level,df_002,po_cut,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
2,otp_po_cut_level,df_003,style,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
3,otp_po_cut_level,df_004,color,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
4,otp_po_cut_level,df_005,style_description,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
...,...,...,...,...,...,...,...
130,otp_po_cut_level,df_131,mdg_vendor_purchasing_block,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
131,otp_po_cut_level,df_132,mdg_factory_created_on,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
132,otp_po_cut_level,df_133,mdg_factory_purchasing_block,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level
133,otp_po_cut_level,df_134,fob_price,otp_po_cut_level,Table_01,qma_datamart,sp_otp_po_cut_level


In [6]:
# Keep rows whose field id and field name are not the '?' placeholder and whose
# field id is present, then turn the selected columns into one dict per row.
wanted_columns = ['fieldID', 'field_name', 'table', 'table_id', 'schema', 'Storedprocedured']
field_id_known = df['fieldID'] != '?'
field_name_known = df['field_name'] != '?'
field_id_present = df['fieldID'].notna()
sub_dict = df.loc[field_id_known & field_name_known & field_id_present, wanted_columns].to_dict('records')
sub_dict

[{'fieldID': 'df_001',
  'field_name': 'country_of_origin',
  'table': 'otp_po_cut_level',
  'table_id': 'Table_01',
  'schema': 'qma_datamart',
  'Storedprocedured': 'sp_otp_po_cut_level'},
 {'fieldID': 'df_002',
  'field_name': 'po_cut',
  'table': 'otp_po_cut_level',
  'table_id': 'Table_01',
  'schema': 'qma_datamart',
  'Storedprocedured': 'sp_otp_po_cut_level'},
 {'fieldID': 'df_003',
  'field_name': 'style',
  'table': 'otp_po_cut_level',
  'table_id': 'Table_01',
  'schema': 'qma_datamart',
  'Storedprocedured': 'sp_otp_po_cut_level'},
 {'fieldID': 'df_004',
  'field_name': 'color',
  'table': 'otp_po_cut_level',
  'table_id': 'Table_01',
  'schema': 'qma_datamart',
  'Storedprocedured': 'sp_otp_po_cut_level'},
 {'fieldID': 'df_005',
  'field_name': 'style_description',
  'table': 'otp_po_cut_level',
  'table_id': 'Table_01',
  'schema': 'qma_datamart',
  'Storedprocedured': 'sp_otp_po_cut_level'},
 {'fieldID': 'df_006',
  'field_name': 'goods_description',
  'table': 'otp_po_c

In [7]:
# Merge Table, DestinationField, StoredProcedure and Schema nodes for each record
# and connect them: field -LOCATED_AT-> table, field -CREATED_IN-> stored
# procedure, table -INSIDE_OF-> schema. The returned counts are per input row.
dest_field_query = '''
    UNWIND $dest_fields AS a
    WITH a.fieldID AS Dest_fieldID, 
         a.field_name AS Dest_field_name, 
         a.table AS table_name,
         a.table_id AS table_ID, 
         a.schema AS schema,
         a.Storedprocedured AS stored_procedure
    MERGE(n0:Table {tableId: table_ID}) SET n0.table_name=table_name
    MERGE(n1:DestinationField {destfieldId: Dest_fieldID}) SET n1.Dest_field_name=Dest_field_name
    MERGE(n2:StoredProcedure {SP_Id: stored_procedure}) SET n2.stored_procedure=stored_procedure
    MERGE(n3:Schema {schema_Id: schema}) SET n3.schema=schema
    
    MERGE(n0)<-[:LOCATED_AT]-(n1)
    MERGE(n2)<-[:CREATED_IN]-(n1)
    MERGE(n3)<-[:INSIDE_OF]-(n0)
   
    RETURN count(n0), count(n1), count(n2), count(n3)
    '''
dest_field_params = {'dest_fields': sub_dict}
gds.run_cypher(dest_field_query, params=dest_field_params)

Unnamed: 0,count(n0),count(n1),count(n2),count(n3)
0,135,135,135,135


In [8]:
# Intermediate-field listing; every column is read as text (dtype=str).
df = pd.read_csv(filepath_or_buffer='intermediate_field_2_0.csv', dtype=str)
df

Unnamed: 0,field_id,field_name,table,table_id,schema
0,if_001,avg_duty_fees_rt,#fob_duties_rate,Table_02,temp
1,if_002,avg_duty_fees_rt,#fob_duties_rate,Table_02,temp
2,if_003,article,#standard_duty_rate_article,Table_03,temp
3,if_004,main_duty_rate,#standard_duty_rate_article,Table_03,temp
4,if_005,name,#standard_duty_rate_article,Table_03,temp
...,...,...,...,...,...
287,if_288,bu,z2,Table_46,subquery
288,if_289,major_product_category,z2,Table_46,subquery
289,if_290,market,z2,Table_46,subquery
290,if_291,product_supply_group,z2,Table_46,subquery


In [9]:
# Keep rows whose field id and field name are not the '?' placeholder and whose
# field id is present, then turn the selected columns into one dict per row.
wanted_columns = ['field_id', 'field_name', 'table', 'table_id', 'schema']
field_id_known = df['field_id'] != '?'
field_name_known = df['field_name'] != '?'
field_id_present = df['field_id'].notna()
sub_dict = df.loc[field_id_known & field_name_known & field_id_present, wanted_columns].to_dict('records')
sub_dict

[{'field_id': 'if_001',
  'field_name': 'avg_duty_fees_rt',
  'table': '#fob_duties_rate',
  'table_id': 'Table_02',
  'schema': 'temp'},
 {'field_id': 'if_002',
  'field_name': 'avg_duty_fees_rt',
  'table': '#fob_duties_rate',
  'table_id': 'Table_02',
  'schema': 'temp'},
 {'field_id': 'if_003',
  'field_name': 'article',
  'table': '#standard_duty_rate_article',
  'table_id': 'Table_03',
  'schema': 'temp'},
 {'field_id': 'if_004',
  'field_name': 'main_duty_rate',
  'table': '#standard_duty_rate_article',
  'table_id': 'Table_03',
  'schema': 'temp'},
 {'field_id': 'if_005',
  'field_name': 'name',
  'table': '#standard_duty_rate_article',
  'table_id': 'Table_03',
  'schema': 'temp'},
 {'field_id': 'if_006',
  'field_name': 'hts_code',
  'table': '#standard_duty_rate_hts_code',
  'table_id': 'Table_04',
  'schema': 'temp'},
 {'field_id': 'if_007',
  'field_name': 'main_duty_rate',
  'table': '#standard_duty_rate_hts_code',
  'table_id': 'Table_04',
  'schema': 'temp'},
 {'field_i

In [10]:
# Merge Table, IntermediateField and Schema nodes for each record and connect
# them: field -LOCATED_AT-> table, table -INSIDE_OF-> schema.
# The returned counts are per input row.
int_field_query = '''
    UNWIND $int_fields AS a
    WITH a.field_id AS int_fieldID, 
         a.field_name AS int_field_name, 
         a.table AS table_name,
         a.table_id AS table_ID, 
         a.schema AS schema
    MERGE(n0:Table {tableId: table_ID}) SET n0.table_name=table_name
    MERGE(n1:IntermediateField {intfieldId: int_fieldID}) SET n1.int_field_name=int_field_name
    MERGE(n2:Schema {schema_Id: schema}) SET n2.schema=schema
    
    MERGE(n0)<-[:LOCATED_AT]-(n1)
    MERGE(n2)<-[:INSIDE_OF]-(n0)
   
    RETURN count(n0), count(n1), count(n2)
    '''
int_field_params = {'int_fields': sub_dict}
gds.run_cypher(int_field_query, params=int_field_params)

Unnamed: 0,count(n0),count(n1),count(n2)
0,292,292,292


## Ingest Relationships
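Relationships link the field nodes loaded above. Three relationship types are ingested: `EXTRACT_TO` (direct extract), `CALCULATED_TO` (the source field is involved in a calculation) and `JOIN_FROM` (the field is used in a join). Each relationship carries the stored procedure, the CTE level and a `pathIndex` taken from the `path_index` column of the relationship file.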

In [11]:
# Relationship listing, read as text; path_index is lower-cased on load.
df = pd.read_csv('relationship_2_0.csv', dtype=str).assign(
    path_index=lambda rel_frame: rel_frame['path_index'].str.lower()
)
df

Unnamed: 0,:START_POSITION,start_id,:END_POSITION,end_id,type,stored_procedure,cte_level,path_index
0,sp_country_master \\ name,if_201,otp_po_cut_level \\ country_of_origin,df_001,extract,sp_otp_po_cut_level,first,country_of_origin
1,z \\ po_cut,if_255,otp_po_cut_level \\ po_cut,df_002,extract,sp_otp_po_cut_level,first,po_cut
2,z \\ style,if_282,otp_po_cut_level \\ style,df_003,extract,sp_otp_po_cut_level,first,style
3,otp_po_cut_level \\ style,df_003,otp_po_cut_level \\ style,df_003,calculated,sp_otp_po_cut_level,first,style
4,otp_po_cut_level \\ color,df_004,otp_po_cut_level \\ style,df_003,calculated,sp_otp_po_cut_level,first,style
...,...,...,...,...,...,...,...,...
648,otp_po_cut_level \\ market,df_082,#fob_duties_rate \\ avg_duty_fees_rt,if_001,join,sp_otp_po_cut_level,third,fob_duties_rate
649,otp_po_cut_level \\ hts_code,df_051,#fob_duties_rate \\ avg_duty_fees_rt,if_001,join,sp_otp_po_cut_level,third,fob_duties_rate
650,otp_po_cut_level \\ country_of_origin,df_001,#fob_duties_rate \\ avg_duty_fees_rt,if_001,join,sp_otp_po_cut_level,third,fob_duties_rate
651,otp_po_cut_level \\ destination_country,df_105,#fob_duties_rate \\ avg_duty_fees_rt,if_001,join,sp_otp_po_cut_level,third,fob_duties_rate


In [12]:
# Select 'extract' rows going from an intermediate field (start_id contains 'if')
# to a destination field (end_id contains 'df'), as one dict per row.
edge_columns = ['start_id', 'end_id', 'type', 'stored_procedure', 'cte_level', 'path_index']
start_is_intermediate = df['start_id'].str.contains('if')
end_is_destination = df['end_id'].str.contains('df')
is_extract = df['type'] == 'extract'
sub_dict = df.loc[start_is_intermediate & end_is_destination & is_extract, edge_columns].to_dict('records')
sub_dict

[{'start_id': 'if_201',
  'end_id': 'df_001',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'country_of_origin'},
 {'start_id': 'if_255',
  'end_id': 'df_002',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'po_cut'},
 {'start_id': 'if_282',
  'end_id': 'df_003',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'style'},
 {'start_id': 'if_228',
  'end_id': 'df_004',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'color'},
 {'start_id': 'if_283',
  'end_id': 'df_005',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'style_description'},
 {'start_id': 'if_240',
  'end_id': 'df_006',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'goods_descri

In [13]:
# (IntermediateField)-[:EXTRACT_TO]->(DestinationField): direct extract.
# pathIndex is part of the MERGE key. When the key was only stored_procedure and
# cte_level, rows sharing both endpoints but belonging to different paths were
# merged into one relationship and the later path_index was silently dropped.
# Exact duplicate rows still collapse into a single relationship.
# MERGE rejects null property values, so rows are assumed to carry a path_index.
# count(r) is the number of input rows matched, not the number of relationships.
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level,
        relMap.path_index AS path_index
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:DestinationField {destfieldId: end_Id})
    MERGE(n1)-[r:EXTRACT_TO {stored_procedure: stored_procedure, cte_level: cte_level, pathIndex: path_index}]->(n2)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,97


In [14]:
# Select 'extract' rows whose start and end are both intermediate fields
# (ids containing 'if'), as one dict per row.
edge_columns = ['start_id', 'end_id', 'type', 'stored_procedure', 'cte_level', 'path_index']
start_is_intermediate = df['start_id'].str.contains('if')
end_is_intermediate = df['end_id'].str.contains('if')
is_extract = df['type'] == 'extract'
sub_dict = df.loc[start_is_intermediate & end_is_intermediate & is_extract, edge_columns].to_dict('records')
sub_dict

[{'start_id': 'if_063',
  'end_id': 'if_255',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'po_cut'},
 {'start_id': 'if_115',
  'end_id': 'if_282',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'style'},
 {'start_id': 'if_058',
  'end_id': 'if_228',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'color'},
 {'start_id': 'if_074',
  'end_id': 'if_283',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'style_description'},
 {'start_id': 'if_080',
  'end_id': 'if_240',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'goods_description'},
 {'start_id': 'if_083',
  'end_id': 'if_256',
  'type': 'extract',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'po_iss

In [15]:
# (IntermediateField)-[:EXTRACT_TO]->(IntermediateField): direct extract.
# pathIndex is part of the MERGE key. When the key was only stored_procedure and
# cte_level, rows sharing both endpoints but belonging to different paths were
# merged into one relationship and the later path_index was silently dropped.
# Exact duplicate rows still collapse into a single relationship.
# MERGE rejects null property values, so rows are assumed to carry a path_index.
# count(r) is the number of input rows matched, not the number of relationships.
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level,
        relMap.path_index AS path_index
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:IntermediateField {intfieldId: end_Id})
    MERGE(n1)-[r:EXTRACT_TO {stored_procedure: stored_procedure, cte_level: cte_level, pathIndex: path_index}]->(n2)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,96


In [16]:
# Select 'calculated' rows going from an intermediate field (start_id contains
# 'if') to a destination field (end_id contains 'df'), as one dict per row.
edge_columns = ['start_id', 'end_id', 'type', 'stored_procedure', 'cte_level', 'path_index']
start_is_intermediate = df['start_id'].str.contains('if')
end_is_destination = df['end_id'].str.contains('df')
is_calculated = df['type'] == 'calculated'
sub_dict = df.loc[start_is_intermediate & end_is_destination & is_calculated, edge_columns].to_dict('records')
sub_dict

[{'start_id': 'if_226',
  'end_id': 'df_010',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'actual_crd_at_origin'},
 {'start_id': 'if_281',
  'end_id': 'df_022',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'vendor_ffc'},
 {'start_id': 'if_221',
  'end_id': 'df_022',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'vendor_ffc'},
 {'start_id': 'if_221',
  'end_id': 'df_022',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'vendor_ffc'},
 {'start_id': 'if_221',
  'end_id': 'df_022',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'vendor_ffc'},
 {'start_id': 'if_221',
  'end_id': 'df_022',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
 

In [17]:
# (IntermediateField)-[:CALCULATED_TO]->(DestinationField): calculation input.
# pathIndex is part of the MERGE key. When the key was only stored_procedure and
# cte_level, rows sharing both endpoints but belonging to different paths were
# merged into one relationship and the later path_index was silently dropped.
# Exact duplicate rows still collapse into a single relationship.
# MERGE rejects null property values, so rows are assumed to carry a path_index.
# count(r) is the number of input rows matched, not the number of relationships.
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level,
        relMap.path_index AS path_index
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:DestinationField {destfieldId: end_Id})
    MERGE(n1)-[r:CALCULATED_TO {stored_procedure: stored_procedure, cte_level: cte_level, pathIndex: path_index}]->(n2)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,145


In [18]:
# Select 'calculated' rows whose start and end are both intermediate fields
# (ids containing 'if'), as one dict per row.
edge_columns = ['start_id', 'end_id', 'type', 'stored_procedure', 'cte_level', 'path_index']
start_is_intermediate = df['start_id'].str.contains('if')
end_is_intermediate = df['end_id'].str.contains('if')
is_calculated = df['type'] == 'calculated'
sub_dict = df.loc[start_is_intermediate & end_is_intermediate & is_calculated, edge_columns].to_dict('records')
sub_dict

[{'start_id': 'if_118',
  'end_id': 'if_226',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'actual_crd_at_origin'},
 {'start_id': 'if_073',
  'end_id': 'if_245',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'local_currency'},
 {'start_id': 'if_111',
  'end_id': 'if_280',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'sourcing_office'},
 {'start_id': 'if_112',
  'end_id': 'if_279',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'source_system'},
 {'start_id': 'if_114',
  'end_id': 'if_231',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'dc_code'},
 {'start_id': 'if_094',
  'end_id': 'if_259',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_leve

In [19]:
# (IntermediateField)-[:CALCULATED_TO]->(IntermediateField): calculation input.
# pathIndex is part of the MERGE key. When the key was only stored_procedure and
# cte_level, rows sharing both endpoints but belonging to different paths were
# merged into one relationship and the later path_index was silently dropped.
# Exact duplicate rows still collapse into a single relationship.
# MERGE rejects null property values, so rows are assumed to carry a path_index.
# count(r) is the number of input rows matched, not the number of relationships.
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level,
        relMap.path_index AS path_index
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:IntermediateField {intfieldId: end_Id})
    MERGE(n1)-[r:CALCULATED_TO {stored_procedure: stored_procedure, cte_level: cte_level, pathIndex: path_index}]->(n2)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,84


In [20]:
# Select 'calculated' rows whose start and end are both destination fields
# (ids containing 'df'), as one dict per row.
edge_columns = ['start_id', 'end_id', 'type', 'stored_procedure', 'cte_level', 'path_index']
start_is_destination = df['start_id'].str.contains('df')
end_is_destination = df['end_id'].str.contains('df')
is_calculated = df['type'] == 'calculated'
sub_dict = df.loc[start_is_destination & end_is_destination & is_calculated, edge_columns].to_dict('records')
sub_dict

[{'start_id': 'df_003',
  'end_id': 'df_003',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'style'},
 {'start_id': 'df_004',
  'end_id': 'df_003',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'style'},
 {'start_id': 'df_041',
  'end_id': 'df_003',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'style'},
 {'start_id': 'df_012',
  'end_id': 'df_012',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'season'},
 {'start_id': 'df_041',
  'end_id': 'df_012',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'season'},
 {'start_id': 'df_030',
  'end_id': 'df_030',
  'type': 'calculated',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'report_order_qty_

In [21]:
# (DestinationField)-[:CALCULATED_TO]->(DestinationField): calculation input.
# pathIndex is part of the MERGE key. When the key was only stored_procedure and
# cte_level, rows sharing both endpoints but belonging to different paths were
# merged into one relationship and the later path_index was silently dropped.
# Exact duplicate rows still collapse into a single relationship.
# MERGE rejects null property values, so rows are assumed to carry a path_index.
# count(r) is the number of input rows matched, not the number of relationships.
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level,
        relMap.path_index AS path_index
    MATCH(n1:DestinationField {destfieldId: start_Id})
    MATCH(n2:DestinationField {destfieldId: end_Id})
    MERGE(n1)-[r:CALCULATED_TO {stored_procedure: stored_procedure, cte_level: cte_level, pathIndex: path_index}]->(n2)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,82


In [22]:
# Select 'join' rows going from an intermediate field (start_id contains 'if')
# to a destination field (end_id contains 'df'), as one dict per row.
edge_columns = ['start_id', 'end_id', 'type', 'stored_procedure', 'cte_level', 'path_index']
start_is_intermediate = df['start_id'].str.contains('if')
end_is_destination = df['end_id'].str.contains('df')
is_join = df['type'] == 'join'
sub_dict = df.loc[start_is_intermediate & end_is_destination & is_join, edge_columns].to_dict('records')
sub_dict

[{'start_id': 'if_063',
  'end_id': 'df_003',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'style'},
 {'start_id': 'if_063',
  'end_id': 'df_012',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'season'},
 {'start_id': 'if_214',
  'end_id': 'df_023',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'vendor_group_name'},
 {'start_id': 'if_224',
  'end_id': 'df_023',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'vendor_group_name'},
 {'start_id': 'if_214',
  'end_id': 'df_028',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'vendor_name'},
 {'start_id': 'if_215',
  'end_id': 'df_028',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'first',
  'path_index': 'vendor_name'},
 {'start_

In [23]:
# (DestinationField)-[:JOIN_FROM]->(IntermediateField): the relationship points
# from the row's end_id node (n2) back to its start_id node (n1).
# pathIndex is part of the MERGE key. When the key was only stored_procedure and
# cte_level, rows sharing both endpoints but belonging to different paths were
# merged into one relationship and the later path_index was silently dropped.
# Exact duplicate rows still collapse into a single relationship.
# MERGE rejects null property values, so rows are assumed to carry a path_index.
# count(r) is the number of input rows matched, not the number of relationships.
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level,
        relMap.path_index AS path_index
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:DestinationField {destfieldId: end_Id})
    MERGE(n2)-[r:JOIN_FROM {stored_procedure: stored_procedure, cte_level: cte_level, pathIndex: path_index}]->(n1)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,54


In [24]:
# Select 'join' rows going from a destination field (start_id contains 'df')
# to an intermediate field (end_id contains 'if'), as one dict per row.
edge_columns = ['start_id', 'end_id', 'type', 'stored_procedure', 'cte_level', 'path_index']
start_is_destination = df['start_id'].str.contains('df')
end_is_intermediate = df['end_id'].str.contains('if')
is_join = df['type'] == 'join'
sub_dict = df.loc[start_is_destination & end_is_intermediate & is_join, edge_columns].to_dict('records')
sub_dict

[{'start_id': 'df_002',
  'end_id': 'if_207',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'certified_pocut_flag'},
 {'start_id': 'df_002',
  'end_id': 'if_207',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'certprintdate'},
 {'start_id': 'df_082',
  'end_id': 'if_001',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'third',
  'path_index': 'fob_duties_rate'},
 {'start_id': 'df_051',
  'end_id': 'if_001',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'third',
  'path_index': 'fob_duties_rate'},
 {'start_id': 'df_001',
  'end_id': 'if_001',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'third',
  'path_index': 'fob_duties_rate'},
 {'start_id': 'df_105',
  'end_id': 'if_001',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'third',
  'path_index': '

In [25]:
# (IntermediateField)-[:JOIN_FROM]->(DestinationField): the relationship points
# from the row's end_id node (n2) back to its start_id node (n1).
# pathIndex is part of the MERGE key. When the key was only stored_procedure and
# cte_level, rows sharing both endpoints but belonging to different paths were
# merged into one relationship and the later path_index was silently dropped
# (e.g. df_002 -> if_207 appears with two different path_index values).
# Exact duplicate rows still collapse into a single relationship.
# MERGE rejects null property values, so rows are assumed to carry a path_index.
# count(r) is the number of input rows matched, not the number of relationships.
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level,
        relMap.path_index AS path_index
    MATCH(n1:DestinationField {destfieldId: start_Id})
    MATCH(n2:IntermediateField {intfieldId: end_Id})
    MERGE(n2)-[r:JOIN_FROM {stored_procedure: stored_procedure, cte_level: cte_level, pathIndex: path_index}]->(n1)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,7


In [26]:
# Select 'join' rows whose start and end are both intermediate fields
# (ids containing 'if'), as one dict per row.
edge_columns = ['start_id', 'end_id', 'type', 'stored_procedure', 'cte_level', 'path_index']
start_is_intermediate = df['start_id'].str.contains('if')
end_is_intermediate = df['end_id'].str.contains('if')
is_join = df['type'] == 'join'
sub_dict = df.loc[start_is_intermediate & end_is_intermediate & is_join, edge_columns].to_dict('records')
sub_dict

[{'start_id': 'if_263',
  'end_id': 'if_221',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'vendor_ffc'},
 {'start_id': 'if_032',
  'end_id': 'if_221',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'vendor_ffc'},
 {'start_id': 'if_281',
  'end_id': 'if_221',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'vendor_ffc'},
 {'start_id': 'if_033',
  'end_id': 'if_221',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'vendor_ffc'},
 {'start_id': 'if_031',
  'end_id': 'if_221',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'vendor_ffc'},
 {'start_id': 'if_263',
  'end_id': 'if_223',
  'type': 'join',
  'stored_procedure': 'sp_otp_po_cut_level',
  'cte_level': 'second',
  'path_index': 'vendor_group_name'},
 {'

In [27]:
# (IntermediateField)-[:JOIN_FROM]->(IntermediateField): the relationship points
# from the row's end_id node (n2) back to its start_id node (n1).
# pathIndex is part of the MERGE key. When the key was only stored_procedure and
# cte_level, rows sharing both endpoints but belonging to different paths were
# merged into one relationship and the later path_index was silently dropped.
# Exact duplicate rows still collapse into a single relationship.
# MERGE rejects null property values, so rows are assumed to carry a path_index.
# count(r) is the number of input rows matched, not the number of relationships.
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level,
        relMap.path_index AS path_index
    MATCH(n1:IntermediateField {intfieldId: start_Id})
    MATCH(n2:IntermediateField {intfieldId: end_Id})
    MERGE(n2)-[r:JOIN_FROM {stored_procedure: stored_procedure, cte_level: cte_level, pathIndex: path_index}]->(n1)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,43


In [28]:
# Select 'join' rows whose start and end are both destination fields
# (ids containing 'df'), as one dict per row.
sub_dict = df.loc[(df['start_id'].str.contains('df')) & (df['end_id'].str.contains('df')) & (df['type'] == 'join'),
                  ['start_id', 'end_id','type', 'stored_procedure','cte_level', 'path_index']].to_dict('records')

# (DestinationField)-[:JOIN_FROM]->(DestinationField): the relationship points
# from the row's end_id node (n2) back to its start_id node (n1).
# pathIndex is part of the MERGE key. When the key was only stored_procedure and
# cte_level, rows sharing both endpoints but belonging to different paths were
# merged into one relationship and the later path_index was silently dropped.
# Exact duplicate rows still collapse into a single relationship.
# MERGE rejects null property values, so rows are assumed to carry a path_index.
# count(r) is the number of input rows matched, not the number of relationships.
gds.run_cypher('''
    UNWIND $relMaps AS relMap
    WITH relMap.start_id AS start_Id,
        relMap.end_id AS end_Id,
        relMap.stored_procedure AS stored_procedure,
        relMap.cte_level AS cte_level,
        relMap.path_index AS path_index
    MATCH(n1:DestinationField {destfieldId: start_Id})
    MATCH(n2:DestinationField {destfieldId: end_Id})
    MERGE(n2)-[r:JOIN_FROM {stored_procedure: stored_procedure, cte_level: cte_level, pathIndex: path_index}]->(n1)
    RETURN count(r)
''', params={'relMaps':sub_dict})

Unnamed: 0,count(r)
0,45
