# Synapse Pipeline Helper

In [1]:
import json
import os

## 1. Ingest Existing Metadata

In [31]:
# Field names on the fact side, in positional order. The order matters: a later
# cell pairs this list element-by-element with submission_fields via zip().
fact_fields = [
    # leading columns
    "uid", "fin_year", "record_type", "State", "state_name",
    "housing_places", "other_depts",
    # rev_* columns
    "rev_dva", "rev_recoveries", "rev_state_health", "rev_cwlth_other",
    "rev_patients", "rev_other", "rev_state_other",
    # res_exp_* columns
    "res_exp_academic", "res_exp_training", "res_exp_insurance",
    "res_exp_mh_act", "res_exp_promotion", "res_exp_research",
    "res_exp_transport", "res_exp_program_admin", "res_exp_property_lease",
    "res_exp_service_dev", "res_exp_super", "res_exp_workers_comp",
    "res_exp_other",
    # trailing flag column
    "is_complete",
]

print(fact_fields)


['uid', 'fin_year', 'record_type', 'State', 'state_name', 'housing_places', 'other_depts', 'rev_dva', 'rev_recoveries', 'rev_state_health', 'rev_cwlth_other', 'rev_patients', 'rev_other', 'rev_state_other', 'res_exp_academic', 'res_exp_training', 'res_exp_insurance', 'res_exp_mh_act', 'res_exp_promotion', 'res_exp_research', 'res_exp_transport', 'res_exp_program_admin', 'res_exp_property_lease', 'res_exp_service_dev', 'res_exp_super', 'res_exp_workers_comp', 'res_exp_other', 'is_complete']


In [32]:
# Field names on the submission side, in positional order. The order matters: a
# later cell pairs this list element-by-element with fact_fields via zip().
submission_fields = [
    # leading columns
    "username", "fin_year", "RecType", "State", "StateName",
    "NHousePlaces", "NgoOtherDepts",
    # Rev* columns
    "RevDVA", "RevRecov", "RevStateHealth", "RevCwlthOther",
    "RevPatients", "RevOther", "RevStateOther",
    # ExpNer* columns
    "ExpNerAcademic", "ExpNerTraining", "ExpNerInsur",
    "ExpNerMHAct", "ExpNerPromo", "ExpNerResearch",
    "ExpNerTransp", "ExpNerProgAdmin", "ExpNerPropLease",
    "ExpNerServDev", "ExpNerSuper", "ExpNerWorkComp",
    "ExpNerOther",
    # trailing flag column
    "considered_complete",
]

print(submission_fields)


['username', 'fin_year', 'RecType', 'State', 'StateName', 'NHousePlaces', 'NgoOtherDepts', 'RevDVA', 'RevRecov', 'RevStateHealth', 'RevCwlthOther', 'RevPatients', 'RevOther', 'RevStateOther', 'ExpNerAcademic', 'ExpNerTraining', 'ExpNerInsur', 'ExpNerMHAct', 'ExpNerPromo', 'ExpNerResearch', 'ExpNerTransp', 'ExpNerProgAdmin', 'ExpNerPropLease', 'ExpNerServDev', 'ExpNerSuper', 'ExpNerWorkComp', 'ExpNerOther', 'considered_complete']


## 2. Create Mapping 

### 2.1 Between Submission fields and Fact fields

In [17]:
# Root of the local Synapse-Analytics-workspace checkout.
# Set the SYNAPSE_REPO_DIR environment variable to point at your own clone;
# the previously hard-coded path is kept as the fallback so existing runs on the
# original machine behave exactly as before. An empty variable also falls back.
repo_dir = (
    os.environ.get("SYNAPSE_REPO_DIR")
    or r"C:\Users\he157349\source\repos\Synapse-Analytics-workspace"
)

# Schema folders, expressed relative to repo_dir. They keep their leading
# backslash because the path-building cell appends them directly to repo_dir.
fact_schema_dir        = r"\master_and_reference\schema\a1_mhe\fact_schemas"
submission_schema_dir  = r"\master_and_reference\schema\a1_mhe"

# File names of the two schema documents inside those folders.
fact_schema_filename       = "fact_state.json"
submission_schema_filename = "mhe_submission_state.json"

In [18]:
# Full Windows-style paths to the two schema files: <repo><schema dir>\<file name>.
# The *_schema_dir values already start with a backslash, so the only separator
# added here is the one in front of the file name.
fact_schema_file = f"{repo_dir}{fact_schema_dir}\\{fact_schema_filename}"
submission_schema_file = f"{repo_dir}{submission_schema_dir}\\{submission_schema_filename}"

### 2.2 Load data from schema files

In [19]:
import json

# Read both schema documents into memory.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) instead of
# relying on the platform default, which on Windows is usually a legacy code
# page and can mis-decode non-ASCII characters.
with open(fact_schema_file, 'r', encoding='utf-8') as f:
    fact_schema_data = json.load(f)

with open(submission_schema_file, 'r', encoding='utf-8') as f:
    submission_schema_data = json.load(f)

### 2.3 Create mapping

In [None]:
# Map every fact field to the submission field that carries exactly the same
# name. fact_fields and submission_fields are lists of plain strings, so the
# names are compared directly; indexing an element with ['name'] raises
# TypeError. Fields whose names differ between the two sides are left out here.
mapping = {
    fact: fact
    for fact in fact_fields
    if fact in submission_fields
}

print(mapping)

{'fin_year': 'fin_year', 'State': 'State'}


In [33]:
# Pair the two field lists by position: the i-th submission field feeds the
# i-th fact field. zip() would silently drop the tail of the longer list, so a
# length mismatch is reported instead of producing a truncated mapping.
if len(fact_fields) != len(submission_fields):
    raise ValueError(
        "fact_fields and submission_fields must have the same length "
        f"(got {len(fact_fields)} and {len(submission_fields)})"
    )

mapping_list = [
    {"from": submission, "to": fact}
    for fact, submission in zip(fact_fields, submission_fields)
]

print(mapping_list)

[{'from': 'username', 'to': 'uid'}, {'from': 'fin_year', 'to': 'fin_year'}, {'from': 'RecType', 'to': 'record_type'}, {'from': 'State', 'to': 'State'}, {'from': 'StateName', 'to': 'state_name'}, {'from': 'NHousePlaces', 'to': 'housing_places'}, {'from': 'NgoOtherDepts', 'to': 'other_depts'}, {'from': 'RevDVA', 'to': 'rev_dva'}, {'from': 'RevRecov', 'to': 'rev_recoveries'}, {'from': 'RevStateHealth', 'to': 'rev_state_health'}, {'from': 'RevCwlthOther', 'to': 'rev_cwlth_other'}, {'from': 'RevPatients', 'to': 'rev_patients'}, {'from': 'RevOther', 'to': 'rev_other'}, {'from': 'RevStateOther', 'to': 'rev_state_other'}, {'from': 'ExpNerAcademic', 'to': 'res_exp_academic'}, {'from': 'ExpNerTraining', 'to': 'res_exp_training'}, {'from': 'ExpNerInsur', 'to': 'res_exp_insurance'}, {'from': 'ExpNerMHAct', 'to': 'res_exp_mh_act'}, {'from': 'ExpNerPromo', 'to': 'res_exp_promotion'}, {'from': 'ExpNerResearch', 'to': 'res_exp_research'}, {'from': 'ExpNerTransp', 'to': 'res_exp_transport'}, {'from': '

In [None]:
# Render the positional mapping as JSON with two-space indentation and print it.
mapping_json = json.dumps(mapping_list, indent=2)
print(mapping_json)

In [46]:
# Column widths for the aligned listing: the longest name on each side.
# default=0 keeps the cell runnable when mapping_list is empty.
max_from_length = max((len(item["from"]) for item in mapping_list), default=0)
max_to_length   = max((len(item["to"])   for item in mapping_list), default=0)

# Print one `,{ "from": ..., "to": ... }` row per mapping entry, padding each
# name so the "to" keys and the closing braces line up. The "to" column is
# padded to its own maximum width; deriving it from the "from" width plus a
# constant only lined up by coincidence for the current field names.
for item in mapping_list:
    from_pad = " " * (max_from_length - len(item["from"]))
    to_pad = " " * (max_to_length - len(item["to"]))
    print(f',{{ "from": "{item["from"]}", {from_pad} "to": "{item["to"]}" {to_pad} }}')

,{ "from": "username",             "to": "uid"                     }
,{ "from": "fin_year",             "to": "fin_year"                }
,{ "from": "RecType",              "to": "record_type"             }
,{ "from": "State",                "to": "State"                   }
,{ "from": "StateName",            "to": "state_name"              }
,{ "from": "NHousePlaces",         "to": "housing_places"          }
,{ "from": "NgoOtherDepts",        "to": "other_depts"             }
,{ "from": "RevDVA",               "to": "rev_dva"                 }
,{ "from": "RevRecov",             "to": "rev_recoveries"          }
,{ "from": "RevStateHealth",       "to": "rev_state_health"        }
,{ "from": "RevCwlthOther",        "to": "rev_cwlth_other"         }
,{ "from": "RevPatients",          "to": "rev_patients"            }
,{ "from": "RevOther",             "to": "rev_other"               }
,{ "from": "RevStateOther",        "to": "rev_state_other"         }
,{ "from": "ExpNerAcademic",      