In [2]:
import asci_aap_data_extractor
import edges_kumu_extractor
import sys
import os
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)
import SQLConnect

# Short alias for the UnmergedV1 helpers in SQLConnect; later cells call
# Unmerged.people(...), Unmerged.org(...), etc. to build their queries.
Unmerged = SQLConnect.UnmergedV1

# Load both input datasets. The call order is kept (ASCI/AAP first, then the
# Kumu edges) because each extractor reports its progress as it runs.
asci_aap_data = asci_aap_data_extractor.extract()
edges = edges_kumu_extractor.extract()


Extracting from asci_aap_data.csv
Found column names

Extracting from edges_kumu.csv
Found column names
{'Parent Institution-Child Institution', 'Specialization-Associate', 'Year-Associate', 'Parent Specialization-Child Specialization', 'Institution-Associate'}


In [None]:
# The data to be uploaded
# Every container starts empty; the extraction cells below fill them in.

# Primary entities (one tuple per row to insert).
people, org, bioentity = set(), set(), set()

# Associate name -> list of entries collected from the edge file.
people_org_name_dic = dict()

# Name-based relations, converted to id-based relations further down.
people_spec_name, bio_relation_name, org_relation_name = set(), set(), set()

In [None]:
# Extracting data from edges_kumu.csv
#
# Each edge record is routed by its "type" field:
#   * Parent Specialization-Child Specialization: both specializations are
#     registered as bioentities and linked in both directions.
#   * Specialization-Associate: the specialization (entity1) is registered as
#     a bioentity and paired with the associate (entity2).
#   * Parent Institution-Child Institution: the two institutions are linked in
#     both directions.
#   * Institution-Associate: the institution (entity1) is appended to the
#     associate's entry list.
#   * Year-Associate: int(entity1) is prepended to the associate's entry list,
#     so the year sits in front of the institution names.
# Any other type raises ValueError instead of being silently treated as a year.
for edge in edges:
    edge_type = edge["type"]
    first, second = edge["entity1"], edge["entity2"]

    if edge_type == "Parent Specialization-Child Specialization":
        bioentity.update({("ASCI+AAP", first), ("ASCI+AAP", second)})
        bio_relation_name.update({(first, second, "parent"), (second, first, "child")})
    elif edge_type == "Specialization-Associate":
        bioentity.add(("ASCI+AAP", first))
        people_spec_name.add((first, second))
    elif edge_type == "Parent Institution-Child Institution":
        org_relation_name.update({(first, second, "parent"), (second, first, "child")})
    elif edge_type == "Institution-Associate":
        people_org_name_dic.setdefault(second, []).append(first)
    elif edge_type == "Year-Associate":
        # Build a fresh list with the year in front of whatever is already stored.
        people_org_name_dic[second] = [int(first)] + people_org_name_dic.get(second, [])
    else:
        raise ValueError("Unexpected edge type {!r} in edges_kumu.csv".format(edge_type))

In [None]:
# Extracting data from asci_aap_data.csv
# Each record contributes one person tuple plus one organization tuple per
# name found in its affiliation-style fields.
for row in asci_aap_data:
    people.add((
        "ASCI+AAP",
        row["email"],
        row["phone"],
        None,
        row["first_name"],
        row["middle_name"],
        row["last_name"],
        None,
    ))
    # The three fields are concatenated with "+" and iterated element by
    # element -- presumably lists of names; verify against the extractor.
    org_names = row["affiliation"] + row["umbrella_spec"] + row["related_aff"]
    org.update(("ASCI+AAP", org_name, None) for org_name in org_names)

In [None]:
# Converting people affiliation data to a set type
#
# Flattens people_org_name_dic into (person, affiliation, year) tuples.
# Integer entries in a person's list are years and string entries are
# affiliation names. The two are separated by type, not by position, so that a
# person with more than one year entry never has a year emitted as if it were
# an affiliation name. The front-most year is used, matching the previous
# behaviour when exactly one year is present; year is None when there is none.
people_org_name_set = set()
for person, entries in people_org_name_dic.items():
    years = [entry for entry in entries if isinstance(entry, int)]
    affiliations = [entry for entry in entries if not isinstance(entry, int)]
    year = years[0] if years else None
    for affiliation in affiliations:
        people_org_name_set.add((person, affiliation, year))

In [None]:
# Uploading people, organization, and bioentity data
# One query is built per collected tuple; all are submitted together as INSERTs.
people_queries = list(map(Unmerged.people, people))
org_queries = list(map(Unmerged.org, org))
bioentity_queries = list(map(Unmerged.bioentity, bioentity))

primary_queries = [*people_queries, *org_queries, *bioentity_queries]
query_types = ["INSERT"] * len(primary_queries)

# NOTE(review): these rows are written to database "Onboarding", while the
# primary-id look-ups later in this notebook read from "UnmergedV1" -- confirm
# the write target is intentional.
SQLConnect.connect_and_query(
    queries=primary_queries,
    types=query_types,
    database="Onboarding",
)

In [None]:
# Finding the Primary IDs for the people, organizations, and bioentities
# Each look-up is sent as its own single-statement SELECT against UnmergedV1.
id_lookup_database = "UnmergedV1"
people_id_sql = "SELECT people_id, first_name, middle_name, last_name FROM People;"
org_id_sql = "SELECT org_id, name FROM Org;"
bio_id_sql = "SELECT bio_id, name FROM Bioentity;"

people_id_raw = SQLConnect.connect_and_query(
    queries=[people_id_sql], types=["SELECT"], database=id_lookup_database
)
org_id_raw = SQLConnect.connect_and_query(
    queries=[org_id_sql], types=["SELECT"], database=id_lookup_database
)
bio_id_raw = SQLConnect.connect_and_query(
    queries=[bio_id_sql], types=["SELECT"], database=id_lookup_database
)

In [None]:
# Clean primary id data
# Turn element [0] of each raw SELECT result into a name -> primary id table.

# People are keyed as "Last, First", with " (Middle)" appended when the
# middle name is truthy.
people_id_clean = {}
for person_id, first_name, middle_name, last_name in people_id_raw[0]:
    display_name = last_name + ", " + first_name
    if middle_name:
        display_name += " (" + middle_name + ")"
    people_id_clean[display_name] = person_id

# Organizations and bioentities are keyed directly by their name column; a
# repeated name keeps the id of the last row seen.
org_id_clean = {org_name: org_id for org_id, org_name in org_id_raw[0]}
bio_id_clean = {bio_name: bio_id for bio_id, bio_name in bio_id_raw[0]}

In [None]:
# Convert people affiliation, org relation, bio relation, and people spec into id types
# Names are swapped for primary ids via the *_id_clean tables. A record whose
# person is missing from people_id_clean is set aside in manual_upload.
people_org_id, org_relation_id = set(), set()
bio_relation_id, people_spec_id = set(), set()
manual_upload = []

# (person name, organization name, year) -> (people id, org id, year)
for entry in people_org_name_set:
    person_name, org_name, year = entry
    if person_name not in people_id_clean:
        manual_upload.append(entry)
        continue
    people_org_id.add((people_id_clean[person_name], org_id_clean[org_name], year))

# (organization name, organization name, relation) -> (org id, org id, relation)
for source_org, target_org, relation in org_relation_name:
    org_relation_id.add((org_id_clean[source_org], org_id_clean[target_org], relation))

# (bioentity name, bioentity name, relation) -> (bio id, bio id, relation)
for source_bio, target_bio, relation in bio_relation_name:
    bio_relation_id.add((bio_id_clean[source_bio], bio_id_clean[target_bio], relation))

# (specialization name, person name) -> (people id, bio id)
for entry in people_spec_name:
    spec_name, person_name = entry
    if person_name not in people_id_clean:
        manual_upload.append(entry)
        continue
    people_spec_id.add((people_id_clean[person_name], bio_id_clean[spec_name]))

print(manual_upload, "Unable to be uploaded, person not found")
print("After manual search, this person's id is 1846")
# Hand-entered affiliation for the person looked up manually (id 1846).
davis_org_id = org_id_clean['University of California, Davis, School of Medicine']
people_org_id.add((1846, davis_org_id, 2020))

In [None]:
# Upload relational data
# Build one query per id-based relation tuple. The submission call itself is
# left commented out, so running this cell only prepares the query lists.
people_org_queries = list(map(Unmerged.people_org, people_org_id))
org_relation_queries = list(map(Unmerged.org_relation, org_relation_id))
bio_relation_queries = list(map(Unmerged.bio_relation, bio_relation_id))
people_spec_queries = list(map(Unmerged.people_spec, people_spec_id))

secondary_queries = [
    *people_org_queries,
    *org_relation_queries,
    *bio_relation_queries,
    *people_spec_queries,
]
query_types = ["INSERT"] * len(secondary_queries)
# SQLConnect.connect_and_query(queries=secondary_queries, types=query_types, database="UnmergedV1")