# 02 - Generating Dataset 1

The purpose of this notebook is to explore the dataset created by [02_generate_dataset.ipynb](02_generate_dataset.ipynb)

In [1]:
# Reload functions every time
%load_ext autoreload 
%autoreload 2

In [2]:
import json
import os
import sys

# This will add the src directory to sys.path
# meaning that the privacy_fingerprint will be found
# note it assumes the current working directory is the folder containing this notebook
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir))+'/src')

In [3]:
# The Synthea output was saved to a directory
output_dir = "../experiments/02_generate_dataset"
os.makedirs(output_dir, exist_ok=True)
export_directory = os.path.join(output_dir, "synthea")

## Load the data and explore

The Synthea dataset only saves records of the type specified in expt_config.synthea.encounter_type at the time the experiment was run. Not that useful here

In [4]:

with open(os.path.join(output_dir, "synthea_dataset.json")) as fp:
    synthea_records = json.load(fp)

len(synthea_records)

44

In [5]:
import pandas as pd
enc = pd.read_csv('../experiments/02_generate_dataset/synthea/csv/encounters.csv')
pat = pd.read_csv('../experiments/02_generate_dataset/synthea/csv/patients.csv')
cond = pd.read_csv('../experiments/02_generate_dataset/synthea/csv/conditions.csv')
med = pd.read_csv('../experiments/02_generate_dataset/synthea/csv/medications.csv')
proc = pd.read_csv('../experiments/02_generate_dataset/synthea/csv/procedures.csv')

In [6]:
print("Number of unique patients with hospital inpatient")
print(len(enc[enc.ENCOUNTERCLASS == 'inpatient']['PATIENT'].unique()))
enc[enc.ENCOUNTERCLASS == 'inpatient']['DESCRIPTION'].value_counts()

Number of unique patients with hospital inpatient
41


DESCRIPTION
Drug rehabilitation and detoxification                 41
Admission to intensive care unit (procedure)           13
Hospital admission (procedure)                          9
Encounter for problem (procedure)                       6
Admission to ward (procedure)                           5
Admission to surgical department                        3
Hospital admission for isolation (procedure)            3
Patient transfer to intensive care unit (procedure)     2
Admission to thoracic surgery department                2
Non-urgent orthopedic admission                         2
Follow-up visit (procedure)                             2
Encounter for problem                                   2
Encounter Inpatient                                     1
Hospital admission  for observation (procedure)         1
Name: count, dtype: int64

In [7]:
files_to_process = []
unique_pats_with_inpatient_visit = enc[enc.ENCOUNTERCLASS == 'inpatient']['PATIENT'].unique()
print(len(unique_pats_with_inpatient_visit))

for patient_id in unique_pats_with_inpatient_visit:
    for file_name in os.listdir('../experiments/02_generate_dataset/synthea/fhir'):
        if patient_id in file_name:
            files_to_process.append(file_name)

            



41


In [8]:
# Convert START and STOP to datetime
enc['START'] = pd.to_datetime(enc['START'])
enc['STOP'] = pd.to_datetime(enc['STOP'])

# Calculate time difference
enc['DURATION'] = enc['STOP'] - enc['START']

long_visits = enc.sort_values('DURATION', ascending=False).head(20)['PATIENT'].unique()

files_to_process = []

for patient_id in long_visits:
    for file_name in os.listdir('../experiments/02_generate_dataset/synthea/fhir'):
        if patient_id in file_name:
            files_to_process.append(file_name)

print(len(files_to_process))


18


In [35]:
# Could also include these

# files = os.listdir('../experiments/02_generate_dataset_inpatients/synthea/fhir/')
# print(len(files))
# import pandas as pd
# enc = pd.read_csv('../experiments/02_generate_dataset_inpatients/synthea/csv/encounters.csv')
# enc.DESCRIPTION.value_counts().tail(20)


112


DESCRIPTION
Asthma follow-up                                       27
Follow-up encounter (procedure)                        24
Emergency Room Admission                               20
Emergency Encounter                                    16
Encounter for symptom (procedure)                      14
Death Certification                                    12
Admission to skilled nursing facility (procedure)      12
Admission to ward (procedure)                          10
Emergency hospital admission for asthma                10
Admission to hospice (procedure)                       10
Hospital admission (procedure)                          8
Patient-initiated encounter                             7
Patient transfer to intensive care unit (procedure)     6
Encounter Inpatient                                     5
Hospital admission for isolation (procedure)            2
Allergic disorder initial assessment                    2
Non-urgent orthopedic admission                         1
Al

In [36]:
os.listdir(output_dir)

['.DS_Store',
 'synthea_dataset.json',
 'synthea_dataset_inpatient.json',
 'synthea']

In [9]:
for filename in files_to_process[0:1]:
    print(filename)

Margeret29_Hauck852_811ef822-fab7-3f91-35dd-106227ca25c1.json


In [10]:
patient_dict = {}

for filename in files_to_process:

    encounter_dict = {}
    other_resources = {'Condition': [], 'Procedure': [], 'Observation': [], 'Medication': []}

    with open(output_dir + '/synthea/fhir/' + filename) as f:
        data = json.load(f)

    print(filename)
    records_dict = {}

    for entry in data['entry']:
        full_url = entry['fullUrl']
        resource = entry['resource']
        records_dict[full_url] = resource

    for id, resource in records_dict.items():

        if resource['resourceType'] == 'Patient':
            patient_id = id

        elif resource['resourceType'] == 'Encounter':
            encounter_dict[id] = resource
            encounter_dict[id]['related_resources'] = {'Condition': [], 'Procedure': [], 'Observation': [], 'Medication': []}
            encounter_dict[id]['encounter_type'] = resource['type'][0]['text']
        
        else: 
            # save ids of other resources types, to iterate through later
            resource_type = resource['resourceType']
            if resource_type in other_resources:
                other_resources[resource_type].append(id)


    for resource_type, ids in other_resources.items():
        for other_ in ids:
            related_encounter = records_dict[other_]['encounter']['reference']
            if other_ not in encounter_dict[related_encounter]['related_resources'][resource_type]:
                encounter_dict[related_encounter]['related_resources'][resource_type].append(records_dict[other_])



    patient_dict[patient_id] = encounter_dict


Margeret29_Hauck852_811ef822-fab7-3f91-35dd-106227ca25c1.json
Linn541_Elly836_Kunze215_eb8e5426-96a8-b166-8db2-6fb3d39740c7.json
Porsche32_Sherlyn657_Lesch175_89031c6f-9efc-31a5-5aa8-3d658d47a080.json
Nida800_Zoe32_Kshlerin58_269a888e-f74a-9fcd-0287-3127ac54e1d7.json
Shizuko29_Morar593_aa6ebc6d-72ae-e38c-9f8b-2e0c6af1b6f8.json
Parker433_Ullrich385_0c338433-4b52-8489-5f8e-1683ee2d1bb0.json
Vonnie403_Streich926_f83ba5f7-5e69-3b27-4b62-b5630c59db34.json
Verda448_Balistreri607_6ac2bf6b-6a8c-51e9-a2dd-30eb780264c5.json
Noelia656_Dee580_Rempel203_826ab6b8-9b5e-736e-5271-bd33861cc870.json
Dennis979_Johnna810_Hackett68_4a72cfc5-8fff-d8bb-8f1f-35afd55f3a7e.json
Alva958_Kautzer186_bbbb5bdf-8479-fc9b-48fa-19bf05e7ec85.json
Rigoberto443_Schiller186_0a5b0da7-e9fd-e875-469f-d9cd14eaf993.json
Carolynn568_Genoveva361_Farrell962_5d47bff4-b399-e39a-7564-6e98135929b9.json
Erik495_Corwin846_90ac42bc-cd64-0c37-600c-6484a0a0d01f.json
Lela622_Tressa150_Prosacco716_d98e862f-e3b0-9a56-25df-f1843cea8f8d.json
Re

In [98]:
## Note - I started trying to parse the the data and then thought there was surely a library! However, I have not succeeded with fhir.resources

from datetime import datetime, timedelta

extracted_data = {
    "Encounters": [],
    "Conditions": [],
    "Procedures": []
}

long_visit_dict = {}


for pat_id, encounter_dict in patient_dict.items():

    for id, resource in encounter_dict.items(): 
        # encounter_dict[list(encounter_dict.keys())[0]].keys()#['related_resources']
        # if 'admission' in resource['encounter_type']:
            # print(value['type'][0]['text'])
            # print(resource.keys())

        text_dict = {"Patient": {}, "Encounter": {}, "Condition": {}, "Procedure": {}, "Observation": {}}
        condition_counter, procedure_counter, observation_counter = 0, 0, 0

        text_dict['Patient'] = pat_id

        text_dict['Encounter'] = {
            "Encounter id": id,
            "Encounter Started": datetime.fromisoformat(resource['period']['start']).strftime('%Y-%m-%d %H:%M:%S'),
            "Encounter Ended": datetime.fromisoformat(resource['period']['end']).strftime('%Y-%m-%d %H:%M:%S'),
            "Encounter Duration": datetime.fromisoformat(resource['period']['end']) - 
                datetime.fromisoformat(resource['period']['start']),
            "Hospital Staff": ', '.join([p['individual']['display'] for p in resource['participant']]),
            "Type of admission": resource['type'][0]['text']
        }

        for key, linked_resource in resource['related_resources'].items():
            if linked_resource:
                for item in linked_resource:
                    if item['resourceType'] == 'Condition':
                        condition_counter += 1
                        text_dict["Condition"][condition_counter] = {
                            "Text": item['code']['text'],
                            "First Recorded": item['onsetDateTime'],
                            "Resolved": item.get('abatementDateTime', None)
                        }

                    if item['resourceType'] == 'Procedure':
                        procedure_counter += 1
                        text_dict["Procedure"][procedure_counter] = {
                            "Text": item['code']['text'],
                            "Started": item['performedPeriod']['start'],
                            "Ended": item['performedPeriod']['end']
                        }
                        
                    if item['resourceType'] == 'Medication':
                        print(item)
                        # text_dict["Procedure"] = item['code']['text']
                        # text_dict["Procedure Started"] = item['performedPeriod']['start']
                        # text_dict["Procedure Ended"] = item['performedPeriod']['end']

                    if item['resourceType'] == 'Observation':
                        observation_counter += 1

                        if 'valueQuantity' in item.keys():
                            text_dict["Observation"][observation_counter] = {
                                "Text": item['code']['text'],
                                "Recorded": item['effectiveDateTime'],
                                "Value": item['valueQuantity']['value'],
                                "Units": item['valueQuantity']['unit']
                            } 
                        else:
                            text_dict["Observation"][observation_counter] = {
                                "Text": item['code']['text'],
                                "Recorded": item['effectiveDateTime']
                        }

        extracted_data["Encounters"].append(text_dict)

        if text_dict["Encounter"]["Encounter Duration"] > timedelta(days=1):
            long_visit_dict[text_dict['Patient']] = text_dict




# Processing the data with the new format


for id, text_dict in long_visit_dict.items():
    print(text_dict["Encounter"]["Encounter Duration"])



36 days, 0:15:00
27 days, 18:41:24
29 days, 0:00:00
27 days, 0:15:00
26 days, 0:15:00
25 days, 0:00:00
6 days, 0:00:00
5 days, 7:34:46
5 days, 0:00:00
19 days, 0:59:38
19 days, 0:15:00
19 days, 0:00:00
1 day, 0:55:13
6 days, 0:00:00
2 days, 13:00:00
9 days, 18:07:30
1 day, 0:41:48
15 days, 0:15:00


In [102]:
long_visit_dict[ex]

{'Patient': 'urn:uuid:6ac2bf6b-6a8c-51e9-a2dd-30eb780264c5',
 'Encounter': {'Encounter id': 'urn:uuid:4d563c2d-50f5-a4d8-800d-d9566445e53c',
  'Encounter Started': '2021-01-07 04:51:40',
  'Encounter Ended': '2021-01-12 12:26:26',
  'Encounter Duration': 459286.0,
  'Hospital Staff': 'Dr. Madelaine318 Walker122',
  'Type of admission': 'Hospital admission for isolation (procedure)'},
 'Condition': {1: {'Text': 'Pneumonia (disorder)',
   'First Recorded': '2021-01-07T05:34:26+00:00',
   'Resolved': None},
  2: {'Text': 'Hypoxemia (disorder)',
   'First Recorded': '2021-01-07T05:34:26+00:00',
   'Resolved': None},
  3: {'Text': 'Respiratory distress (finding)',
   'First Recorded': '2021-01-07T05:34:26+00:00',
   'Resolved': None},
  4: {'Text': 'Acute respiratory failure (disorder)',
   'First Recorded': '2021-01-07T05:34:26+00:00',
   'Resolved': None},
  5: {'Text': 'Sepsis caused by virus (disorder)',
   'First Recorded': '2021-01-07T05:34:26+00:00',
   'Resolved': None}},
 'Procedur

In [100]:
long_visit_dict_converted = long_visit_dict.copy()
for id, text_dict in long_visit_dict_converted.items():
    # if datetime.fromisoformat(text_dict["Encounter"]["Encounter Started"]) - datetime.fromisoformat(text_dict["Encounter"]["Encounter Ended"]) > timedelta(days = 1):
    text_dict["Encounter"]["Encounter Duration"] = text_dict["Encounter"]["Encounter Duration"].total_seconds()
    long_visit_dict_converted[id] = text_dict


with open(output_dir + '/long_visit_dict.json', 'w') as outfile:
    json.dump(long_visit_dict_converted, outfile)


long_visit_dict_converted.keys()

dict_keys(['urn:uuid:811ef822-fab7-3f91-35dd-106227ca25c1', 'urn:uuid:eb8e5426-96a8-b166-8db2-6fb3d39740c7', 'urn:uuid:89031c6f-9efc-31a5-5aa8-3d658d47a080', 'urn:uuid:269a888e-f74a-9fcd-0287-3127ac54e1d7', 'urn:uuid:aa6ebc6d-72ae-e38c-9f8b-2e0c6af1b6f8', 'urn:uuid:0c338433-4b52-8489-5f8e-1683ee2d1bb0', 'urn:uuid:f83ba5f7-5e69-3b27-4b62-b5630c59db34', 'urn:uuid:6ac2bf6b-6a8c-51e9-a2dd-30eb780264c5', 'urn:uuid:826ab6b8-9b5e-736e-5271-bd33861cc870', 'urn:uuid:4a72cfc5-8fff-d8bb-8f1f-35afd55f3a7e', 'urn:uuid:bbbb5bdf-8479-fc9b-48fa-19bf05e7ec85', 'urn:uuid:0a5b0da7-e9fd-e875-469f-d9cd14eaf993', 'urn:uuid:5d47bff4-b399-e39a-7564-6e98135929b9', 'urn:uuid:90ac42bc-cd64-0c37-600c-6484a0a0d01f', 'urn:uuid:d98e862f-e3b0-9a56-25df-f1843cea8f8d', 'urn:uuid:87495c35-77bb-9426-67d9-db97fe9c036c', 'urn:uuid:307cac3e-ffdd-b0bb-bf94-979f13d748e3', 'urn:uuid:43dfe830-5af7-2baf-a37d-a48d0161d1e2'])

In [101]:
ex = "urn:uuid:6ac2bf6b-6a8c-51e9-a2dd-30eb780264c5"
encounter_dict = patient_dict[ex]["urn:uuid:4d563c2d-50f5-a4d8-800d-d9566445e53c"]
encounter_dict['related_resources']['Observation']

# print(long_visit_dict[ex].keys())#['Patient'])
# print(long_visit_dict[ex]['Encounter'])
# long_visit_dict[ex]['Observation']



[{'resourceType': 'Observation',
  'id': '5bc57472-bfd7-3e02-9247-69574e633820',
  'status': 'final',
  'category': [{'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/observation-category',
      'code': 'vital-signs',
      'display': 'vital-signs'}]}],
  'code': {'coding': [{'system': 'http://loinc.org',
     'code': '8310-5',
     'display': 'Body temperature'},
    {'system': 'http://loinc.org',
     'code': '8331-1',
     'display': 'Oral temperature'}],
   'text': 'Body temperature'},
  'subject': {'reference': 'urn:uuid:6ac2bf6b-6a8c-51e9-a2dd-30eb780264c5'},
  'encounter': {'reference': 'urn:uuid:4d563c2d-50f5-a4d8-800d-d9566445e53c'},
  'effectiveDateTime': '2021-01-07T05:34:26+00:00',
  'issued': '2021-01-07T05:34:26.953+00:00',
  'valueQuantity': {'value': 41.724,
   'unit': 'Cel',
   'system': 'http://unitsofmeasure.org',
   'code': 'Cel'}},
 {'resourceType': 'Observation',
  'id': '348d191f-e41b-dd39-e287-ae06dca9a130',
  'status': 'final',
  'category': [{'cod

In [166]:
# import json

# def extract_fhir_data(fhir_json):
#     # Initialize the result dictionary
#     extracted_data = {
#         "Encounters": [],
#         "Conditions": [],
#         "Procedures": []
#     }

#     # Extract Encounter information
#     if "Encounter" in fhir_json:
#         encounter = fhir_json['Encounter']
#         extracted_data["Encounters"].append({
#             "Date": encounter['period']['start'][:10],
#             "Duration": f"{encounter['period']['start'][11:]} - {encounter['period']['end'][11:]}",
#             "Provider": encounter.get('serviceProvider', {}).get('display', 'Unknown'),
#             "Participant": ', '.join([p['individual']['display'] for p in encounter['participant']]),
#             "Type": encounter['type'][0]['text']
#         })

#     # Extract Condition information
#     for condition in fhir_json.get('Condition', []):
#         extracted_data["Conditions"].append({
#             "Status": condition['clinicalStatus']['coding'][0]['display'],
#             "Onset": condition['onsetDateTime'][:10],
#             "Resolution": condition['abatementDateTime'][:10]
#         })

#     # Extract Procedure information
#     for procedure in fhir_json.get('Procedure', []):
#         extracted_data["Procedures"].append({
#             "Name": procedure['code']['text'],
#             "Date": procedure['performedPeriod']['start'][:10],
#             "Duration": f"{procedure['performedPeriod']['start'][11:]} - {procedure['performedPeriod']['end'][11:]}"
#         })

#     return extracted_data

# # Example usage:
# fhir_json = [Your FHIR JSON Data]
# result = extract_fhir_data(fhir_json)
# print(json.dumps(result, indent=4))


{'resourceType': 'Condition',
 'id': 'addc2769-4509-7be4-d48a-d733105a0b50',
 'clinicalStatus': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/condition-clinical',
    'code': 'active'}]},
 'verificationStatus': {'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/condition-ver-status',
    'code': 'confirmed'}]},
 'code': {'coding': [{'system': 'http://snomed.info/sct',
    'code': '428251008',
    'display': 'History of appendectomy'}],
  'text': 'History of appendectomy'},
 'subject': {'reference': 'urn:uuid:cf5c6029-50c2-e2a6-9f1d-529426345c66'},
 'encounter': {'reference': 'urn:uuid:44d6a587-12e2-aa9f-5c06-268fbdaddd6d'},
 'onsetDateTime': '1993-11-15T08:35:02+00:00',
 'recordedDate': '1993-11-15T08:35:02+00:00'}

In [192]:
[index for index, file in enumerate(files) if 'Kling' in files]

files[0]




'Parthenia862_Cole117_de4da970-92c4-0296-0b6c-7d960667d075.json'

In [81]:
related_encounter_types = []
for obs_instance in observations:
    related_encounter = obs_instance['encounter']['reference'].replace("urn:uuid:", "")
    # print(encounters_dict[related_encounter]['type'][0]['text'])
    related_encounter_types.append(encounters_dict[related_encounter]['type'][0]['text'])

print("Types of encounters which have obs")
print(Counter(related_encounter_types))

related_encounter_types = []
for proc_instance in procedures:
    related_encounter = proc_instance['encounter']['reference'].replace("urn:uuid:", "")
    # print(encounters_dict[related_encounter]['type'][0]['text'])
    related_encounter_types.append(encounters_dict[related_encounter]['type'][0]['text'])

print("Types of encounters which have procedures")
Counter(related_encounter_types)

Types of encounters which have obs
Counter({'Encounter for check up (procedure)': 81, 'Encounter for symptom': 73, 'General examination of patient (procedure)': 73, 'Patient encounter procedure': 35, 'Encounter for problem (procedure)': 4, 'Drug rehabilitation and detoxification': 2})
Types of encounters which have procedures


Counter({'Patient encounter procedure': 44,
         'Encounter for problem (procedure)': 38,
         'Encounter for symptom': 21,
         'General examination of patient (procedure)': 9,
         'Admission to skilled nursing facility (procedure)': 8,
         'Encounter for check up (procedure)': 7,
         'Non-urgent orthopedic admission': 2,
         'Emergency room admission (procedure)': 1})

In [36]:
print(encounters[0].keys()) # hospitalization is not in
for _ in encounters:
    if 'hospitalization' in _.keys():
        print(_)

dict_keys(['resourceType', 'id', 'status', 'class', 'type', 'subject', 'participant', 'period', 'reasonCode', 'serviceProvider'])
{'resourceType': 'Encounter', 'id': '44d6a587-12e2-aa9f-5c06-268fbdaddd6d', 'status': 'finished', 'class': {'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'code': 'IMP'}, 'type': [{'coding': [{'system': 'http://snomed.info/sct', 'code': '183452005', 'display': 'Encounter Inpatient'}], 'text': 'Encounter Inpatient'}], 'subject': {'reference': 'urn:uuid:cf5c6029-50c2-e2a6-9f1d-529426345c66', 'display': 'Ms. Simone657 Joey457 Kling921'}, 'participant': [{'type': [{'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/v3-ParticipationType', 'code': 'PPRF', 'display': 'primary performer'}], 'text': 'primary performer'}], 'period': {'start': '1993-11-15T08:35:02+00:00', 'end': '1993-11-20T06:16:30+00:00'}, 'individual': {'reference': 'Practitioner?identifier=http://hl7.org/fhir/sid/us-npi|9999997692', 'display': 'Dr. Rod343 Frami345'}}], 'per

In [38]:
inpat_visits

# Initialize lists to hold labs (Observations) and diagnostic reports
related_observations = []  # This will include lab results
related_diagnostic_reports = []

for entry in data['entry']:
    resource = entry['resource']
    resource_type = resource['resourceType']

    # Check for Observation or DiagnosticReport
    if resource_type in ['Observation', 'DiagnosticReport']:
        # Check if they are related to the inpatient encounters
        if 'encounter' in resource and resource['encounter']['reference'] in inpatient_ids:
            if resource_type == 'Observation':
                related_observations.append(resource)  # This includes lab results
            elif resource_type == 'DiagnosticReport':
                related_diagnostic_reports.append(resource)

[{'resourceType': 'Encounter',
  'id': '44d6a587-12e2-aa9f-5c06-268fbdaddd6d',
  'status': 'finished',
  'class': {'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode',
   'code': 'IMP'},
  'type': [{'coding': [{'system': 'http://snomed.info/sct',
      'code': '183452005',
      'display': 'Encounter Inpatient'}],
    'text': 'Encounter Inpatient'}],
  'subject': {'reference': 'urn:uuid:cf5c6029-50c2-e2a6-9f1d-529426345c66',
   'display': 'Ms. Simone657 Joey457 Kling921'},
  'participant': [{'type': [{'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/v3-ParticipationType',
        'code': 'PPRF',
        'display': 'primary performer'}],
      'text': 'primary performer'}],
    'period': {'start': '1993-11-15T08:35:02+00:00',
     'end': '1993-11-20T06:16:30+00:00'},
    'individual': {'reference': 'Practitioner?identifier=http://hl7.org/fhir/sid/us-npi|9999997692',
     'display': 'Dr. Rod343 Frami345'}}],
  'period': {'start': '1993-11-15T08:35:02+00:00',
   'end'

In [20]:

# Assuming inpat_visits contains your filtered inpatient visits
if inpat_visits:
    # Write all inpatient encounters to a new JSON file
    with open('../experiments/02_generate_dataset_for_tom/synthea/fhir_examples/Angelo118_Franecki195_2.json', 'w') as outfile:
        json.dump(inpat_visits, outfile)

## Explore relationships in Synthea

https://learn.microsoft.com/en-us/fabric/data-science/tutorial-relationships-detection

This requires Azure

In [44]:
# import sempy
# from sempy.relationships import (
#     find_relationships,
#     list_relationship_violations,
#     plot_relationship_metadata
# )

## Load background tables 

The structured notes from Synthea can then be converted to free-text clinical notes.

In [46]:
import pandas as pd
enc = pd.read_csv('../experiments/02_generate_dataset/synthea/csv/encounters.csv')
pat = pd.read_csv('../experiments/02_generate_dataset/synthea/csv/patients.csv')
cond = pd.read_csv('../experiments/02_generate_dataset/synthea/csv/conditions.csv')
med = pd.read_csv('../experiments/02_generate_dataset/synthea/csv/medications.csv')
proc = pd.read_csv('../experiments/02_generate_dataset/synthea/csv/procedures.csv')

In [23]:
inp = enc[enc.ENCOUNTERCLASS == 'inpatient']
# inp.head(2).T
    # enc.ENCOUNTERCLASS.value_counts()
# len(inp.PATIENT.unique())#.size

NameError: name 'enc' is not defined

In [24]:
ex_pat = '5cb0dac2-b3e6-3018-ea64-814bf7bcd780'



In [127]:
inp[inp.PATIENT == ex_pat]

cond[cond.PATIENT == ex_pat]
med[med.PATIENT == ex_pat]

proc[(proc.ENCOUNTER.isin(inp.Id.values)) & (proc.PATIENT == ex_pat)]

Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
808,cb338fc9-c430-0671-e0c4-e4d805815cc6,2020-11-01T19:55:43Z,2020-11-10T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,9e0eba2f-7955-3835-9ef0-cc75aa3fdad2,5fc8e82e-9d1c-3b15-9281-2a58e6ce33ae,329794ac-8260-3252-90dd-cc5284fe15b9,inpatient,1505002,Hospital admission for isolation (procedure),125.0,22281.21,22281.21,840539006.0,COVID-19
826,19f506f5-920f-89f4-7040-cc1e2d2be8f3,2021-01-23T19:43:01Z,2021-01-24T19:43:01Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,9e0eba2f-7955-3835-9ef0-cc75aa3fdad2,5fc8e82e-9d1c-3b15-9281-2a58e6ce33ae,329794ac-8260-3252-90dd-cc5284fe15b9,inpatient,305408004,Admission to surgical department,125.0,9163.8,9163.8,,


In [31]:
import privacy_fingerprint.generate.language_model as llm

from privacy_fingerprint.common.config import (
    load_experiment_config,
    load_experiment_config_from_file,
    load_global_config_from_file,
)
# config = load_experiment_config_from_file()
config = load_experiment_config_from_file("../configs/experiment_config.yaml")

config.openai.model

'text-davinci-003'

In [34]:
config = load_global_config_from_file("../configs/global_config.yaml")


FileNotFoundError: [Errno 2] No such file or directory: '../configs/global_config.yaml'

In [33]:
clinical_note_generator = llm.LMGenerator()
llm_results = list(clinical_note_generator.generate_text(inpat_visits))

with open('../experiments/02_generate_dataset_for_tom/synthea/fhir_examples/Angelo118_Franecki195_note.json', 'w') as fp:
    json.dump(llm_results, fp)

AttributeError: 'NoneType' object has no attribute 'cache'

In [None]:
# If using a previously generated set of records they can be loaded as follows:

with open(os.path.join(output_dir, "llm_dataset.json")) as fp:
    llm_results = json.load(fp)

In [None]:
# The NER step using AWS ComprehendMedical is the most expensive step.
# The cost can be estimated with the following function:

print("Estimated cost is $", aws.calculate_ner_cost(llm_results))

In [None]:
aws_extract = aws.ComprehendExtractor()
ner_records = [aws_extract.extract_record(r) for r in llm_results]

with open(os.path.join(output_dir, "ner_dataset.json"), "w") as fp:
    json.dump(ner_records, fp)

In [None]:
# If using a previously generated set of records they can be loaded as follows:

with open(os.path.join(output_dir, "ner_dataset.json")) as fp:
    ner_records = json.load(fp)

With the raw NER results generated, experiments will move to individual notebooks.