# 02 - Generating Dataset 1

The purpose of this notebook is to explore the dataset created by [02_generate_dataset.ipynb](02_generate_dataset.ipynb).

In [1]:
# Enable IPython's autoreload extension in mode 2, so that imported modules
# are reloaded before each cell runs and edits to the project source are
# picked up without restarting the kernel.
%load_ext autoreload 
%autoreload 2

In [2]:
import json
import os
import sys

# Make the project's `src` directory importable so that the
# `privacy_fingerprint` package can be found.
# Note: this assumes the current working directory is the folder containing
# this notebook, i.e. `src` is a sibling of that folder.
src_dir = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), "src")

# Guard against duplicates so that re-running the cell does not keep growing sys.path.
if src_dir not in sys.path:
    sys.path.append(src_dir)

In [4]:
# Location of the experiment outputs; the Synthea export lives in its
# "synthea" sub-folder.
output_dir = "../experiments/02_generate_dataset"
export_directory = os.path.join(output_dir, "synthea")

# Create the experiment folder if it is missing (no error when it already exists).
os.makedirs(output_dir, exist_ok=True)

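The Synthea dataset only saves 1000 records, all of which are "Encounter for symptom". Note: the output recorded below shows 44 records, all of type "Encounter Inpatient", so this statement may be out of date.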

In [5]:

from collections import Counter

# Read the structured Synthea records from the experiment output folder.
synthea_dataset_path = os.path.join(output_dir, "synthea_dataset.json")
with open(synthea_dataset_path) as fp:
    synthea_records = json.load(fp)

# Index the records by NHS number. When two records share an NHS number the
# later one replaces the earlier one, so the printed length is the number of
# distinct NHS numbers.
records_dict = {record['NHS number']: record for record in synthea_records}
print(len(records_dict))

# Tally how often each visit type occurs among the indexed records.
visit_types = [record['visit type'] for record in records_dict.values()]
visit_type_counts = Counter(visit_types)

print(visit_type_counts)

44
Counter({'Encounter Inpatient': 44})


In [12]:
# Use the first key of `records_dict` (insertion order) as the worked example.
ex = list(records_dict)[0]

# Print each field value of that record on its own line.
for value in records_dict[ex].values():
    print(value)

Ms. Simone Kling
596 827 0617
411 Gibson Port Apt 51, Crawley, SO21 6PZ
1974-05-15
single
White - British
female
Encounter Inpatient
1993-11-15T08:35:02Z
{'doctor': 'Rod Frami', 'facility': 'Naomi House & Jacksplace'}
Appendicitis
['History of appendectomy']


In [19]:
# List the FHIR bundle file(s) whose name contains both "Kling" and "Simone".
# A descriptive loop variable is used instead of `_`, which in IPython is the
# variable that holds the last cell output.
fhir_dir = '../experiments/02_generate_dataset/synthea/fhir'
for filename in os.listdir(fhir_dir):
    if "Kling" in filename and "Simone" in filename:
        print(filename)

Simone657_Kling921_cf5c6029-50c2-e2a6-9f1d-529426345c66.json


Here I'm experimenting with one FHIR bundle.

In [78]:

from collections import Counter
import json

# The single Synthea FHIR bundle explored in this cell.
bundle_path = '../experiments/02_generate_dataset/synthea/fhir/Simone657_Kling921_cf5c6029-50c2-e2a6-9f1d-529426345c66.json'

# Lower-cased text that marks an encounter type as an inpatient visit.
# (Matching on 'admission' instead would also pick up emergency-room and
# skilled-nursing admissions, which appear among this patient's encounter types.)
INPATIENT_TYPE_TEXT = 'encounter inpatient'


def _is_inpatient(encounter):
    """Return True if any type entry of ``encounter`` is labelled as an inpatient encounter."""
    return any(
        INPATIENT_TYPE_TEXT in type_entry.get('text', '').lower()
        for type_entry in encounter.get('type', [])
    )


def _linked_to_encounter(resource, encounter_id):
    """Return True if the resource's encounter reference contains ``encounter_id``.

    Resources without an ``encounter`` element (or without a ``reference``
    inside it) are treated as not linked.
    """
    return encounter_id in resource.get('encounter', {}).get('reference', '')


with open(bundle_path) as f:
    data = json.load(f)

# NOTE: this rebinds `records_dict`; from here on it maps each bundle entry's
# fullUrl to its FHIR resource.
records_dict = {entry['fullUrl']: entry['resource'] for entry in data['entry']}

# Split the bundle's resources by type, keeping the bundle order.
bundle_resources = [entry['resource'] for entry in data['entry']]
resource_types = [resource['resourceType'] for resource in bundle_resources]
encounters = [resource for resource in bundle_resources if resource['resourceType'] == 'Encounter']
observations = [resource for resource in bundle_resources if resource['resourceType'] == 'Observation']
procedures = [resource for resource in bundle_resources if resource['resourceType'] == 'Procedure']

resource_type_counts = Counter(resource_types)

print("Resource types for this patient")
print(resource_type_counts)

# Index the encounters by their id.
encounters_dict = {encounter['id']: encounter for encounter in encounters}

# Explore the types of encounters: collect the text of every type entry that has one.
encounter_types = [
    type_entry['text']
    for encounter in encounters_dict.values()
    for type_entry in encounter.get('type', [])
    if 'text' in type_entry
]

type_counts = Counter(encounter_types)
print(type_counts)

# Narrow down to inpatient visits. Each encounter is kept at most once, even
# if several of its type entries match.
inpat_visits = [encounter for encounter in encounters_dict.values() if _is_inpatient(encounter)]

# For every inpatient visit, gather the observations (this includes lab
# results) and procedures whose encounter reference points at that visit.
# Results are stored per encounter id so that earlier visits are not
# overwritten by later ones.
inpat_related = {}

for encounter in inpat_visits:
    encounter_id = encounter['id']
    print(encounter_id)

    period = encounter.get('period', {})
    start = period.get('start')
    end = period.get('end')

    related_observations = [
        observation for observation in observations
        if _linked_to_encounter(observation, encounter_id)
    ]
    related_procedures = [
        procedure for procedure in procedures
        if _linked_to_encounter(procedure, encounter_id)
    ]

    inpat_related[encounter_id] = {
        'start': start,
        'end': end,
        'observations': related_observations,
        'procedures': related_procedures,
    }
    







Resource types for this patient
Counter({'Observation': 268, 'Procedure': 130, 'Claim': 120, 'ExplanationOfBenefit': 120, 'Encounter': 117, 'Condition': 53, 'DiagnosticReport': 34, 'SupplyDelivery': 15, 'Immunization': 12, 'Device': 3, 'CareTeam': 3, 'CarePlan': 3, 'MedicationRequest': 3, 'Patient': 1, 'ImagingStudy': 1})
Counter({'Patient encounter procedure': 40, 'Encounter for problem (procedure)': 35, 'General examination of patient (procedure)': 10, 'Encounter for symptom': 9, 'Encounter for check up (procedure)': 7, 'Emergency treatment (procedure)': 3, 'Emergency room admission (procedure)': 3, 'Prenatal initial visit': 2, 'Drug rehabilitation and detoxification': 2, 'Emergency Room Admission': 1, 'Encounter Inpatient': 1, 'Encounter for problem': 1, 'Non-urgent orthopedic admission': 1, 'Admission to skilled nursing facility (procedure)': 1, 'Urgent care clinic (procedure)': 1})
44d6a587-12e2-aa9f-5c06-268fbdaddd6d


In [81]:
def encounter_types_for(resources):
    """Return, for each resource, the first type text of the encounter it refers to.

    The "urn:uuid:" prefix is stripped from each resource's
    ``encounter.reference`` and the remainder is looked up in
    ``encounters_dict``. A resource without an encounter reference, or one
    pointing at an unknown encounter, raises ``KeyError``.
    """
    encounter_ids = (
        resource['encounter']['reference'].replace("urn:uuid:", "")
        for resource in resources
    )
    return [encounters_dict[encounter_id]['type'][0]['text'] for encounter_id in encounter_ids]


print("Types of encounters which have obs")
print(Counter(encounter_types_for(observations)))

related_encounter_types = encounter_types_for(procedures)

print("Types of encounters which have procedures")
Counter(related_encounter_types)

Types of encounters which have obs
Counter({'Encounter for check up (procedure)': 81, 'Encounter for symptom': 73, 'General examination of patient (procedure)': 73, 'Patient encounter procedure': 35, 'Encounter for problem (procedure)': 4, 'Drug rehabilitation and detoxification': 2})
Types of encounters which have procedures


Counter({'Patient encounter procedure': 44,
         'Encounter for problem (procedure)': 38,
         'Encounter for symptom': 21,
         'General examination of patient (procedure)': 9,
         'Admission to skilled nursing facility (procedure)': 8,
         'Encounter for check up (procedure)': 7,
         'Non-urgent orthopedic admission': 2,
         'Emergency room admission (procedure)': 1})

In [36]:
# Keys of the first encounter: 'hospitalization' is not among them.
print(encounters[0].keys())

# Print every encounter that does carry a 'hospitalization' element.
# A named loop variable is used instead of `_`, which in IPython is the
# variable that holds the last cell output.
for encounter in encounters:
    if 'hospitalization' in encounter:
        print(encounter)

dict_keys(['resourceType', 'id', 'status', 'class', 'type', 'subject', 'participant', 'period', 'reasonCode', 'serviceProvider'])
{'resourceType': 'Encounter', 'id': '44d6a587-12e2-aa9f-5c06-268fbdaddd6d', 'status': 'finished', 'class': {'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode', 'code': 'IMP'}, 'type': [{'coding': [{'system': 'http://snomed.info/sct', 'code': '183452005', 'display': 'Encounter Inpatient'}], 'text': 'Encounter Inpatient'}], 'subject': {'reference': 'urn:uuid:cf5c6029-50c2-e2a6-9f1d-529426345c66', 'display': 'Ms. Simone657 Joey457 Kling921'}, 'participant': [{'type': [{'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/v3-ParticipationType', 'code': 'PPRF', 'display': 'primary performer'}], 'text': 'primary performer'}], 'period': {'start': '1993-11-15T08:35:02+00:00', 'end': '1993-11-20T06:16:30+00:00'}, 'individual': {'reference': 'Practitioner?identifier=http://hl7.org/fhir/sid/us-npi|9999997692', 'display': 'Dr. Rod343 Frami345'}}], 'per

In [38]:
# Encounter references of the inpatient visits. Observation and Procedure
# references in this bundle have the form "urn:uuid:<encounter id>" (another
# cell strips that prefix to look encounters up by id); DiagnosticReport
# references are presumably the same — TODO confirm.
inpatient_ids = {"urn:uuid:" + visit['id'] for visit in inpat_visits}

# Lists holding the labs (Observations) and diagnostic reports linked to an inpatient visit.
related_observations = []  # This will include lab results
related_diagnostic_reports = []

for entry in data['entry']:
    resource = entry['resource']
    resource_type = resource['resourceType']

    # Only Observations and DiagnosticReports are of interest here.
    if resource_type not in ('Observation', 'DiagnosticReport'):
        continue

    # Skip resources that are not tied to one of the inpatient encounters
    # (including those without any encounter reference).
    if resource.get('encounter', {}).get('reference') not in inpatient_ids:
        continue

    if resource_type == 'Observation':
        related_observations.append(resource)  # This includes lab results
    else:
        related_diagnostic_reports.append(resource)

# Show the inpatient visits as the cell output.
inpat_visits

[{'resourceType': 'Encounter',
  'id': '44d6a587-12e2-aa9f-5c06-268fbdaddd6d',
  'status': 'finished',
  'class': {'system': 'http://terminology.hl7.org/CodeSystem/v3-ActCode',
   'code': 'IMP'},
  'type': [{'coding': [{'system': 'http://snomed.info/sct',
      'code': '183452005',
      'display': 'Encounter Inpatient'}],
    'text': 'Encounter Inpatient'}],
  'subject': {'reference': 'urn:uuid:cf5c6029-50c2-e2a6-9f1d-529426345c66',
   'display': 'Ms. Simone657 Joey457 Kling921'},
  'participant': [{'type': [{'coding': [{'system': 'http://terminology.hl7.org/CodeSystem/v3-ParticipationType',
        'code': 'PPRF',
        'display': 'primary performer'}],
      'text': 'primary performer'}],
    'period': {'start': '1993-11-15T08:35:02+00:00',
     'end': '1993-11-20T06:16:30+00:00'},
    'individual': {'reference': 'Practitioner?identifier=http://hl7.org/fhir/sid/us-npi|9999997692',
     'display': 'Dr. Rod343 Frami345'}}],
  'period': {'start': '1993-11-15T08:35:02+00:00',
   'end'

In [20]:

# Write all inpatient encounters in `inpat_visits` to a new JSON file; nothing
# is written when the list is empty.
# NOTE(review): the file name refers to Angelo118_Franecki195, whereas the
# bundle loaded earlier in this notebook is Simone657_Kling921 — confirm the
# intended target.
inpat_visits_path = '../experiments/02_generate_dataset_for_tom/synthea/fhir_examples/Angelo118_Franecki195_2.json'

if inpat_visits:
    # No visible cell of this notebook creates the target folder, so create
    # it here to avoid a FileNotFoundError on open().
    os.makedirs(os.path.dirname(inpat_visits_path), exist_ok=True)
    with open(inpat_visits_path, 'w') as outfile:
        json.dump(inpat_visits, outfile)

## Explore relationships in Synthea

https://learn.microsoft.com/en-us/fabric/data-science/tutorial-relationships-detection

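This requires Azure (the tutorial above is for Microsoft Fabric), so the imports in the next cell are left commented out.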

In [44]:
# Kept for reference only: every line below is commented out, so running this
# cell imports nothing.
# import sempy
# from sempy.relationships import (
#     find_relationships,
#     list_relationship_violations,
#     plot_relationship_metadata
# )

## Load background tables 

The structured notes from Synthea can then be converted to free-text clinical notes.

In [46]:
import pandas as pd
# Folder containing Synthea's CSV export for this experiment.
synthea_csv_dir = '../experiments/02_generate_dataset/synthea/csv'


def _read_synthea_csv(table_name):
    """Read ``<table_name>.csv`` from the Synthea CSV export folder into a DataFrame."""
    return pd.read_csv(os.path.join(synthea_csv_dir, table_name + '.csv'))


# Background tables used in the cells below.
enc = _read_synthea_csv('encounters')
pat = _read_synthea_csv('patients')
cond = _read_synthea_csv('conditions')
med = _read_synthea_csv('medications')
proc = _read_synthea_csv('procedures')

In [23]:
# Keep only the rows of the encounters table whose class is 'inpatient'.
# Handy follow-ups while exploring: inp.head(2).T,
# enc.ENCOUNTERCLASS.value_counts(), len(inp.PATIENT.unique()).
inpatient_mask = enc['ENCOUNTERCLASS'] == 'inpatient'
inp = enc[inpatient_mask]

NameError: name 'enc' is not defined

In [24]:
# Example patient id; the next cell compares it against the PATIENT column
# of the background tables.
ex_pat = "5cb0dac2-b3e6-3018-ea64-814bf7bcd780"



In [127]:
# Show the example patient's inpatient encounters, conditions, medications
# and the procedures performed during inpatient encounters.
# Only the last expression of a cell is rendered automatically, so each table
# is passed to display() (provided by IPython in notebook sessions); otherwise
# the first three results would be computed and silently discarded.
display(inp[inp.PATIENT == ex_pat])

display(cond[cond.PATIENT == ex_pat])
display(med[med.PATIENT == ex_pat])

display(proc[(proc.ENCOUNTER.isin(inp.Id.values)) & (proc.PATIENT == ex_pat)])

Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
808,cb338fc9-c430-0671-e0c4-e4d805815cc6,2020-11-01T19:55:43Z,2020-11-10T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,9e0eba2f-7955-3835-9ef0-cc75aa3fdad2,5fc8e82e-9d1c-3b15-9281-2a58e6ce33ae,329794ac-8260-3252-90dd-cc5284fe15b9,inpatient,1505002,Hospital admission for isolation (procedure),125.0,22281.21,22281.21,840539006.0,COVID-19
826,19f506f5-920f-89f4-7040-cc1e2d2be8f3,2021-01-23T19:43:01Z,2021-01-24T19:43:01Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,9e0eba2f-7955-3835-9ef0-cc75aa3fdad2,5fc8e82e-9d1c-3b15-9281-2a58e6ce33ae,329794ac-8260-3252-90dd-cc5284fe15b9,inpatient,305408004,Admission to surgical department,125.0,9163.8,9163.8,,


In [31]:
import privacy_fingerprint.generate.language_model as llm

from privacy_fingerprint.common.config import (
    load_experiment_config,
    load_experiment_config_from_file,
    load_global_config_from_file,
)
# Load the experiment configuration from the YAML file in ../configs
# (path is relative to the notebook's working directory).
experiment_config_path = "../configs/experiment_config.yaml"
config = load_experiment_config_from_file(experiment_config_path)

# Display the OpenAI model named in the configuration.
config.openai.model

'text-davinci-003'

In [34]:
# NOTE(review): this rebinds `config`, which another cell assigns to the
# experiment configuration — confirm that overwriting it is intended.
# The recorded run of this cell failed with FileNotFoundError because
# '../configs/global_config.yaml' did not exist; the file must be present
# relative to the notebook's working directory for this call to succeed.
config = load_global_config_from_file("../configs/global_config.yaml")


FileNotFoundError: [Errno 2] No such file or directory: '../configs/global_config.yaml'

In [33]:
# Generate text for the inpatient visits with the project's language-model
# generator and collect the results in a list.
# NOTE(review): the recorded run failed with "'NoneType' object has no
# attribute 'cache'"; presumably the global configuration had not been loaded
# — confirm before re-running.
clinical_note_generator = llm.LMGenerator()
llm_results = list(clinical_note_generator.generate_text(inpat_visits))

# Save the generated results as JSON. No visible cell of this notebook creates
# the target folder, so create it first to avoid a FileNotFoundError on open().
llm_note_path = '../experiments/02_generate_dataset_for_tom/synthea/fhir_examples/Angelo118_Franecki195_note.json'
os.makedirs(os.path.dirname(llm_note_path), exist_ok=True)
with open(llm_note_path, 'w') as fp:
    json.dump(llm_results, fp)

AttributeError: 'NoneType' object has no attribute 'cache'

In [None]:
# Alternative to generating new results: load a previously generated set of
# records from the experiment output folder.
llm_dataset_path = os.path.join(output_dir, "llm_dataset.json")

with open(llm_dataset_path) as fp:
    llm_results = json.load(fp)

In [None]:
# The NER step using AWS ComprehendMedical is the most expensive step.
# The cost can be estimated with the following function:
# NOTE(review): `aws` is not imported in any cell visible in this notebook;
# it must be bound (presumably to the project's AWS Comprehend helper module)
# before this cell can run — TODO confirm and add the import.

print("Estimated cost is $", aws.calculate_ner_cost(llm_results))

In [None]:
# Run the Comprehend extractor over every LLM result, in order.
# NOTE(review): `aws` is not imported in any visible cell of this notebook —
# confirm where it is expected to come from.
aws_extract = aws.ComprehendExtractor()
ner_records = []
for llm_result in llm_results:
    ner_records.append(aws_extract.extract_record(llm_result))

# Persist the raw NER output next to the other experiment files.
ner_dataset_path = os.path.join(output_dir, "ner_dataset.json")
with open(ner_dataset_path, "w") as fp:
    json.dump(ner_records, fp)

In [None]:
# Alternative to running the extraction: load a previously generated set of
# NER records from the experiment output folder.
ner_dataset_path = os.path.join(output_dir, "ner_dataset.json")

with open(ner_dataset_path) as fp:
    ner_records = json.load(fp)

With the raw NER results generated, experiments will move to individual notebooks.