# 02 - Generating Dataset 1

The purpose of this notebook is to explore the dataset created by [02_generate_dataset.ipynb](02_generate_dataset.ipynb).

In [2]:
# Enable IPython's autoreload extension. Mode 2 re-imports all loaded modules
# before each cell is executed, so edits to the local source files are picked
# up without restarting the kernel.
%load_ext autoreload 
%autoreload 2

In [3]:
import json
import os
import sys

# Make the project's `src` directory importable so that the
# `privacy_fingerprint` package can be found.
# NOTE: this assumes the current working directory is the folder containing
# this notebook, i.e. that `src` sits one level above it.
_src_dir = os.path.join(os.path.abspath(os.path.join(os.getcwd(), os.pardir)), "src")

# Only add the entry once, so re-running this cell in the same kernel does not
# keep growing sys.path with duplicates.
if _src_dir not in sys.path:
    sys.path.append(_src_dir)

In [4]:
# Root folder for everything produced by the dataset-generation experiment;
# the raw Synthea export lives in its "synthea" sub-folder.
output_dir = "../experiments/02_generate_dataset"
export_directory = os.path.join(output_dir, "synthea")

# Create the experiment folder if it is not there yet (no error if it is).
os.makedirs(output_dir, exist_ok=True)

The saved Synthea dataset contains only about 1,000 records (994 distinct NHS numbers in the output below), all of which have the visit type `Encounter for symptom`.

In [111]:

from collections import Counter

# Load the Synthea records that were saved as JSON in the experiment folder.
# The loaded list is bound to `synthea_records`, which is the name the rest of
# this cell (and the note-generation cell further down) actually uses.
with open(os.path.join(output_dir, "synthea_dataset.json")) as fp:
    synthea_records = json.load(fp)
# Keep the earlier name bound as well, in case anything still refers to it.
synthea_dataset = synthea_records

# Index the records by NHS number. If a number occurs more than once the last
# record wins, so the count printed below is the number of distinct NHS numbers.
records_dict = {record['NHS number']: record for record in synthea_records}
print(len(records_dict))

# Tally how often each visit type occurs among the indexed records.
visit_types = [record['visit type'] for record in records_dict.values()]
visit_type_counts = Counter(visit_types)

print(visit_type_counts)

994
Counter({'Encounter for symptom': 994})


Here I'm experimenting with a single FHIR bundle, for a patient who appears to have had an inpatient admission for COVID-19 (see later). In the FHIR bundle, she has no record of the COVID-19 admission.

In [112]:

from collections import Counter


# Load the FHIR bundle exported by Synthea for one example patient.
with open('../experiments/02_generate_dataset/synthea/fhir/Evita885_Marg761_Glover433_5cb0dac2-b3e6-3018-ea64-814bf7bcd780.json') as f:
    data = json.load(f)

# Index every resource in the bundle by its fullUrl, and collect the resource
# types and the Encounter resources along the way.
records_dict = {}
resource_types = []
encounters = []

for entry in data['entry']:
    resource = entry['resource']
    records_dict[entry['fullUrl']] = resource

    resource_type = resource['resourceType']
    resource_types.append(resource_type)

    if resource_type == 'Encounter':
        encounters.append(resource)

type_counts = Counter(resource_types)

print(records_dict['urn:uuid:5cb0dac2-b3e6-3018-ea64-814bf7bcd780']['name'][0]['family'])
# Glover433

print(type_counts)
print(encounters[0]['id'])

# Index the Encounter resources by their id.
encounters_dict = {encounter['id']: encounter for encounter in encounters}

# Explore the types of encounters: collect the 'text' of every type entry that
# has one. Comprehensions are used (rather than loops over variables named
# `type` / `id`) so the Python builtins are not shadowed in the kernel.
encounter_types = [
    encounter_type['text']
    for encounter in encounters_dict.values()
    for encounter_type in encounter['type']
    if 'text' in encounter_type
]

type_counts = Counter(encounter_types)
print(type_counts)

# Narrow down to obstetric emergencies.
obstetric_emergencies = [
    encounter
    for encounter in encounters_dict.values()
    for encounter_type in encounter['type']
    if encounter_type.get('text') == 'Obstetric emergency hospital admission'
]

# Show when each obstetric emergency admission started and ended.
for encounter in obstetric_emergencies:
    period = encounter['period']
    print(period['start'], period['end'])



Glover433
Counter({'Observation': 574, 'Procedure': 193, 'Claim': 96, 'ExplanationOfBenefit': 96, 'SupplyDelivery': 67, 'DiagnosticReport': 66, 'Encounter': 60, 'MedicationRequest': 36, 'Condition': 34, 'MedicationAdministration': 30, 'Immunization': 7, 'CareTeam': 6, 'CarePlan': 6, 'Patient': 1, 'Device': 1})
2161cf86-887f-b641-3ab1-64d8e3acafdc
Counter({'Prenatal visit': 26, 'General examination of patient (procedure)': 8, 'Consultation for treatment': 5, 'Patient encounter procedure': 3, 'Prenatal initial visit': 3, 'Obstetric emergency hospital admission': 3, 'Postnatal visit': 3, 'Encounter for symptom': 3, 'Administration of vaccine to produce active immunity (procedure)': 2, 'Encounter for problem': 1, 'Encounter for symptom (procedure)': 1, 'Hospital admission for isolation (procedure)': 1, 'Admission to surgical department': 1})
2015-03-14T18:44:52+00:00 2015-03-14T19:44:52+00:00
2017-03-04T18:44:52+00:00 2017-03-04T19:44:52+00:00
2019-11-30T18:44:52+00:00 2019-11-30T19:44:52+

## Explore relationships in Synthea

https://learn.microsoft.com/en-us/fabric/data-science/tutorial-relationships-detection

This requires Azure

In [44]:
# NOTE(review): this import is deliberately left commented out. The markdown
# for this section says the relationship-detection tutorial requires Azure, so
# presumably `sempy` is not available in this environment - confirm before
# re-enabling.
# import sempy
# from sempy.relationships import (
#     find_relationships,
#     list_relationship_violations,
#     plot_relationship_metadata
# )

## Load background tables 

The structured notes from Synthea can then be converted to free-text clinical notes.

In [46]:
import pandas as pd

# Folder holding Synthea's CSV export. It is derived from `export_directory`
# (defined in the configuration cell near the top of the notebook) so that the
# location is stated in one place instead of being repeated for every table.
csv_dir = os.path.join(export_directory, "csv")

# One DataFrame per Synthea table used in the exploration below.
enc = pd.read_csv(os.path.join(csv_dir, "encounters.csv"))
pat = pd.read_csv(os.path.join(csv_dir, "patients.csv"))
cond = pd.read_csv(os.path.join(csv_dir, "conditions.csv"))
med = pd.read_csv(os.path.join(csv_dir, "medications.csv"))
proc = pd.read_csv(os.path.join(csv_dir, "procedures.csv"))

In [58]:
# Restrict the encounters table to inpatient stays, then preview the first two
# of them transposed so that every column is readable.
is_inpatient = enc["ENCOUNTERCLASS"] == "inpatient"
inp = enc[is_inpatient]
inp.head(2).T

Unnamed: 0,808,826
Id,cb338fc9-c430-0671-e0c4-e4d805815cc6,19f506f5-920f-89f4-7040-cc1e2d2be8f3
START,2020-11-01T19:55:43Z,2021-01-23T19:43:01Z
STOP,2020-11-10T20:45:13Z,2021-01-24T19:43:01Z
PATIENT,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,5cb0dac2-b3e6-3018-ea64-814bf7bcd780
ORGANIZATION,9e0eba2f-7955-3835-9ef0-cc75aa3fdad2,9e0eba2f-7955-3835-9ef0-cc75aa3fdad2
PROVIDER,5fc8e82e-9d1c-3b15-9281-2a58e6ce33ae,5fc8e82e-9d1c-3b15-9281-2a58e6ce33ae
PAYER,329794ac-8260-3252-90dd-cc5284fe15b9,329794ac-8260-3252-90dd-cc5284fe15b9
ENCOUNTERCLASS,inpatient,inpatient
CODE,1505002,305408004
DESCRIPTION,Hospital admission for isolation (procedure),Admission to surgical department


In [125]:
# Id of the example patient used for the lookups in the following cells.
ex_pat = '5cb0dac2-b3e6-3018-ea64-814bf7bcd780'

# Print this patient's row from the patients table.
ex_pat_row = pat[pat["Id"] == ex_pat]
print(ex_pat_row)



                                      Id   BIRTHDATE DEATHDATE          SSN  \
20  5cb0dac2-b3e6-3018-ea64-814bf7bcd780  1985-11-09       NaN  999-88-4951   

      DRIVERS    PASSPORT PREFIX     FIRST       LAST SUFFIX  ...    CITY  \
20  S99990302  X30475346X   Mrs.  Evita885  Glover433    NaN  ...  Totton   

        STATE  COUNTY FIPS   ZIP        LAT       LON HEALTHCARE_EXPENSES  \
20  Hampshire  Totton  NaN  SO40  50.901302 -1.498295                 0.0   

   HEALTHCARE_COVERAGE INCOME  
20          1384624.55  11011  

[1 rows x 27 columns]


In [127]:
# Inpatient encounters belonging to the example patient.
inp.loc[inp["PATIENT"] == ex_pat]


Unnamed: 0,Id,START,STOP,PATIENT,ORGANIZATION,PROVIDER,PAYER,ENCOUNTERCLASS,CODE,DESCRIPTION,BASE_ENCOUNTER_COST,TOTAL_CLAIM_COST,PAYER_COVERAGE,REASONCODE,REASONDESCRIPTION
808,cb338fc9-c430-0671-e0c4-e4d805815cc6,2020-11-01T19:55:43Z,2020-11-10T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,9e0eba2f-7955-3835-9ef0-cc75aa3fdad2,5fc8e82e-9d1c-3b15-9281-2a58e6ce33ae,329794ac-8260-3252-90dd-cc5284fe15b9,inpatient,1505002,Hospital admission for isolation (procedure),125.0,22281.21,22281.21,840539006.0,COVID-19
826,19f506f5-920f-89f4-7040-cc1e2d2be8f3,2021-01-23T19:43:01Z,2021-01-24T19:43:01Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,9e0eba2f-7955-3835-9ef0-cc75aa3fdad2,5fc8e82e-9d1c-3b15-9281-2a58e6ce33ae,329794ac-8260-3252-90dd-cc5284fe15b9,inpatient,305408004,Admission to surgical department,125.0,9163.8,9163.8,,


In [128]:
# Conditions recorded for the example patient.
cond.loc[cond["PATIENT"] == ex_pat]

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION
454,2004-01-03,,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,3db17420-1502-eda4-dbee-484d5efedeca,224299000,Received higher education (finding)
455,2004-01-03,,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,3db17420-1502-eda4-dbee-484d5efedeca,423315002,Limited social contact (finding)
456,2005-01-08,2008-01-12,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,d9f57186-f2e9-025f-a4c6-08926f8e6080,160903007,Full-time employment (finding)
457,2005-01-08,2023-01-28,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,d9f57186-f2e9-025f-a4c6-08926f8e6080,422650009,Social isolation (finding)
458,2008-01-12,2011-01-15,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,7fec6c49-28ae-7475-1f3c-c493983a3819,160903007,Full-time employment (finding)
459,2008-01-12,2014-01-18,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,7fec6c49-28ae-7475-1f3c-c493983a3819,73595000,Stress (finding)
460,2011-01-15,,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,b73281e3-4b86-201e-f661-7698b0d8f2f5,15777000,Prediabetes
461,2011-01-15,,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,b73281e3-4b86-201e-f661-7698b0d8f2f5,271737000,Anemia (disorder)
462,2011-01-15,2014-01-18,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,270cb310-9b73-dcab-53ad-bcb8a4b4cd36,160904001,Part-time employment (finding)
463,2011-01-15,2014-01-18,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,270cb310-9b73-dcab-53ad-bcb8a4b4cd36,424393004,Reports of violence in the environment (finding)


In [130]:
# Medications recorded for the example patient.
med.loc[med["PATIENT"] == ex_pat]


Unnamed: 0,START,STOP,PATIENT,PAYER,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,PAYER_COVERAGE,DISPENSES,TOTALCOST,REASONCODE,REASONDESCRIPTION
604,2013-10-22T18:44:52Z,2014-05-31T19:48:57Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,329794ac-8260-3252-90dd-cc5284fe15b9,0238f43b-1158-6853-142b-c53449f3b732,807283,Mirena 52 MG Intrauterine System,30538.36,30538.36,7,213768.52,,
606,2015-04-25T18:44:52Z,2016-04-19T18:44:52Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,329794ac-8260-3252-90dd-cc5284fe15b9,4639f297-4384-d8d0-0e84-17085788ac8f,1534809,168 HR Ethinyl Estradiol 0.00146 MG/HR / norel...,901.42,901.42,12,10817.04,,
608,2015-10-03T15:44:52Z,2015-10-13T16:44:52Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,329794ac-8260-3252-90dd-cc5284fe15b9,51a1e5b7-96b9-2a6a-acc5-7cde37d6f5ba,834102,Penicillin V Potassium 500 MG Oral Tablet,1217.06,1217.06,1,1217.06,43878008.0,Streptococcal sore throat (disorder)
609,2016-01-24T09:44:52Z,2016-02-01T09:44:52Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,329794ac-8260-3252-90dd-cc5284fe15b9,a22fb104-817e-15fb-72be-b88e8221465c,562251,Amoxicillin 250 MG / Clavulanate 125 MG Oral T...,449.66,449.66,1,449.66,444814009.0,Viral sinusitis (disorder)
632,2017-04-14T18:44:52Z,2018-04-13T07:42:08Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,329794ac-8260-3252-90dd-cc5284fe15b9,88929601-856d-81e8-af40-b8d81519e29e,1605257,Liletta 52 MG Intrauterine System,31426.83,31426.83,12,377121.96,,
654,2020-01-11T18:44:52Z,2021-01-05T18:44:52Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,329794ac-8260-3252-90dd-cc5284fe15b9,d413b3f1-d588-3da3-3218-8c496ad25d24,1367439,NuvaRing 0.12/0.015 MG per 24HR 21 Day Vaginal...,800.43,800.43,12,9605.16,,
682,2020-11-01T20:30:13Z,2020-11-01T20:30:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,329794ac-8260-3252-90dd-cc5284fe15b9,cb338fc9-c430-0671-e0c4-e4d805815cc6,854235,0.4 ML Enoxaparin sodium 100 MG/ML Prefilled S...,24.83,24.83,1,24.83,,
683,2020-11-01T20:30:13Z,2020-11-01T20:30:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,329794ac-8260-3252-90dd-cc5284fe15b9,cb338fc9-c430-0671-e0c4-e4d805815cc6,2123111,NDA020503 200 ACTUAT Albuterol 0.09 MG/ACTUAT ...,12939.55,12939.55,1,12939.55,389087006.0,Hypoxemia (disorder)
684,2020-11-01T20:30:13Z,2020-11-01T20:30:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,329794ac-8260-3252-90dd-cc5284fe15b9,cb338fc9-c430-0671-e0c4-e4d805815cc6,198440,Acetaminophen 500 MG Oral Tablet,0.02,0.02,1,0.02,,
685,2020-11-02T20:30:13Z,2020-11-02T20:30:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,329794ac-8260-3252-90dd-cc5284fe15b9,cb338fc9-c430-0671-e0c4-e4d805815cc6,854235,0.4 ML Enoxaparin sodium 100 MG/ML Prefilled S...,11.64,11.64,1,11.64,,


In [137]:
# Ids of all inpatient encounters, as a NumPy array.
inp["Id"].values

array(['cb338fc9-c430-0671-e0c4-e4d805815cc6',
       '19f506f5-920f-89f4-7040-cc1e2d2be8f3',
       '21eaa02d-0f0e-c40d-ee4a-31601b6f83e4',
       '06f3672e-c007-2ed4-615d-a2ba6e2df364',
       '92d83c1e-fd94-1a80-1e6b-beae84688fb0',
       '671db41c-1d37-be36-e987-2131217d397f',
       '29e658fb-b6c9-e274-cead-8b2949077108',
       'af179613-c2b0-8ea8-8246-25d8e55c8419',
       'f9042e36-a02e-0e2d-b81f-372b410bcfb8',
       '2613a770-81b0-f63a-7ed3-a4a53b402e63',
       '299f48a2-fb32-fbf3-c0da-6b7ca40d0ce4',
       '7924dd07-2b89-1254-d35d-159b6a7136d6',
       '7dc43d39-fdf5-1c9e-9151-8bd601cc758f',
       'ee15a422-2c78-ae13-7e65-d0d70013f28c',
       'fe6e4a56-d288-e578-d06b-6bba982c60cb',
       '78fe7883-ab5f-4cee-b1fe-6b43d3a3a121',
       '2255f513-552a-d932-e276-045d71bf2304',
       'b33df2ff-8631-0335-5ebb-a7eb5122e6ca',
       'd57f810e-fb54-42b5-4a88-6b0feb8b6b13',
       '2f0a32c4-4a13-114d-23b2-bb7f09205001',
       '6f76bd3e-04bd-c6d0-fc82-7304d7dd734d',
       '96a8f

In [145]:
# Procedures that belong to the example patient and took place during one of
# the inpatient encounters.
during_inpatient_encounter = proc["ENCOUNTER"].isin(inp["Id"].values)
for_example_patient = proc["PATIENT"] == ex_pat
proc[during_inpatient_encounter & for_example_patient]

Unnamed: 0,START,STOP,PATIENT,ENCOUNTER,CODE,DESCRIPTION,BASE_COST,REASONCODE,REASONDESCRIPTION
1494,2020-11-01T19:55:43Z,2020-11-01T20:30:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,cb338fc9-c430-0671-e0c4-e4d805815cc6,399208008,Plain chest X-ray (procedure),7656.21,,
1495,2020-11-01T20:30:13Z,2020-11-01T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,cb338fc9-c430-0671-e0c4-e4d805815cc6,371908008,Oxygen administration by mask (procedure),500.0,389087006.0,Hypoxemia (disorder)
1496,2020-11-01T20:30:13Z,2020-11-01T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,cb338fc9-c430-0671-e0c4-e4d805815cc6,431182000,Placing subject in prone position (procedure),500.0,389087006.0,Hypoxemia (disorder)
1497,2020-11-02T20:30:13Z,2020-11-02T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,cb338fc9-c430-0671-e0c4-e4d805815cc6,371908008,Oxygen administration by mask (procedure),500.0,389087006.0,Hypoxemia (disorder)
1498,2020-11-02T20:30:13Z,2020-11-02T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,cb338fc9-c430-0671-e0c4-e4d805815cc6,431182000,Placing subject in prone position (procedure),500.0,389087006.0,Hypoxemia (disorder)
1499,2020-11-03T20:30:13Z,2020-11-03T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,cb338fc9-c430-0671-e0c4-e4d805815cc6,371908008,Oxygen administration by mask (procedure),500.0,389087006.0,Hypoxemia (disorder)
1500,2020-11-03T20:30:13Z,2020-11-03T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,cb338fc9-c430-0671-e0c4-e4d805815cc6,431182000,Placing subject in prone position (procedure),500.0,389087006.0,Hypoxemia (disorder)
1501,2020-11-04T20:30:13Z,2020-11-04T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,cb338fc9-c430-0671-e0c4-e4d805815cc6,371908008,Oxygen administration by mask (procedure),500.0,389087006.0,Hypoxemia (disorder)
1502,2020-11-04T20:30:13Z,2020-11-04T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,cb338fc9-c430-0671-e0c4-e4d805815cc6,431182000,Placing subject in prone position (procedure),500.0,389087006.0,Hypoxemia (disorder)
1503,2020-11-05T20:30:13Z,2020-11-05T20:45:13Z,5cb0dac2-b3e6-3018-ea64-814bf7bcd780,cb338fc9-c430-0671-e0c4-e4d805815cc6,371908008,Oxygen administration by mask (procedure),500.0,389087006.0,Hypoxemia (disorder)


In [None]:
# Generate text for every Synthea record with the language-model generator and
# save the results as JSON so that they can be reloaded later.
# `synthea_records` must already have been loaded by an earlier cell.
# NOTE(review): `llm` is not imported in any cell visible in this notebook -
# confirm which module it refers to and import it before running this cell.
clinical_note_generator = llm.LMGenerator()
llm_results = [note for note in clinical_note_generator.generate_text(synthea_records)]

llm_dataset_path = os.path.join(output_dir, "llm_dataset.json")
with open(llm_dataset_path, "w") as llm_file:
    json.dump(llm_results, llm_file)

In [None]:
# Alternative to generating the notes again: reload a previously saved set of
# language-model results from the experiment folder.
llm_dataset_path = os.path.join(output_dir, "llm_dataset.json")
with open(llm_dataset_path) as llm_file:
    llm_results = json.load(llm_file)

In [None]:
# Named-entity recognition with AWS ComprehendMedical is the most expensive
# step, so estimate what it will cost before running it.
# NOTE(review): `aws` is not imported in any cell visible in this notebook -
# confirm which module it refers to and import it before running this cell.
estimated_ner_cost = aws.calculate_ner_cost(llm_results)
print("Estimated cost is $", estimated_ner_cost)

In [None]:
# Run the extractor over every language-model result, in order, and save the
# raw NER records as JSON so that they can be reloaded later.
aws_extract = aws.ComprehendExtractor()
ner_records = list(map(aws_extract.extract_record, llm_results))

ner_dataset_path = os.path.join(output_dir, "ner_dataset.json")
with open(ner_dataset_path, "w") as ner_file:
    json.dump(ner_records, ner_file)

In [None]:
# Alternative to running the extraction again: reload a previously saved set
# of NER records from the experiment folder.
ner_dataset_path = os.path.join(output_dir, "ner_dataset.json")
with open(ner_dataset_path) as ner_file:
    ner_records = json.load(ner_file)

With the raw NER results generated, experiments will move to individual notebooks.