# ED Classification in DuckDB

This notebook shows how we can classify ED encounters into one of 9 potential categories: 
- **Non-Emergent**: The patient's initial complaint, vital signs, medical history, and age indicated that immediate medical care was not
required within 12 hours.
- **Emergent, Primary Care Treatable**: Treatment was required within 12 hours, but care could have been provided in a primary care
setting.
- **Emergent, ED Care Needed, Preventable/Avoidable**: Emergency care was required based on the complaint or procedures or resources used, but the emergent nature of the condition was potentially preventable or avoidable if timely and effective primary care had been provided.
- **Emergent, ED Care Needed, Not Preventable/Avoidable**: Emergency care was required and primary care treatment could not have prevented the condition.
- **Injury**
- **Mental Health Related**
- **Alcohol Related**
- **Drug Related (excluding alcohol)**
- **Not in a Special Category, and Not Classified**

We start from a list of 402 ED encounters, stored one row per encounter in a csv file (`ed_encounters.csv`). This notebook reads in that file and applies ED classification logic to assign one of the 9 categories above to each encounter. ED encounters are bucketed into one of those 9 categories solely based on their primary diagnosis.

In [None]:
# Install DuckDB
!pip install -q duckdb


In [None]:
# Import DuckDB and show its version string as the cell output.
import duckdb
duckdb.__version__


In [None]:
# Work out where the three input CSVs live.
# When the local data directory exists its files are used; otherwise the
# same file names are resolved against the raw GitHub URL.
from pathlib import Path

local_base = Path('data/ed_classification')
remote_base = 'https://raw.githubusercontent.com/tuva-health/tuva-notebooks/main/data/ed_classification'

# Order matters: it matches the unpacking into the three constants below.
csv_file_names = (
    'ed_encounters.csv',
    'ed_classification__johnston_icd10.csv',
    'ed_classification__johnston_icd9.csv',
)

if local_base.exists():
    resolved_csvs = [str(local_base / file_name) for file_name in csv_file_names]
else:
    resolved_csvs = [f'{remote_base}/{file_name}' for file_name in csv_file_names]

ED_ENCOUNTERS_CSV, JOHNSTON_ICD10_CSV, JOHNSTON_ICD9_CSV = resolved_csvs

# Show the chosen locations, one per line.
print(*resolved_csvs, sep='\n')


In [None]:
# Open the DuckDB connection used by every later cell.
DUCKDB_DATABASE_FILE = 'ed_classification.duckdb'
con = duckdb.connect(DUCKDB_DATABASE_FILE)


In [None]:
# Install and load DuckDB's httpfs extension so CSVs can be read over HTTP
# (needed when the inputs are resolved to remote URLs, e.g. in Colab).
httpfs_setup_sql = """
INSTALL httpfs;
LOAD httpfs;
"""
con.execute(httpfs_setup_sql)


In [None]:
# Load each input CSV into its own DuckDB table.
# Every column is read as text (all_varchar=true); numeric casts happen
# later, inside the classification query.
csv_by_table = {
    'ed_encounters': ED_ENCOUNTERS_CSV,
    'value_set_johnston_icd10': JOHNSTON_ICD10_CSV,
    'value_set_johnston_icd9': JOHNSTON_ICD9_CSV,
}

for table_name, csv_location in csv_by_table.items():
    # Double any single quote so a path containing one cannot terminate
    # the SQL string literal early.
    sql_safe_location = csv_location.replace("'", "''")
    con.execute(f"""
create or replace table {table_name} as
select *
from read_csv_auto('{sql_safe_location}', header=true, all_varchar=true);
""")


In [None]:
# Preview the first five rows of the 'ed_encounters' table:
ed_encounters_preview_sql = """
select * 
from ed_encounters
limit 5
"""
con.execute(ed_encounters_preview_sql).df()

In [None]:
# Preview the first five rows of the 'value_set_johnston_icd10' table:
johnston_icd10_preview_sql = """
select * 
from value_set_johnston_icd10
limit 5
"""
con.execute(johnston_icd10_preview_sql).df()

In [None]:
# Preview the first five rows of the 'value_set_johnston_icd9' table:
johnston_icd9_preview_sql = """
select * 
from value_set_johnston_icd9
limit 5
"""
con.execute(johnston_icd9_preview_sql).df()

In [None]:
# Report how many rows each of the three loaded tables contains:
row_counts_sql = """
select 'ed_encounters' as table_name, count(*) as row_count from ed_encounters
union all
select 'value_set_johnston_icd10' as table_name, count(*) as row_count from value_set_johnston_icd10
union all
select 'value_set_johnston_icd9' as table_name, count(*) as row_count from value_set_johnston_icd9
"""
con.execute(row_counts_sql).df()

In [None]:
# Step-by-step CTE logic to assign one ED category per encounter.
# The result is stored in the table 'ed_classification_output' with the
# columns encounter_id and ed_classification_description.
con.execute("""
create or replace table ed_classification_output as

-- Step 1: look up the Johnston probability of each category for every
-- encounter's primary diagnosis.
-- Each value-set join can only match when the encounter's code type names
-- that code system, so for any row at most one of i10 / i9 is populated and
-- coalesce(i10.x, i9.x) selects it. The columns are loaded as text, so they
-- are cast to double here; a missing probability (no match, or another code
-- type) counts as 0.
with dx_probabilities as (
    select
        e.encounter_id,
        coalesce(cast(coalesce(i10.edcnnpa, i9.edcnnpa) as double), 0) as edcnnpa,
        coalesce(cast(coalesce(i10.edcnpa, i9.edcnpa) as double), 0) as edcnpa,
        coalesce(cast(coalesce(i10.epct, i9.epct) as double), 0) as epct,
        coalesce(cast(coalesce(i10.noner, i9.noner) as double), 0) as noner,
        coalesce(cast(coalesce(i10.injury, i9.injury) as double), 0) as injury,
        coalesce(cast(coalesce(i10.psych, i9.psych) as double), 0) as psych,
        coalesce(cast(coalesce(i10.alcohol, i9.alcohol) as double), 0) as alcohol,
        coalesce(cast(coalesce(i10.drug, i9.drug) as double), 0) as drug
    from ed_encounters e
    left join value_set_johnston_icd10 i10
      on lower(e.primary_diagnosis_code_type) = 'icd-10-cm'
     and e.primary_diagnosis_code = i10.icd10
    left join value_set_johnston_icd9 i9
      on lower(e.primary_diagnosis_code_type) = 'icd-9-cm'
     and e.primary_diagnosis_code = i9.icd9
),


-- Step 2: compute the largest probability once per encounter so the
-- comparison below does not have to repeat the full greatest(...) expression.
scored as (
    select
        d.*,
        greatest(
            d.edcnnpa,
            d.edcnpa,
            d.epct,
            d.noner,
            d.injury,
            d.psych,
            d.alcohol,
            d.drug
        ) as max_probability
    from dx_probabilities d
),


-- Step 3: for each ED encounter select the category with the largest
-- probability. An encounter whose largest probability is 0 is left
-- unclassified; ties are resolved by the order of the branches below.
picked_classification as (
    select
        s.encounter_id,
        case
            when s.max_probability = 0 then 'not_classified'
            when s.max_probability = s.edcnnpa then 'edcnnpa'
            when s.max_probability = s.edcnpa then 'edcnpa'
            when s.max_probability = s.epct then 'epct'
            when s.max_probability = s.noner then 'noner'
            when s.max_probability = s.injury then 'injury'
            when s.max_probability = s.psych then 'psych'
            when s.max_probability = s.alcohol then 'alcohol'
            else 'drug'
        end as classification_code
    from scored s
),


-- Step 4: label the final categories for each ED encounter.
final as (
    select
        encounter_id,
        case classification_code
            when 'noner' then 'Non-Emergent'
            when 'epct' then 'Emergent, Primary Care Treatable'
            when 'edcnpa' then 'Emergent, ED Care Needed, Preventable/Avoidable'
            when 'edcnnpa' then 'Emergent, ED Care Needed, Not Preventable/Avoidable'
            when 'injury' then 'Injury'
            when 'psych' then 'Mental Health Related'
            when 'alcohol' then 'Alcohol Related'
            when 'drug' then 'Drug Related (excluding alcohol)'
            else 'Not in a Special Category, and Not Classified'
        end as ed_classification_description
    from picked_classification
)

select encounter_id, ed_classification_description
from final
order by encounter_id;
""")


In [None]:
# Final deliverable: encounter_id + ed_classification_description, ordered by encounter
final_output_sql = """
select encounter_id, ed_classification_description
from ed_classification_output
order by encounter_id
"""
con.execute(final_output_sql).df()


In [None]:
# Optional: number of encounters per category, most frequent first
category_distribution_sql = """
select ed_classification_description, count(*) as encounter_count
from ed_classification_output
group by ed_classification_description
order by encounter_count desc
"""
con.execute(category_distribution_sql).df()
