# Oncology Population Analysis

Querying the dbt oncology models from DuckDB.

In [21]:
import duckdb

# Connect to the dbt DuckDB database (one level up from analyses folder)
con = duckdb.connect('../local.duckdb', read_only=True)

# List available tables
con.sql("SHOW TABLES").show()

┌───────────────────────────────┐
│             name              │
│            varchar            │
├───────────────────────────────┤
│ alerts_anomaly_detection      │
│ alerts_dbt_models             │
│ alerts_dbt_source_freshness   │
│ alerts_dbt_tests              │
│ alerts_schema_changes         │
│ anomaly_threshold_sensitivity │
│ data_monitoring_metrics       │
│ dbt_artifacts_hashes          │
│ dbt_columns                   │
│ dbt_exposures                 │
│    ·                          │
│    ·                          │
│    ·                          │
│ metadata                      │
│ metrics_anomaly_score         │
│ model_run_results             │
│ monitors_runs                 │
│ schema_columns_snapshot       │
│ seed_run_results              │
│ snapshot_run_results          │
│ stg_oncology_claims           │
│ stg_oncology_conditions       │
│ test_result_rows              │
├───────────────────────────────┤
│      35 rows (20 shown)       │
└─────────────

## Cohort Overview

In [22]:
# Oncology cohort summary
cohort_df = con.sql("""
    SELECT 
        primary_cancer_type,
        COUNT(*) as patient_count,
        AVG(cancer_claim_count) as avg_claims_per_patient
    FROM main.int_oncology_cohort
    GROUP BY 1
    ORDER BY patient_count DESC
""").df()

cohort_df

Unnamed: 0,primary_cancer_type,patient_count,avg_claims_per_patient
0,Benign Neoplasms,225,7.146667
1,,123,8.634146
2,Male Genital Organs,33,17.0
3,Uncertain Behavior,24,9.791667
4,Hematologic,12,30.0
5,Breast,12,21.25
6,Urinary Tract,11,18.727273
7,Thyroid & Endocrine Glands,7,18.428571
8,Digestive Organs,6,5.333333
9,In Situ Neoplasms,5,31.6


## Cost by Care Setting

In [23]:
# Cost breakdown by care setting
cost_by_setting = con.sql("""
    SELECT 
        care_setting,
        SUM(total_paid_amount) as total_paid,
        SUM(patient_count) as patients,
        ROUND(SUM(total_paid_amount) / SUM(patient_count), 2) as paid_per_patient
    FROM main.fact_oncology_cost_analytics
    GROUP BY 1
    ORDER BY total_paid DESC
""").df()

cost_by_setting

Unnamed: 0,care_setting,total_paid,patients,paid_per_patient
0,inpatient,2899960.0,240.0,12083.17
1,outpatient,2467336.0,468.0,5272.09
2,office-based,2130688.0,469.0,4543.04
3,ancillary,1218657.0,471.0,2587.38
4,other,8265.6,38.0,217.52


## Cost by Cancer Type

In [24]:
# Cost breakdown by cancer type
cost_by_cancer = con.sql("""
    SELECT 
        primary_cancer_type,
        SUM(total_paid_amount) as total_paid,
        SUM(patient_count) as patients,
        ROUND(SUM(total_paid_amount) / SUM(patient_count), 2) as paid_per_patient
    FROM main.fact_oncology_cost_analytics
    GROUP BY 1
    ORDER BY total_paid DESC
""").df()

cost_by_cancer

Unnamed: 0,primary_cancer_type,total_paid,patients,paid_per_patient
0,Benign Neoplasms,4266860.0,799.0,5340.25
1,,2116283.0,439.0,4820.69
2,Male Genital Organs,602582.1,117.0,5150.27
3,Uncertain Behavior,461991.0,88.0,5249.9
4,Hematologic,299816.0,44.0,6814.0
5,Breast,226487.8,45.0,5033.06
6,Urinary Tract,202932.4,40.0,5073.31
7,Respiratory & Intrathoracic,118926.4,15.0,7928.42
8,In Situ Neoplasms,108993.3,18.0,6055.18
9,Digestive Organs,87661.84,20.0,4383.09


## Spend Bucket Distribution

In [25]:
# Spend bucket analysis
spend_buckets = con.sql("""
    SELECT 
        spend_bucket,
        SUM(total_paid_amount) as total_paid,
        SUM(patient_count) as patients,
        ROUND(SUM(total_paid_amount) * 100.0 / SUM(SUM(total_paid_amount)) OVER (), 2) as pct_of_spend
    FROM main.fact_oncology_cost_analytics
    GROUP BY 1
    ORDER BY total_paid DESC
""").df()

spend_buckets

Unnamed: 0,spend_bucket,total_paid,patients,pct_of_spend
0,Medium Cost ($25k-$100k),4379766.0,445.0,50.2
1,Low Cost ($5k-$25k),3575848.0,983.0,40.98
2,High Cost (>$100k),533017.1,17.0,6.11
3,Minimal Cost (<$5k),236276.0,241.0,2.71


## Fact Table

In [26]:
# Query the full fact table
fact_cost_df = con.sql("""
    SELECT *
    FROM main.fact_oncology_cost_analytics
    ORDER BY total_paid_amount DESC
    LIMIT 50
""").df()

fact_cost_df

Unnamed: 0,care_setting,primary_cancer_type,spend_bucket,spend_quartile,patient_count,total_paid_amount,total_allowed_amount,total_cost_amount,total_claims,avg_patient_total_spend,pct_of_total_paid,paid_per_patient,claims_per_patient
0,inpatient,Benign Neoplasms,Medium Cost ($25k-$100k),4,46,845835.59714,169532.880412,0.0,1468.0,40502.346645,9.69,18387.73,31.91
1,outpatient,Benign Neoplasms,Medium Cost ($25k-$100k),4,48,520880.512,196087.77864,0.0,2496.0,40610.441227,5.97,10851.68,52.0
2,inpatient,,Medium Cost ($25k-$100k),4,20,345974.952517,68029.2302,0.0,563.0,42630.074171,3.97,17298.75,28.15
3,office-based,Benign Neoplasms,Medium Cost ($25k-$100k),4,48,337030.240882,407972.400432,0.0,3131.0,40610.441227,3.86,7021.46,65.23
4,outpatient,,Medium Cost ($25k-$100k),4,22,317999.401122,100489.139841,0.0,833.0,41684.838809,3.64,14454.52,37.86
5,office-based,Benign Neoplasms,Low Cost ($5k-$25k),2,66,297791.740028,394416.690816,0.0,3287.0,11029.821966,3.41,4512.0,49.8
6,inpatient,Benign Neoplasms,High Cost (>$100k),4,3,265102.93175,11873.060028,0.0,98.0,138919.373362,3.04,88367.64,32.67
7,outpatient,Benign Neoplasms,Low Cost ($5k-$25k),2,66,246050.440053,175447.947942,0.0,1784.0,11029.821966,2.82,3728.04,27.03
8,outpatient,Benign Neoplasms,Low Cost ($5k-$25k),3,40,243885.59924,158150.740809,0.0,1643.0,18943.556487,2.8,6097.14,41.08
9,ancillary,Benign Neoplasms,Medium Cost ($25k-$100k),4,48,242330.778853,196630.490389,0.0,2893.0,40610.441227,2.78,5048.56,60.27


In [29]:
# Query the full fact table
fact_patient_df = con.sql("""
    SELECT *
    FROM main.fact_oncology_patient_detail
    LIMIT 50
""").df()

fact_patient_df

Unnamed: 0,person_id,cohort_name,primary_cancer_type,all_cancer_codes,all_cancer_descriptions,cancer_claim_count,most_recent_diagnosis_date,total_paid,total_allowed,total_cost,total_claims,spend_bucket,spend_quartile,care_settings_used,inpatient_paid,outpatient_paid,emergency_paid,office_paid,ancillary_paid
0,10124,Active Oncology,Benign Neoplasms,"[C7A1, D126, D271]",[Malignant poorly differentiated neuroendocrin...,3,2017-08-04,166505.745377,6021.770032,0.0,92.0,High Cost (>$100k),4,"[outpatient, ancillary, inpatient, office-based]",159294.595344,5891.570032,0.0,497.680001,821.9
1,12284,Active Oncology,Benign Neoplasms,"[C44612, C61, D0339, D125, D171, D2239, D2261,...",[Basal cell carcinoma of skin of right upper l...,11,2017-12-04,125344.078149,14109.900048,0.0,121.0,High Cost (>$100k),4,"[ancillary, inpatient, outpatient, office-based]",56.72,104599.938008,0.0,4553.39998,16134.020161
2,13289,Active Oncology,Benign Neoplasms,"[C3492, D235]",[Malignant neoplasm of unspecified part of lef...,2,2018-03-14,124908.296561,25376.599994,0.0,316.0,High Cost (>$100k),4,"[ancillary, other, inpatient, outpatient, offi...",105751.616406,7037.020144,0.0,6691.85,5136.560011
3,11030,Active Oncology,,"[C801, D481]","[Malignant (primary) neoplasm, unspecified, Ne...",2,2017-11-23,116259.028908,18259.799973,0.0,202.0,High Cost (>$100k),4,"[outpatient, ancillary, inpatient, office-based]",110563.478892,1344.81,0.0,1016.96,3333.780016
4,11524,Active Oncology,Respiratory & Intrathoracic,"[C039, C153, C159, C3411, C3412, C3431, C3432,...","[Malignant neoplasm of gum, unspecified, Malig...",61,2018-07-16,96994.333126,24024.989993,0.0,194.0,Medium Cost ($25k-$100k),4,"[office-based, outpatient, inpatient, ancillary]",68322.322472,15595.090672,0.0,8593.509981,4483.410001
5,11540,Active Oncology,Benign Neoplasms,"[C61, D171]","[Malignant neoplasm of prostate, Benign lipoma...",2,2018-01-13,88478.769242,37622.549775,0.0,277.0,Medium Cost ($25k-$100k),4,"[office-based, inpatient, ancillary, other, ou...",33605.249728,35915.909784,0.0,8915.909983,10041.699747
6,11047,Active Oncology,Benign Neoplasms,"[D131, D1779]","[Benign neoplasm of stomach, Benign lipomatous...",3,2016-04-30,79383.76889,21126.550076,0.0,270.0,Medium Cost ($25k-$100k),4,"[office-based, outpatient, inpatient, ancillar...",53577.14888,12256.260076,0.0,5761.890016,6370.31995
7,12772,Active Oncology,Benign Neoplasms,"[C4330, C44719, C4492, D1801, D2262, D2362, D2...",[Malignant melanoma of unspecified part of fac...,11,2018-01-04,77391.59744,44432.989572,0.0,264.0,Medium Cost ($25k-$100k),4,"[ancillary, inpatient, outpatient, office-based]",55072.627504,7183.369924,0.0,12085.050028,3050.549984
8,12349,Active Oncology,,"[C44311, C4441, C4442, C44612, D045, D1801, D2...","[Basal cell carcinoma of skin of nose, Basal c...",19,2018-08-25,73754.810304,19155.380015,0.0,104.0,Medium Cost ($25k-$100k),4,"[office-based, inpatient, outpatient, ancillary]",48520.360608,10077.159664,0.0,13639.699984,1517.590048
9,10202,Active Oncology,,"[C44329, D2239]",[Squamous cell carcinoma of skin of other part...,2,2016-04-14,71897.941524,12054.099978,0.0,153.0,Medium Cost ($25k-$100k),4,"[office-based, inpatient, ancillary, outpatient]",7.04,66904.331512,0.0,2327.720008,2658.850004


In [19]:
# Close connection
con.close()