Load all the data into a duckdb database. Refer to MIMIC-III example for guidance to create your own database.

In [1]:
import duckdb

# establish connection
conn = duckdb.connect('mimic.db', read_only=False)

conn.sql('SHOW TABLES;')


┌─────────┐
│  name   │
│ varchar │
├─────────┤
│ 0 rows  │
└─────────┘

In [3]:
import pandas as pd

df = pd.read_csv('Data/ADMISSIONS.csv')
df.to_sql("ADMISSIONS", conn, index=False)

df = pd.read_csv('Data/D_ICD_PROCEDURES.csv')
df.to_sql("D_ICD_PROCEDURES", conn, index=False)

df = pd.read_csv('Data/DRGCODES.csv')
df.to_sql("DRGCODES", conn, index=False)

df = pd.read_csv('Data/ICUSTAYS.csv')
df.to_sql("ICUSTAYS", conn, index=False)

df = pd.read_csv('Data/PATIENTS.csv')
df.to_sql("PATIENTS", conn, index=False)

df = pd.read_csv('Data/PRESCRIPTIONS.csv')
df.to_sql("PRESCRIPTIONS", conn, index=False)

df = pd.read_csv('Data/PROCEDURES_ICD.csv')
df.to_sql("PROCEDURES_ICD", conn, index=False)
conn.sql('SHOW TABLES')

  df.to_sql("ADMISSIONS", conn, index=False)
  df.to_sql("D_ICD_PROCEDURES", conn, index=False)
  df.to_sql("DRGCODES", conn, index=False)
  df.to_sql("ICUSTAYS", conn, index=False)
  df.to_sql("PATIENTS", conn, index=False)
  df.to_sql("PRESCRIPTIONS", conn, index=False)
  df.to_sql("PROCEDURES_ICD", conn, index=False)


┌──────────────────┐
│       name       │
│     varchar      │
├──────────────────┤
│ ADMISSIONS       │
│ DRGCODES         │
│ D_ICD_PROCEDURES │
│ ICUSTAYS         │
│ PATIENTS         │
│ PRESCRIPTIONS    │
│ PROCEDURES_ICD   │
└──────────────────┘

1) Create a summary of type of drugs and their total amount used by ethnicity. Report the top usage in each ethnicity group. You may have to make certain assumptions in calculating their total amount.

In [None]:
# prescriptions.drug_type & addmissions.ethnicity where subject_id and hadm_id are equal
# types are MAIN/BASE/ADDITIVE
query = conn.sql(
    """
    SELECT ADMISSIONS.ethnicity, 
           PRESCRIPTIONS.drug_type,
           count(*)
    FROM PRESCRIPTIONS
    JOIN ADMISSIONS
        ON PRESCRIPTIONS.subject_id = ADMISSIONS.subject_id
        AND PRESCRIPTIONS.hadm_id = ADMISSIONS.hadm_id
    GROUP BY ADMISSIONS.ethnicity, PRESCRIPTIONS.drug_type
    """
)
query


┌──────────────────────────────────────────────────────────┬───────────┬──────────────┐
│                        ethnicity                         │ drug_type │ count_star() │
│                         varchar                          │  varchar  │    int64     │
├──────────────────────────────────────────────────────────┼───────────┼──────────────┤
│ ASIAN                                                    │ MAIN      │          121 │
│ ASIAN                                                    │ BASE      │           56 │
│ AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE │ ADDITIVE  │            2 │
│ UNABLE TO OBTAIN                                         │ ADDITIVE  │            4 │
│ AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE │ MAIN      │          200 │
│ AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE │ BASE      │           80 │
│ UNABLE TO OBTAIN                                         │ MAIN      │           89 │
│ UNABLE TO OBTAIN              

In [36]:
df = query.df()

# AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGN... ==== MAIN
# ASIAN ==== MAIN
# BLACK?AFRICAN AMERICAN ==== MAIN
# HISPANIC OR LATINO ==== MAIN
# HISPANIC/LATINO - PUERTO RICAN ==== MAIN
# OTHER ==== MAIN
# UNABLE TO OBTAIN ==== MAIN
# UNKNOWN/NOT SPECIFIED ==== MAIN
# WHITE ==== MAIN

df.sort_values(by=["ethnicity", "count_star()"], ascending=[True, False])

Unnamed: 0,ethnicity,drug_type,count_star()
4,AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGN...,MAIN,200
5,AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGN...,BASE,80
2,AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGN...,ADDITIVE,2
0,ASIAN,MAIN,121
1,ASIAN,BASE,56
8,BLACK/AFRICAN AMERICAN,MAIN,476
9,BLACK/AFRICAN AMERICAN,BASE,169
13,HISPANIC OR LATINO,MAIN,226
14,HISPANIC OR LATINO,BASE,96
19,HISPANIC/LATINO - PUERTO RICAN,MAIN,860


2) Create a summary of procedures performed on patients by age groups (<=19, 20-49, 50-79, >80). Report the top three procedures, along with the name of the procedures, performed in each age group.

In [None]:
# start by adding age column to addmissions
conn.sql("""
        ALTER TABLE ADMISSIONS DROP COLUMN age;
        ALTER TABLE ADMISSIONS ADD COLUMN age INT;
        """)

conn.sql("""
        UPDATE ADMISSIONS
        SET age = LEAST((CAST(ADMISSIONS.admittime as date) - CAST(PATIENTS.dob as date))/365.25, 89)
        FROM PATIENTS
        WHERE ADMISSIONS.subject_id = PATIENTS.subject_id
        """)

conn.sql("""
        SELECT age
        FROM admissions
        """)


In [60]:
# <= 19 query
df = conn.sql(
    """
    SELECT D_ICD_PROCEDURES.short_title,
           count(*)
    FROM D_ICD_PROCEDURES
    JOIN PROCEDURES_ICD
        ON PROCEDURES_ICD.icd9_code = D_ICD_PROCEDURES.icd9_code
        JOIN ADMISSIONS
            ON PROCEDURES_ICD.subject_id = ADMISSIONS.subject_id
            AND PROCEDURES_ICD.hadm_id = ADMISSIONS.hadm_id
            AND ADMISSIONS.age <= 19
    GROUP BY D_ICD_PROCEDURES.short_title
    """
).df()
df.sort_values('count_star()', ascending=False).head(3)

Unnamed: 0,short_title,count_star()
4,Venous cath NEC,2
1,Cl fx reduc-femur,1
0,Incision of lung,1


In [62]:
# 20-49 query
df = conn.sql(
    """
    SELECT D_ICD_PROCEDURES.short_title,
           count(*)
    FROM D_ICD_PROCEDURES
    JOIN PROCEDURES_ICD
        ON PROCEDURES_ICD.icd9_code = D_ICD_PROCEDURES.icd9_code
        JOIN ADMISSIONS
            ON PROCEDURES_ICD.subject_id = ADMISSIONS.subject_id
            AND PROCEDURES_ICD.hadm_id = ADMISSIONS.hadm_id
            AND ADMISSIONS.age >= 20
            AND ADMISSIONS.age <= 49
    GROUP BY D_ICD_PROCEDURES.short_title
    """
).df()
df.sort_values('count_star()', ascending=False).head(3)

Unnamed: 0,short_title,count_star()
16,Venous cath NEC,9
23,Entral infus nutrit sub,7
4,Percu abdominal drainage,6


In [63]:
# 50-79 query
df = conn.sql(
    """
    SELECT D_ICD_PROCEDURES.short_title,
           count(*)
    FROM D_ICD_PROCEDURES
    JOIN PROCEDURES_ICD
        ON PROCEDURES_ICD.icd9_code = D_ICD_PROCEDURES.icd9_code
        JOIN ADMISSIONS
            ON PROCEDURES_ICD.subject_id = ADMISSIONS.subject_id
            AND PROCEDURES_ICD.hadm_id = ADMISSIONS.hadm_id
            AND ADMISSIONS.age >= 50
            AND ADMISSIONS.age <= 79
    GROUP BY D_ICD_PROCEDURES.short_title
    """
).df()
df.sort_values('count_star()', ascending=False).head(3)

Unnamed: 0,short_title,count_star()
27,Venous cath NEC,25
46,Entral infus nutrit sub,22
18,Packed cell transfusion,13


In [64]:
# >= 80 query
df = conn.sql(
    """
    SELECT D_ICD_PROCEDURES.short_title,
           count(*)
    FROM D_ICD_PROCEDURES
    JOIN PROCEDURES_ICD
        ON PROCEDURES_ICD.icd9_code = D_ICD_PROCEDURES.icd9_code
        JOIN ADMISSIONS
            ON PROCEDURES_ICD.subject_id = ADMISSIONS.subject_id
            AND PROCEDURES_ICD.hadm_id = ADMISSIONS.hadm_id
            AND ADMISSIONS.age >= 80
    GROUP BY D_ICD_PROCEDURES.short_title
    """
).df()
df.sort_values('count_star()', ascending=False).head(3)

Unnamed: 0,short_title,count_star()
20,Venous cath NEC,20
10,Packed cell transfusion,13
49,Insert endotracheal tube,8


3) How long do patients stay in the ICU? Is there a difference in the ICU length of stay among gender or ethnicity?

In [None]:
# create column to take days in ICU
# start by adding days column to addmissions
conn.sql("""
        ALTER TABLE ICUSTAYS DROP COLUMN days;
        ALTER TABLE ICUSTAYS ADD COLUMN days INT;
        """)

conn.sql("""
        UPDATE ICUSTAYS
        SET days = CAST(ICUSTAYS.outtime as date) - CAST(ICUSTAYS.intime as date)
        """)

conn.sql("""
        SELECT days
        FROM ICUSTAYS
        """)

In [79]:
# now lets take the days in icu and group by gender
conn.sql("""
        SELECT PATIENTS.gender,
              AVG(ICUSTAYS.days) AS average_stay_in_days
         FROM ICUSTAYS
         JOIN PATIENTS
              ON ICUSTAYS.subject_id = PATIENTS.subject_id
         GROUP BY PATIENTS.gender
         """)

┌─────────┬──────────────────────┐
│ gender  │ average_stay_in_days │
│ varchar │        double        │
├─────────┼──────────────────────┤
│ F       │    5.476190476190476 │
│ M       │   3.5205479452054793 │
└─────────┴──────────────────────┘

In [81]:
# now lets take the days in icu and group by ethnicity
conn.sql("""
        SELECT ADMISSIONS.ethnicity,
              AVG(ICUSTAYS.days) AS average_stay_in_days
         FROM ICUSTAYS
         JOIN ADMISSIONS
              ON ICUSTAYS.subject_id = ADMISSIONS.subject_id
         GROUP BY ADMISSIONS.ethnicity
         """)

┌──────────────────────────────────────────────────────────┬──────────────────────┐
│                        ethnicity                         │ average_stay_in_days │
│                         varchar                          │        double        │
├──────────────────────────────────────────────────────────┼──────────────────────┤
│ UNKNOWN/NOT SPECIFIED                                    │    4.461538461538462 │
│ ASIAN                                                    │                  4.0 │
│ BLACK/AFRICAN AMERICAN                                   │    6.888888888888889 │
│ OTHER                                                    │                  1.0 │
│ HISPANIC OR LATINO                                       │    7.333333333333333 │
│ UNABLE TO OBTAIN                                         │                 14.0 │
│ AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE │                 11.5 │
│ WHITE                                                    │    4.0245901639

In [82]:
conn.close()