In [3]:
from cassandra.cluster import Cluster
from ssl import SSLContext, PROTOCOL_TLSv1_2, CERT_REQUIRED
from cassandra_sigv4.auth import SigV4AuthProvider
import boto3

# ssl setup
ssl_context = SSLContext(PROTOCOL_TLSv1_2)
ssl_context.load_verify_locations('/home/ubuntu/sf-class2-root.crt')  # change your file path for locating the certificate
ssl_context.verify_mode = CERT_REQUIRED

# boto3 session setup
boto_session = boto3.Session(region_name="us-east-2")  # this AWS credentials is specific to `us-east-2` region

  ssl_context = SSLContext(PROTOCOL_TLSv1_2)


In [4]:
# authorization setup with SigV4
auth_provider = SigV4AuthProvider(boto_session)

In [5]:
#cluster setup 
cluster = Cluster(['cassandra.us-east-2.amazonaws.com'], 
                  ssl_context=ssl_context, 
                  auth_provider=auth_provider, 
                  port=9142)  # TLS only communicates on port 9142

In [6]:
# establishing connection to Keyspace
session = cluster.connect()
# Insert any CQL queries between .connect() and .shutdown()

In [9]:
# For example, show all keyspaces created
r = session.execute('''
    SELECT * FROM system_schema.keyspaces;
    ''')
print(r.current_rows)

[Row(keyspace_name='system_schema', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '3')])), Row(keyspace_name='system_schema_mcs', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '3')])), Row(keyspace_name='system', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '3')])), Row(keyspace_name='system_multiregion_info', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '3')])), Row(keyspace_name='aet7207_hw2', durable_writes=True, replication=OrderedMapSerializedKey([('class', 'org.apache.cassandra.locator.SimpleStrategy'), ('replication_factor', '3')])), Row(keyspace_name='atb2199_ks', durable_writes=True, replication=OrderedMapSeri

In [8]:
# For example, create a keyspace for HW2
r = session.execute('''
    CREATE KEYSPACE IF NOT EXISTS csn4634_hw2 
    WITH replication = {'class': 'SingleRegionStrategy'};
    ''')
print(r.current_rows)

[]


In [10]:
from cassandra.cluster import ExecutionProfile, EXEC_PROFILE_DEFAULT
from cassandra import ConsistencyLevel


# Define execution profile with LOCAL_QUORUM
execution_profile = ExecutionProfile(
    consistency_level=ConsistencyLevel.LOCAL_QUORUM
)

# Cluster setup with correct profile
cluster = Cluster(
    ['cassandra.us-east-2.amazonaws.com'],
    ssl_context=ssl_context,
    auth_provider=auth_provider,
    port=9142,
    execution_profiles={EXEC_PROFILE_DEFAULT: execution_profile}
)

# establishing connection to Keyspace
session = cluster.connect()
session.set_keyspace('csn4634_hw2')  

In [52]:
# creating table
session.execute("""
CREATE TABLE IF NOT EXISTS q1 (
    ethnicity TEXT PRIMARY KEY,
    drug TEXT,
    count INT,
);
""")

<cassandra.cluster.ResultSet at 0x77e8d61ec860>

In [28]:
# time to fill table
import pandas as pd
pres = pd.read_csv("Data/PRESCRIPTIONS.csv")
ads = pd.read_csv("Data/ADMISSIONS.csv")

combo = pd.merge(pres, ads[['subject_id', 'hadm_id', 'ethnicity']], on=['subject_id', 'hadm_id'], how = 'inner')
counts = combo.groupby(['ethnicity', 'drug']).size().reset_index(name = 'count')
top_counts = counts.sort_values(['ethnicity','count'], ascending = [True,False]).groupby('ethnicity').first().reset_index()

#insert row by row
for _, row in top_counts.iterrows():
    session.execute(
        """
        INSERT INTO q1 (ethnicity, drug, count)
        VALUES (%s, %s, %s)
        """,
        (row['ethnicity'], row['drug'], int(row['count']))
    )


In [26]:
# now query
ethnicities =session.execute(""" SELECT * FROM q1 """)
for ethnicity in ethnicities:
    print("The most common drug for those of ethnicity {" + ethnicity.ethnicity + "} is {" + ethnicity.drug + "}")

The most common drug for those of ethnicity {OTHER} is {NS}
The most common drug for those of ethnicity {BLACK/AFRICAN AMERICAN} is {Insulin}
The most common drug for those of ethnicity {WHITE} is {Potassium Chloride}
The most common drug for those of ethnicity {ASIAN} is {D5W}
The most common drug for those of ethnicity {HISPANIC/LATINO - PUERTO RICAN} is {0.9% Sodium Chloride}
The most common drug for those of ethnicity {UNKNOWN/NOT SPECIFIED} is {D5W}
The most common drug for those of ethnicity {UNABLE TO OBTAIN} is {0.9% Sodium Chloride}
The most common drug for those of ethnicity {AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE} is {5% Dextrose}
The most common drug for those of ethnicity {HISPANIC OR LATINO} is {5% Dextrose}


In [157]:
# create table
session.set_keyspace('csn4634_hw2')
session.execute("""
CREATE TABLE IF NOT EXISTS q2 (
    age_range TEXT,
    icd9_code TEXT,
    count INT,
    PRIMARY KEY (age_range, icd9_code)
);
""")

<cassandra.cluster.ResultSet at 0x77e8c9dcaa50>

In [158]:
# add age to admissions
import pandas as pd

admissions = pd.read_csv('Data/ADMISSIONS.csv')
patients = pd.read_csv('Data/PATIENTS.csv')
d_icd_procedures = pd.read_csv('Data/D_ICD_PROCEDURES.csv')
procedures_icd = pd.read_csv('Data/PROCEDURES_ICD.csv')

admissions = admissions.merge(patients[['subject_id', 'dob']], on='subject_id', how='left')
def getYear(x):
    x = pd.to_datetime(x)
    return x.year
admissions['age'] = admissions['admittime'].apply(getYear) - admissions['dob'].apply(getYear)



In [187]:
def categorize_age(age):
    if age <= 19:
        return '<=19'
    elif 20 <= age <= 49:
        return '20-49'
    elif 50 <= age <= 79:
        return '50-79'
    else:
        return '>=80'

admissions['age_range'] = admissions['age'].apply(categorize_age)

# Step 3: Merge with procedures_icd to get procedure details
admissions_procedures = admissions.merge(procedures_icd[['subject_id', 'icd9_code', 'hadm_id']], on=['subject_id', 'hadm_id'], how='left')

# Count the occurrences of each procedure by age group
age_group_procedure_count = admissions_procedures.groupby(['age_range', 'icd9_code']).size().reset_index(name='count')

# Step 4: Get the top 3 procedures for each age group
top_procedures = age_group_procedure_count.groupby('age_range').apply(lambda x: x.nlargest(3, 'count')).reset_index(drop=True)
top_procedures['icd9_code'] = top_procedures['icd9_code'].apply(lambda x: str(int(x)) if isinstance(x, float) else str(x))

# Step 6: Insert the data into Cassandra table
insert_query = """
    INSERT INTO q2 (age_range, icd9_code, count)
    VALUES (%s, %s, %s)
"""
session.set_keyspace('csn4634_hw2')

for index, row in top_procedures.iterrows():
    session.execute(insert_query, (row['age_range'], row['icd9_code'], row['count']))


  top_procedures = age_group_procedure_count.groupby('age_range').apply(lambda x: x.nlargest(3, 'count')).reset_index(drop=True)


In [188]:
# Load the d_icd_procedures.csv to get procedure names
d_icd_procedures = pd.read_csv('Data/D_ICD_PROCEDURES.csv')

# Make sure codes are strings and clean
d_icd_procedures['icd9_code'] = d_icd_procedures['icd9_code'].astype(str).str.strip()

# Build a dictionary for fast lookup
procedure_dict = dict(zip(d_icd_procedures['icd9_code'], d_icd_procedures['short_title']))


age_ranges = ['<=19', '20-49', '50-79', '>=80']

for age_range in age_ranges:
    result = session.execute("""
        SELECT icd9_code, count
        FROM q2
        WHERE age_range = %s
    """, (age_range,))

    
    # Sort the results by 'count' to get the top 3 procedures
    sorted_result = sorted(result, key=lambda row: row.count, reverse=True)[:3]
    
    print(f"Top 3 Procedures for Age Range {age_range}:")
    for row in sorted_result:
        icd9_code = row.icd9_code
        procedure_name = d_icd_procedures[d_icd_procedures['icd9_code'] == icd9_code]['short_title'].iloc[0]
        print(f"ICD9 Code: {icd9_code}, Procedure Name: {procedure_name}, Count: {row.count}")
    print()


Top 3 Procedures for Age Range <=19:
ICD9 Code: 3893, Procedure Name: Venous cath NEC, Count: 2
ICD9 Code: 311, Procedure Name: Temporary tracheostomy, Count: 1
ICD9 Code: 331, Procedure Name: Spinal tap, Count: 1

Top 3 Procedures for Age Range 20-49:
ICD9 Code: 3893, Procedure Name: Venous cath NEC, Count: 9
ICD9 Code: 9604, Procedure Name: Insert endotracheal tube, Count: 9
ICD9 Code: 966, Procedure Name: Entral infus nutrit sub, Count: 7

Top 3 Procedures for Age Range 50-79:
ICD9 Code: 9604, Procedure Name: Insert endotracheal tube, Count: 51
ICD9 Code: 3893, Procedure Name: Venous cath NEC, Count: 25
ICD9 Code: 966, Procedure Name: Entral infus nutrit sub, Count: 22

Top 3 Procedures for Age Range >=80:
ICD9 Code: 3893, Procedure Name: Venous cath NEC, Count: 20
ICD9 Code: 9904, Procedure Name: Packed cell transfusion, Count: 13
ICD9 Code: 9604, Procedure Name: Insert endotracheal tube, Count: 8



In [179]:
d_icd_procedures.describe()

Unnamed: 0,row_id,icd9_code
count,3882.0,3882.0
mean,1941.5,5466.241628
std,1120.781201,3183.187486
min,1.0,1.0
25%,971.25,2837.0
50%,1941.5,5672.5
75%,2911.75,8382.75
max,3882.0,9999.0


In [221]:
#make table
session.execute("DROP TABLE IF EXISTS q3")

session.execute("""
CREATE TABLE IF NOT EXISTS q3(
    demographic TEXT,
    time_in_icu DOUBLE,
    PRIMARY KEY (demographic)
);
""")

<cassandra.cluster.ResultSet at 0x77e8c9092420>

In [227]:
# fill table
icustays = pd.read_csv('Data/ICUSTAYS.csv')
patients = pd.read_csv('Data/PATIENTS.csv')

merged = icustays.merge(patients[['subject_id', 'gender']], on='subject_id', how='left')
merged['intime'] = pd.to_datetime(merged['intime'])
merged['outtime'] = pd.to_datetime(merged['outtime'])

merged['time_in_icu'] = (merged['outtime'] - merged['intime']).dt.days

insert = """
    INSERT INTO q3 (demographic, time_in_icu)
    VALUES (%s, %s)
"""
per_gender = merged.groupby('gender')['time_in_icu'].mean()

session.execute(insert, ('M', per_gender[0]))
session.execute(insert, ('F', per_gender[1]))

admissions = pd.read_csv('Data/ADMISSIONS.csv')

merged = icustays.merge(admissions[['subject_id', 'ethnicity']], on='subject_id', how='left')
merged['intime'] = pd.to_datetime(merged['intime'])
merged['outtime'] = pd.to_datetime(merged['outtime'])

merged['time_in_icu'] = (merged['outtime'] - merged['intime']).dt.days

per_ethnicity = merged.groupby('ethnicity')['time_in_icu'].mean()
i = 0
for ethnic in merged['ethnicity'].unique():
    session.execute(insert, (ethnic, per_ethnicity[i]))
    i += 1

  session.execute(insert, ('M', per_gender[0]))
  session.execute(insert, ('F', per_gender[1]))
  session.execute(insert, (ethnic, per_ethnicity[i]))


In [238]:
#query
demographics = set(merged['ethnicity'].unique()) | ( set(patients['gender'].unique()))
for demo in demographics:
    query = "SELECT time_in_icu FROM q3 WHERE demographic = %s"
    result = session.execute(query, (demo,))
    for row in result:
        print(f"Mean ICU time for {demo} is {row.time_in_icu}")

Mean ICU time for ASIAN is 2.6
Mean ICU time for HISPANIC/LATINO - PUERTO RICAN is 13.0
Mean ICU time for UNKNOWN/NOT SPECIFIED is 3.5
Mean ICU time for WHITE is 6.333333333333333
Mean ICU time for HISPANIC OR LATINO is 0.3333333333333333
Mean ICU time for BLACK/AFRICAN AMERICAN is 11.0
Mean ICU time for AMERICAN INDIAN/ALASKA NATIVE FEDERALLY RECOGNIZED TRIBE is 3.6147540983606556
Mean ICU time for F is 3.0136986301369864
Mean ICU time for M is 5.015873015873016
Mean ICU time for UNABLE TO OBTAIN is 3.923076923076923
Mean ICU time for OTHER is 7.0
