In [44]:
# Initial imports
import numpy as np
import pandas as pd
import os
import sqlite3
import gzip
from sklearn.preprocessing import MinMaxScaler

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

## Transform data into SQL database

In [45]:
'''
# Define the paths to form the QSL database
base_path = "./" 
folders = ['hosp', 'icu', 'note']
db_path = "mimic.db"

# Connect to SQLite database
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

# Iterate through each folder and process .csv.gz files
for folder in folders: 
    folder_path = os.path.join(base_path, folder)
    
    for file in os.listdir(folder_path):
        if file.endswith(".csv.gz"):
            file_path = os.path.join(folder_path, file)
            table_name = file.replace(".csv.gz", "")

            chunk_size = 1000000
            for chunk in pd.read_csv(file_path, compression='gzip', chunksize= chunk_size):
                chunk.to_sql(table_name, conn, if_exists='append', index=False)
            
            print(f'Finished processing {file} into table {table_name}')

# Close the connection
conn.close()
'''

'\n# Define the paths to form the QSL database\nbase_path = "./" \nfolders = [\'hosp\', \'icu\', \'note\']\ndb_path = "mimic.db"\n\n# Connect to SQLite database\nconn = sqlite3.connect(db_path)\ncursor = conn.cursor()\n\n# Iterate through each folder and process .csv.gz files\nfor folder in folders: \n    folder_path = os.path.join(base_path, folder)\n    \n    for file in os.listdir(folder_path):\n        if file.endswith(".csv.gz"):\n            file_path = os.path.join(folder_path, file)\n            table_name = file.replace(".csv.gz", "")\n\n            chunk_size = 1000000\n            for chunk in pd.read_csv(file_path, compression=\'gzip\', chunksize= chunk_size):\n                chunk.to_sql(table_name, conn, if_exists=\'append\', index=False)\n            \n            print(f\'Finished processing {file} into table {table_name}\')\n\n# Close the connection\nconn.close()\n'

## Dataset Paper

In [46]:
# Database path
db_path= '../database/mimic.db'

#### Dataset Creation

5 tables:
-   Admissions
-   Patients
-   Diagnoses: diagnoses_icd + d_icd_diagnoses
-   prescriptions

STEPS:
1. extract the data: extract data from 5 tables, select patients admitted twice, filter medications and diagnoses
2. data preprocessing: merge tables (diagnoses tables, then on admission table, then prescriptions with admission), feature extraction
3. feature engineering

##### Data Extraction

**ADMISSIONS**

In [47]:
# Connect to the SQLite database to query the files
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

In [48]:
# Patients
patients = pd.read_sql_query("""SELECT subject_id, gender, anchor_age FROM patients""", conn)

# LAdmissions - filtered by patients' number of admissions
admissions = pd.read_sql_query("""SELECT subject_id, hadm_id, admittime, dischtime FROM admissions WHERE subject_id IN (SELECT subject_id FROM admissions GROUP BY subject_id HAVING COUNT(hadm_id) = 2) ORDER BY subject_id""", conn)


In [49]:
# Merge admissions with patients
admissions_patients = admissions.merge(patients, on= 'subject_id', how='left')
display(admissions_patients.head())
print(admissions_patients['subject_id'].nunique())

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72
1,10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,M,72
2,10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,F,48
3,10000117,27988844,2183-09-18 18:10:00,2183-09-21 16:30:00,F,48
4,10000883,25221576,2124-05-14 21:11:00,2124-05-22 10:40:00,M,20


35712


**DIAGNOSES**

d_icd_diagnoses table + icd_diagnoses table

Steps: merge the two tables based on ICD code and ICD version, allowing to match the diagnoses information from the two diagnoses_icd with the corresponding detailed diagnoses from d_icd diagnoses. Identify and filter the table according to the most frequent diagnoses
 
-   icd_code refers to a specific diagnoses
-   icd_version specifies which ICD revision the code belongs to



In [50]:
# Load the d_icd_diagnoses into a database
query_d_icd_diagnoses = """SELECT * FROM d_icd_diagnoses;"""
d_icd_diagnoses_df = pd.read_sql_query(query_d_icd_diagnoses, conn)

# Load the diagnoses_icd into a database
query_diagnoses_icd = "SELECT * FROM diagnoses_icd;"
diagnoses_icd_df = pd.read_sql_query(query_diagnoses_icd, conn)

# 1. Merge the two databases into a diagnoses database
diagnoses = diagnoses_icd_df.merge(d_icd_diagnoses_df, on=['icd_code', 'icd_version'], how = 'inner')

# 2. Count the frequent diagnoses
frequent_diagnoses = (
    diagnoses_icd_df.groupby(['icd_code', 'icd_version'], as_index=False)
    .size()
    .rename(columns={'size': 'count'})
)
frequent_diagnoses = frequent_diagnoses[frequent_diagnoses['count'] >= 10000]   # filter frequent diagnoses

# 3. Filter the diagnoses table with the frequent diagnoses
diagnoses_df = diagnoses.merge(frequent_diagnoses[['icd_code', 'icd_version']], on=['icd_code', 'icd_version'], how = 'inner')
display(diagnoses_df.head())

Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,long_title
0,10000032,22595853,5,496,9,"Chronic airway obstruction, not elsewhere clas..."
1,10000032,22595853,8,V1582,9,Personal history of tobacco use
2,10000032,22841357,4,2761,9,Hyposmolality and/or hyponatremia
3,10000032,22841357,5,496,9,"Chronic airway obstruction, not elsewhere clas..."
4,10000032,22841357,8,3051,9,Tobacco use disorder


**PRESCRIPTIONS**

Steps: identify medications with the same drug name but different done measuring units and filter for medications with consistent units, and high-frequency use (>= 1000 uses across patients)

In [51]:
# Define the SQL query
'''query = """
SELECT COUNT(*) AS total_drugs_with_multiple_units
FROM (
    SELECT drug
    FROM prescriptions
    GROUP BY drug
    HAVING COUNT(DISTINCT LOWER(dose_unit_rx)) > 1
);
"""
# Execute the query and fetch the results
cursor.execute(query)
result = cursor.fetchone()

# Print the total number of drugs with multiple dose units
print(f"Total drugs with multiple dose units: {result[0]}")
'''

'query = """\nSELECT COUNT(*) AS total_drugs_with_multiple_units\nFROM (\n    SELECT drug\n    FROM prescriptions\n    GROUP BY drug\n    HAVING COUNT(DISTINCT LOWER(dose_unit_rx)) > 1\n);\n"""\n# Execute the query and fetch the results\ncursor.execute(query)\nresult = cursor.fetchone()\n\n# Print the total number of drugs with multiple dose units\nprint(f"Total drugs with multiple dose units: {result[0]}")\n'

This figure is different from the paper. Where in they found same medicine with different units to be 474. We found it to be 1822. The change wouldn't matter since we will only be using most frequently used 68 medications later.

In [11]:
# Load the prescriptions table into a dataframe
query_prescriptions = """SELECT subject_id, hadm_id, drug, dose_val_rx, dose_unit_rx FROM prescriptions;"""
prescriptions = pd.read_sql_query(query_prescriptions, conn)

In [12]:
# Convert "drug" and "dose_unit_rx" into lowercase
prescriptions['drug'] = prescriptions['drug'].str.lower()
prescriptions['dose_unit_rx'] = prescriptions['dose_unit_rx'].str.lower()

# Filter drugs with consistent drug units
valid_drugs = prescriptions.groupby('drug')['dose_unit_rx'].nunique()
consistent_drugs = valid_drugs[valid_drugs == 1].index  # drugs with only one single unit
consistent_presc_df = prescriptions[prescriptions['drug'].isin(consistent_drugs)]  # filter the dataset

# Remove the low-frequency drugs 
drug_counts = consistent_presc_df['drug'].value_counts()
frequent_drugs = drug_counts[drug_counts >= 1000].index  # Only drugs with at least 1000 occurrences

# final dataset
prescriptions_df = consistent_presc_df[consistent_presc_df['drug'].isin(frequent_drugs)]
prescriptions_df.head()

Unnamed: 0,subject_id,hadm_id,drug,dose_val_rx,dose_unit_rx
1,10000032,22595853,sodium chloride 0.9% flush,3.0,ml
2,10000032,22595853,furosemide,40.0,mg
3,10000032,22595853,raltegravir,400.0,mg
10,10000032,22595853,influenza vaccine quadrivalent,0.5,ml
13,10000032,22595853,furosemide,20.0,mg


In [13]:
conn.close()

##### Data pre-processing

In [14]:
# 3 datasets to use
display(admissions_patients.head(5))
display(diagnoses_df.head(3))
display(prescriptions_df.head(3))

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72
1,10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,M,72
2,10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,F,48
3,10000117,27988844,2183-09-18 18:10:00,2183-09-21 16:30:00,F,48
4,10000883,25221576,2124-05-14 21:11:00,2124-05-22 10:40:00,M,20


Unnamed: 0,subject_id,hadm_id,seq_num,icd_code,icd_version,long_title
0,10000032,22595853,5,496,9,"Chronic airway obstruction, not elsewhere clas..."
1,10000032,22595853,8,V1582,9,Personal history of tobacco use
2,10000032,22841357,4,2761,9,Hyposmolality and/or hyponatremia


Unnamed: 0,subject_id,hadm_id,drug,dose_val_rx,dose_unit_rx
1,10000032,22595853,sodium chloride 0.9% flush,3,ml
2,10000032,22595853,furosemide,40,mg
3,10000032,22595853,raltegravir,400,mg


In [15]:
# Create a copy of each df to have a copy of the original  data
adm_df = admissions_patients.copy()
diag_df = diagnoses_df.copy()
prescr_df = prescriptions_df.copy()

**ADMISSIONS**

In [16]:
# Calculate length of stay and add the number of stay
adm_df['length_of_stay'] = (pd.to_datetime(adm_df['dischtime']) - pd.to_datetime(adm_df['admittime'])).dt.days
adm_df['stay'] = adm_df.groupby('subject_id')['hadm_id'].transform(lambda x: x.rank(method='dense'))
adm_df.head(4)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age,length_of_stay,stay
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,4,1.0
1,10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,M,72,0,2.0
2,10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,F,48,0,1.0
3,10000117,27988844,2183-09-18 18:10:00,2183-09-21 16:30:00,F,48,2,2.0


In [17]:
# Column for first LOS 
first_stay_lengths = adm_df[adm_df['stay'] == 1].groupby('subject_id')['length_of_stay'].first().reset_index()
first_stay_lengths = first_stay_lengths.rename(columns={'length_of_stay': 'lengths_of_1st_admission'})

# Column for second LOS
second_stay_lengths = adm_df[adm_df['stay'] == 2].groupby('subject_id')['length_of_stay'].first().reset_index()
second_stay_lengths = second_stay_lengths.rename(columns={'length_of_stay': 'lengths_of_2nd_admission'})

# Merge 
stays = first_stay_lengths.merge(second_stay_lengths, on='subject_id')
display(first_stay_lengths.head())
display(second_stay_lengths.head())
display(stays.head())

Unnamed: 0,subject_id,lengths_of_1st_admission
0,10000084,4
1,10000117,0
2,10000883,7
3,10001217,6
4,10001877,1


Unnamed: 0,subject_id,lengths_of_2nd_admission
0,10000084,0
1,10000117,2
2,10000883,0
3,10001217,5
4,10001877,5


Unnamed: 0,subject_id,lengths_of_1st_admission,lengths_of_2nd_admission
0,10000084,4,0
1,10000117,0,2
2,10000883,7,0
3,10001217,6,5
4,10001877,1,5


In [18]:
# Merge the stays with the admissions dataframe
adm_df = adm_df.merge(stays, on='subject_id')
adm_df = adm_df.drop(columns =['length_of_stay', 'stay'])
adm_df.head(4)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age,lengths_of_1st_admission,lengths_of_2nd_admission
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,4,0
1,10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,M,72,4,0
2,10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,F,48,0,2
3,10000117,27988844,2183-09-18 18:10:00,2183-09-21 16:30:00,F,48,0,2


In [19]:
# Set the target variable "lengths_of_2nd_admission" as binary for classification
adm_df['lengths_of_2nd_admission'] = (adm_df['lengths_of_2nd_admission'] >= 3).astype(int)

**DIAGNOSES**

Create new columns with the top 81 diagnoses.

In [20]:
diag_df['long_title'].nunique()

62

There are only 62 unique diagnoses in our dataset. Therefore, we will be using all the diagnoses present.

In [21]:
# One-hot encode diagnoses per subject_id
diagnosis_pivot = diag_df.pivot_table(index='subject_id', columns='long_title', aggfunc='size', fill_value=0)
diagnosis_pivot = (diagnosis_pivot > 0).astype(int)  # set as binary values
diagnosis_pivot.reset_index(inplace=True)
diagnosis_pivot.head()

long_title,subject_id,"Acute kidney failure, unspecified",Acute posthemorrhagic anemia,"Alcohol abuse, unspecified","Anemia, unspecified","Anxiety disorder, unspecified","Anxiety state, unspecified",Aortocoronary bypass status,"Asthma, unspecified type, unspecified",Atherosclerotic heart disease of native coronary artery without angina pectoris,Atrial fibrillation,"Chronic airway obstruction, not elsewhere classified","Chronic kidney disease, unspecified","Chronic obstructive pulmonary disease, unspecified","Congestive heart failure, unspecified",Coronary atherosclerosis of native coronary artery,"Coronary atherosclerosis of unspecified type of vessel, native or graft",Dehydration,"Depressive disorder, not elsewhere classified","Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled",Do not resuscitate,Do not resuscitate status,Esophageal reflux,Essential (primary) hypertension,Gastro-esophageal reflux disease without esophagitis,"Gout, unspecified","Hyperlipidemia, unspecified","Hypertensive chronic kidney disease with stage 1 through stage 4 chronic kidney disease, or unspecified chronic kidney disease","Hypertensive chronic kidney disease, unspecified, with chronic kidney disease stage I through stage IV, or unspecified",Hyposmolality and/or hyponatremia,"Hypothyroidism, unspecified",Long term (current) use of anticoagulants,Long term (current) use of antithrombotics/antiplatelets,Long term (current) use of insulin,Long-term (current) use of anticoagulants,Long-term (current) use of aspirin,Long-term (current) use of insulin,"Major depressive disorder, single episode, unspecified","Nicotine dependence, cigarettes, uncomplicated","Obesity, unspecified",Obstructive sleep apnea (adult) (pediatric),Obstructive sleep apnea (adult)(pediatric),Old myocardial infarction,"Osteoporosis, unspecified",Other and unspecified hyperlipidemia,Other chronic pain,"Outcome of delivery, single liveborn",Percutaneous transluminal coronary angioplasty status,Personal history of nicotine dependence,Personal history of tobacco use,"Personal history of transient ischemic attack (TIA), and cerebral infarction without residual deficits",Personal history of venous thrombosis and embolism,"Pneumonia, organism unspecified",Pure hypercholesterolemia,Tobacco use disorder,Type 2 diabetes mellitus with diabetic chronic kidney disease,Type 2 diabetes mellitus without complications,Unspecified acquired hypothyroidism,"Unspecified asthma, uncomplicated",Unspecified atrial fibrillation,Unspecified essential hypertension,Unspecified place or not applicable,"Urinary tract infection, site not specified"
0,10000032,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
1,10000068,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,10000084,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,10000117,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,10000248,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [22]:
diagnosis_pivot.shape

(154951, 63)

**PRESCRIPTIONS**

Create new 68 columns of the 68 top-most frequent medications with the sum and average doses per patients. 

In [23]:
prescr_df['drug'].nunique() 

286

In [24]:
# Ensure the dose column is numeric
prescr_df['dose_val_rx'] = pd.to_numeric(prescr_df['dose_val_rx'], errors='coerce')

In [25]:
# select and filter the 68 medications
top_medications = prescr_df['drug'].value_counts().head(68).index
filtered_med_df = prescr_df[prescr_df['drug'].isin(top_medications)]

In [26]:
# Pivot table for sum of doses per drug
sum_dose_pivot = filtered_med_df.pivot_table(index='subject_id', columns='drug', values='dose_val_rx', aggfunc='sum', fill_value=0)
sum_dose_pivot.columns = [f'{col} Sum' for col in sum_dose_pivot.columns]  # Rename columns

# Pivot table for mean dose per drug
avg_dose_pivot = filtered_med_df.pivot_table(index='subject_id', columns='drug', values='dose_val_rx', aggfunc='mean', fill_value=0)
avg_dose_pivot.columns = [f'{col} Average' for col in avg_dose_pivot.columns]  # Rename columns

display(sum_dose_pivot.head())
display(avg_dose_pivot.head())

Unnamed: 0_level_0,0.45% sodium chloride Sum,1/2 ns Sum,allopurinol Sum,alprazolam Sum,amiodarone Sum,amlodipine Sum,aspirin Sum,aspirin ec Sum,benzonatate Sum,bisacodyl Sum,captopril Sum,carvedilol Sum,ciprofloxacin iv Sum,clonazepam Sum,d5 1/2ns Sum,d5ns Sum,diltiazem Sum,diltiazem extended-release Sum,famotidine Sum,fluoxetine Sum,furosemide Sum,glucagon Sum,hydrochlorothiazide Sum,ibuprofen Sum,influenza vaccine quadrivalent Sum,influenza virus vaccine Sum,isosorbide mononitrate (extended release) Sum,labetalol Sum,lamotrigine Sum,lidocaine 1% (for picc/midline insertions) Sum,lisinopril Sum,losartan potassium Sum,lr Sum,magnesium oxide Sum,metformin (glucophage) Sum,metoprolol succinate xl Sum,midazolam Sum,mirtazapine Sum,mycophenolate mofetil Sum,neomycin-polymyxin-bacitracin Sum,nitroglycerin sl Sum,olanzapine Sum,olanzapine (disintegrating tablet) Sum,ondansetron Sum,ondansetron odt Sum,oxycodone sr (oxycontin) Sum,pneumococcal 23-valent polysaccharide vaccine Sum,pneumococcal vac polyvalent Sum,potassium chl 20 meq / 1000 ml d5 1/2 ns Sum,potassium chloride (powder) Sum,potassium chloride replacement (critical care and oncology) Sum,potassium chloride replacement (oncology) Sum,pravastatin Sum,prochlorperazine Sum,quetiapine fumarate Sum,ramelteon Sum,ranitidine Sum,rosuvastatin calcium Sum,sodium chloride 0.9% flush Sum,soln Sum,sterile water Sum,sw Sum,tamsulosin Sum,torsemide Sum,tramadol Sum,trazodone Sum,valsartan Sum,zolpidem tartrate Sum
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1
10000032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,2.5
10000084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000560,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Unnamed: 0_level_0,0.45% sodium chloride Average,1/2 ns Average,allopurinol Average,alprazolam Average,amiodarone Average,amlodipine Average,aspirin Average,aspirin ec Average,benzonatate Average,bisacodyl Average,captopril Average,carvedilol Average,ciprofloxacin iv Average,clonazepam Average,d5 1/2ns Average,d5ns Average,diltiazem Average,diltiazem extended-release Average,famotidine Average,fluoxetine Average,furosemide Average,glucagon Average,hydrochlorothiazide Average,ibuprofen Average,influenza vaccine quadrivalent Average,influenza virus vaccine Average,isosorbide mononitrate (extended release) Average,labetalol Average,lamotrigine Average,lidocaine 1% (for picc/midline insertions) Average,lisinopril Average,losartan potassium Average,lr Average,magnesium oxide Average,metformin (glucophage) Average,metoprolol succinate xl Average,midazolam Average,mirtazapine Average,mycophenolate mofetil Average,neomycin-polymyxin-bacitracin Average,nitroglycerin sl Average,olanzapine Average,olanzapine (disintegrating tablet) Average,ondansetron Average,ondansetron odt Average,oxycodone sr (oxycontin) Average,pneumococcal 23-valent polysaccharide vaccine Average,pneumococcal vac polyvalent Average,potassium chl 20 meq / 1000 ml d5 1/2 ns Average,potassium chloride (powder) Average,potassium chloride replacement (critical care and oncology) Average,potassium chloride replacement (oncology) Average,pravastatin Average,prochlorperazine Average,quetiapine fumarate Average,ramelteon Average,ranitidine Average,rosuvastatin calcium Average,sodium chloride 0.9% flush Average,soln Average,sterile water Average,sw Average,tamsulosin Average,torsemide Average,tramadol Average,trazodone Average,valsartan Average,zolpidem tartrate Average
subject_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1
10000032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,2.5
10000084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10000560,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Merge the two dose doses datasets
med_pivot = sum_dose_pivot.merge(avg_dose_pivot, on='subject_id', how='left')
med_pivot.reset_index(inplace=True)
med_pivot.head()

Unnamed: 0,subject_id,0.45% sodium chloride Sum,1/2 ns Sum,allopurinol Sum,alprazolam Sum,amiodarone Sum,amlodipine Sum,aspirin Sum,aspirin ec Sum,benzonatate Sum,bisacodyl Sum,captopril Sum,carvedilol Sum,ciprofloxacin iv Sum,clonazepam Sum,d5 1/2ns Sum,d5ns Sum,diltiazem Sum,diltiazem extended-release Sum,famotidine Sum,fluoxetine Sum,furosemide Sum,glucagon Sum,hydrochlorothiazide Sum,ibuprofen Sum,influenza vaccine quadrivalent Sum,influenza virus vaccine Sum,isosorbide mononitrate (extended release) Sum,labetalol Sum,lamotrigine Sum,lidocaine 1% (for picc/midline insertions) Sum,lisinopril Sum,losartan potassium Sum,lr Sum,magnesium oxide Sum,metformin (glucophage) Sum,metoprolol succinate xl Sum,midazolam Sum,mirtazapine Sum,mycophenolate mofetil Sum,neomycin-polymyxin-bacitracin Sum,nitroglycerin sl Sum,olanzapine Sum,olanzapine (disintegrating tablet) Sum,ondansetron Sum,ondansetron odt Sum,oxycodone sr (oxycontin) Sum,pneumococcal 23-valent polysaccharide vaccine Sum,pneumococcal vac polyvalent Sum,potassium chl 20 meq / 1000 ml d5 1/2 ns Sum,potassium chloride (powder) Sum,potassium chloride replacement (critical care and oncology) Sum,potassium chloride replacement (oncology) Sum,pravastatin Sum,prochlorperazine Sum,quetiapine fumarate Sum,ramelteon Sum,ranitidine Sum,rosuvastatin calcium Sum,sodium chloride 0.9% flush Sum,soln Sum,sterile water Sum,sw Sum,tamsulosin Sum,torsemide Sum,tramadol Sum,trazodone Sum,valsartan Sum,zolpidem tartrate Sum,0.45% sodium chloride Average,1/2 ns Average,allopurinol Average,alprazolam Average,amiodarone Average,amlodipine Average,aspirin Average,aspirin ec Average,benzonatate Average,bisacodyl Average,captopril Average,carvedilol Average,ciprofloxacin iv Average,clonazepam Average,d5 1/2ns Average,d5ns Average,diltiazem Average,diltiazem extended-release Average,famotidine Average,fluoxetine Average,furosemide Average,glucagon Average,hydrochlorothiazide Average,ibuprofen Average,influenza vaccine quadrivalent Average,influenza virus vaccine Average,isosorbide mononitrate (extended release) Average,labetalol Average,lamotrigine Average,lidocaine 1% (for picc/midline insertions) Average,lisinopril Average,losartan potassium Average,lr Average,magnesium oxide Average,metformin (glucophage) Average,metoprolol succinate xl Average,midazolam Average,mirtazapine Average,mycophenolate mofetil Average,neomycin-polymyxin-bacitracin Average,nitroglycerin sl Average,olanzapine Average,olanzapine (disintegrating tablet) Average,ondansetron Average,ondansetron odt Average,oxycodone sr (oxycontin) Average,pneumococcal 23-valent polysaccharide vaccine Average,pneumococcal vac polyvalent Average,potassium chl 20 meq / 1000 ml d5 1/2 ns Average,potassium chloride (powder) Average,potassium chloride replacement (critical care and oncology) Average,potassium chloride replacement (oncology) Average,pravastatin Average,prochlorperazine Average,quetiapine fumarate Average,ramelteon Average,ranitidine Average,rosuvastatin calcium Average,sodium chloride 0.9% flush Average,soln Average,sterile water Average,sw Average,tamsulosin Average,torsemide Average,tramadol Average,trazodone Average,valsartan Average,zolpidem tartrate Average
0,10000032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,2.5
1,10000084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000117,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10000248,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10000560,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,2000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [28]:
med_pivot.shape  # 137 = 68 * 2 + 1 (subject_id)

(157693, 137)

##### Merge the datasets

In [29]:
# 3 datasets we need to use
display(adm_df.head(2))
display(diagnosis_pivot.head(2))
display(med_pivot.head(2))

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age,lengths_of_1st_admission,lengths_of_2nd_admission
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,4,0
1,10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,M,72,4,0


long_title,subject_id,"Acute kidney failure, unspecified",Acute posthemorrhagic anemia,"Alcohol abuse, unspecified","Anemia, unspecified","Anxiety disorder, unspecified","Anxiety state, unspecified",Aortocoronary bypass status,"Asthma, unspecified type, unspecified",Atherosclerotic heart disease of native coronary artery without angina pectoris,Atrial fibrillation,"Chronic airway obstruction, not elsewhere classified","Chronic kidney disease, unspecified","Chronic obstructive pulmonary disease, unspecified","Congestive heart failure, unspecified",Coronary atherosclerosis of native coronary artery,"Coronary atherosclerosis of unspecified type of vessel, native or graft",Dehydration,"Depressive disorder, not elsewhere classified","Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled",Do not resuscitate,Do not resuscitate status,Esophageal reflux,Essential (primary) hypertension,Gastro-esophageal reflux disease without esophagitis,"Gout, unspecified","Hyperlipidemia, unspecified","Hypertensive chronic kidney disease with stage 1 through stage 4 chronic kidney disease, or unspecified chronic kidney disease","Hypertensive chronic kidney disease, unspecified, with chronic kidney disease stage I through stage IV, or unspecified",Hyposmolality and/or hyponatremia,"Hypothyroidism, unspecified",Long term (current) use of anticoagulants,Long term (current) use of antithrombotics/antiplatelets,Long term (current) use of insulin,Long-term (current) use of anticoagulants,Long-term (current) use of aspirin,Long-term (current) use of insulin,"Major depressive disorder, single episode, unspecified","Nicotine dependence, cigarettes, uncomplicated","Obesity, unspecified",Obstructive sleep apnea (adult) (pediatric),Obstructive sleep apnea (adult)(pediatric),Old myocardial infarction,"Osteoporosis, unspecified",Other and unspecified hyperlipidemia,Other chronic pain,"Outcome of delivery, single liveborn",Percutaneous transluminal coronary angioplasty status,Personal history of nicotine dependence,Personal history of tobacco use,"Personal history of transient ischemic attack (TIA), and cerebral infarction without residual deficits",Personal history of venous thrombosis and embolism,"Pneumonia, organism unspecified",Pure hypercholesterolemia,Tobacco use disorder,Type 2 diabetes mellitus with diabetic chronic kidney disease,Type 2 diabetes mellitus without complications,Unspecified acquired hypothyroidism,"Unspecified asthma, uncomplicated",Unspecified atrial fibrillation,Unspecified essential hypertension,Unspecified place or not applicable,"Urinary tract infection, site not specified"
0,10000032,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0
1,10000068,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,subject_id,0.45% sodium chloride Sum,1/2 ns Sum,allopurinol Sum,alprazolam Sum,amiodarone Sum,amlodipine Sum,aspirin Sum,aspirin ec Sum,benzonatate Sum,bisacodyl Sum,captopril Sum,carvedilol Sum,ciprofloxacin iv Sum,clonazepam Sum,d5 1/2ns Sum,d5ns Sum,diltiazem Sum,diltiazem extended-release Sum,famotidine Sum,fluoxetine Sum,furosemide Sum,glucagon Sum,hydrochlorothiazide Sum,ibuprofen Sum,influenza vaccine quadrivalent Sum,influenza virus vaccine Sum,isosorbide mononitrate (extended release) Sum,labetalol Sum,lamotrigine Sum,lidocaine 1% (for picc/midline insertions) Sum,lisinopril Sum,losartan potassium Sum,lr Sum,magnesium oxide Sum,metformin (glucophage) Sum,metoprolol succinate xl Sum,midazolam Sum,mirtazapine Sum,mycophenolate mofetil Sum,neomycin-polymyxin-bacitracin Sum,nitroglycerin sl Sum,olanzapine Sum,olanzapine (disintegrating tablet) Sum,ondansetron Sum,ondansetron odt Sum,oxycodone sr (oxycontin) Sum,pneumococcal 23-valent polysaccharide vaccine Sum,pneumococcal vac polyvalent Sum,potassium chl 20 meq / 1000 ml d5 1/2 ns Sum,potassium chloride (powder) Sum,potassium chloride replacement (critical care and oncology) Sum,potassium chloride replacement (oncology) Sum,pravastatin Sum,prochlorperazine Sum,quetiapine fumarate Sum,ramelteon Sum,ranitidine Sum,rosuvastatin calcium Sum,sodium chloride 0.9% flush Sum,soln Sum,sterile water Sum,sw Sum,tamsulosin Sum,torsemide Sum,tramadol Sum,trazodone Sum,valsartan Sum,zolpidem tartrate Sum,0.45% sodium chloride Average,1/2 ns Average,allopurinol Average,alprazolam Average,amiodarone Average,amlodipine Average,aspirin Average,aspirin ec Average,benzonatate Average,bisacodyl Average,captopril Average,carvedilol Average,ciprofloxacin iv Average,clonazepam Average,d5 1/2ns Average,d5ns Average,diltiazem Average,diltiazem extended-release Average,famotidine Average,fluoxetine Average,furosemide Average,glucagon Average,hydrochlorothiazide Average,ibuprofen Average,influenza vaccine quadrivalent Average,influenza virus vaccine Average,isosorbide mononitrate (extended release) Average,labetalol Average,lamotrigine Average,lidocaine 1% (for picc/midline insertions) Average,lisinopril Average,losartan potassium Average,lr Average,magnesium oxide Average,metformin (glucophage) Average,metoprolol succinate xl Average,midazolam Average,mirtazapine Average,mycophenolate mofetil Average,neomycin-polymyxin-bacitracin Average,nitroglycerin sl Average,olanzapine Average,olanzapine (disintegrating tablet) Average,ondansetron Average,ondansetron odt Average,oxycodone sr (oxycontin) Average,pneumococcal 23-valent polysaccharide vaccine Average,pneumococcal vac polyvalent Average,potassium chl 20 meq / 1000 ml d5 1/2 ns Average,potassium chloride (powder) Average,potassium chloride replacement (critical care and oncology) Average,potassium chloride replacement (oncology) Average,pravastatin Average,prochlorperazine Average,quetiapine fumarate Average,ramelteon Average,ranitidine Average,rosuvastatin calcium Average,sodium chloride 0.9% flush Average,soln Average,sterile water Average,sw Average,tamsulosin Average,torsemide Average,tramadol Average,trazodone Average,valsartan Average,zolpidem tartrate Average
0,10000032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,130.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,2.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,26.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,2.5
1,10000084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
# Merge admissions with prescriptions
final_df = adm_df.merge(med_pivot, on = ['subject_id'], how='inner')
display(final_df.head(3))
print(final_df['subject_id'].nunique())

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age,lengths_of_1st_admission,lengths_of_2nd_admission,0.45% sodium chloride Sum,1/2 ns Sum,allopurinol Sum,alprazolam Sum,amiodarone Sum,amlodipine Sum,aspirin Sum,aspirin ec Sum,benzonatate Sum,bisacodyl Sum,captopril Sum,carvedilol Sum,ciprofloxacin iv Sum,clonazepam Sum,d5 1/2ns Sum,d5ns Sum,diltiazem Sum,diltiazem extended-release Sum,famotidine Sum,fluoxetine Sum,furosemide Sum,glucagon Sum,hydrochlorothiazide Sum,ibuprofen Sum,influenza vaccine quadrivalent Sum,influenza virus vaccine Sum,isosorbide mononitrate (extended release) Sum,labetalol Sum,lamotrigine Sum,lidocaine 1% (for picc/midline insertions) Sum,lisinopril Sum,losartan potassium Sum,lr Sum,magnesium oxide Sum,metformin (glucophage) Sum,metoprolol succinate xl Sum,midazolam Sum,mirtazapine Sum,mycophenolate mofetil Sum,neomycin-polymyxin-bacitracin Sum,nitroglycerin sl Sum,olanzapine Sum,olanzapine (disintegrating tablet) Sum,ondansetron Sum,ondansetron odt Sum,oxycodone sr (oxycontin) Sum,pneumococcal 23-valent polysaccharide vaccine Sum,pneumococcal vac polyvalent Sum,potassium chl 20 meq / 1000 ml d5 1/2 ns Sum,potassium chloride (powder) Sum,potassium chloride replacement (critical care and oncology) Sum,potassium chloride replacement (oncology) Sum,pravastatin Sum,prochlorperazine Sum,quetiapine fumarate Sum,ramelteon Sum,ranitidine Sum,rosuvastatin calcium Sum,sodium chloride 0.9% flush Sum,soln Sum,sterile water Sum,sw Sum,tamsulosin Sum,torsemide Sum,tramadol Sum,trazodone Sum,valsartan Sum,zolpidem tartrate Sum,0.45% sodium chloride Average,1/2 ns Average,allopurinol Average,alprazolam Average,amiodarone Average,amlodipine Average,aspirin Average,aspirin ec Average,benzonatate Average,bisacodyl Average,captopril Average,carvedilol Average,ciprofloxacin iv Average,clonazepam Average,d5 1/2ns Average,d5ns Average,diltiazem Average,diltiazem extended-release Average,famotidine Average,fluoxetine Average,furosemide Average,glucagon Average,hydrochlorothiazide Average,ibuprofen Average,influenza vaccine quadrivalent Average,influenza virus vaccine Average,isosorbide mononitrate (extended release) Average,labetalol Average,lamotrigine Average,lidocaine 1% (for picc/midline insertions) Average,lisinopril Average,losartan potassium Average,lr Average,magnesium oxide Average,metformin (glucophage) Average,metoprolol succinate xl Average,midazolam Average,mirtazapine Average,mycophenolate mofetil Average,neomycin-polymyxin-bacitracin Average,nitroglycerin sl Average,olanzapine Average,olanzapine (disintegrating tablet) Average,ondansetron Average,ondansetron odt Average,oxycodone sr (oxycontin) Average,pneumococcal 23-valent polysaccharide vaccine Average,pneumococcal vac polyvalent Average,potassium chl 20 meq / 1000 ml d5 1/2 ns Average,potassium chloride (powder) Average,potassium chloride replacement (critical care and oncology) Average,potassium chloride replacement (oncology) Average,pravastatin Average,prochlorperazine Average,quetiapine fumarate Average,ramelteon Average,ranitidine Average,rosuvastatin calcium Average,sodium chloride 0.9% flush Average,soln Average,sterile water Average,sw Average,tamsulosin Average,torsemide Average,tramadol Average,trazodone Average,valsartan Average,zolpidem tartrate Average
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,F,48,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


33879


In [31]:
# Merge dataset with diagnoses
final_df = final_df.merge(diagnosis_pivot, on = ['subject_id'], how='inner')
display(final_df.head(3))
print(final_df['subject_id'].nunique())

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age,lengths_of_1st_admission,lengths_of_2nd_admission,0.45% sodium chloride Sum,1/2 ns Sum,allopurinol Sum,alprazolam Sum,amiodarone Sum,amlodipine Sum,aspirin Sum,aspirin ec Sum,benzonatate Sum,bisacodyl Sum,captopril Sum,carvedilol Sum,ciprofloxacin iv Sum,clonazepam Sum,d5 1/2ns Sum,d5ns Sum,diltiazem Sum,diltiazem extended-release Sum,famotidine Sum,fluoxetine Sum,furosemide Sum,glucagon Sum,hydrochlorothiazide Sum,ibuprofen Sum,influenza vaccine quadrivalent Sum,influenza virus vaccine Sum,isosorbide mononitrate (extended release) Sum,labetalol Sum,lamotrigine Sum,lidocaine 1% (for picc/midline insertions) Sum,lisinopril Sum,losartan potassium Sum,lr Sum,magnesium oxide Sum,metformin (glucophage) Sum,metoprolol succinate xl Sum,midazolam Sum,mirtazapine Sum,mycophenolate mofetil Sum,neomycin-polymyxin-bacitracin Sum,nitroglycerin sl Sum,olanzapine Sum,olanzapine (disintegrating tablet) Sum,ondansetron Sum,ondansetron odt Sum,oxycodone sr (oxycontin) Sum,pneumococcal 23-valent polysaccharide vaccine Sum,pneumococcal vac polyvalent Sum,potassium chl 20 meq / 1000 ml d5 1/2 ns Sum,potassium chloride (powder) Sum,potassium chloride replacement (critical care and oncology) Sum,potassium chloride replacement (oncology) Sum,pravastatin Sum,prochlorperazine Sum,quetiapine fumarate Sum,ramelteon Sum,ranitidine Sum,rosuvastatin calcium Sum,sodium chloride 0.9% flush Sum,soln Sum,sterile water Sum,sw Sum,tamsulosin Sum,torsemide Sum,tramadol Sum,trazodone Sum,valsartan Sum,zolpidem tartrate Sum,0.45% sodium chloride Average,1/2 ns Average,allopurinol Average,alprazolam Average,amiodarone Average,amlodipine Average,aspirin Average,aspirin ec Average,benzonatate Average,bisacodyl Average,captopril Average,carvedilol Average,ciprofloxacin iv Average,clonazepam Average,d5 1/2ns Average,d5ns Average,diltiazem Average,diltiazem extended-release Average,famotidine Average,fluoxetine Average,furosemide Average,glucagon Average,hydrochlorothiazide Average,ibuprofen Average,influenza vaccine quadrivalent Average,influenza virus vaccine Average,isosorbide mononitrate (extended release) Average,labetalol Average,lamotrigine Average,lidocaine 1% (for picc/midline insertions) Average,lisinopril Average,losartan potassium Average,lr Average,magnesium oxide Average,metformin (glucophage) Average,metoprolol succinate xl Average,midazolam Average,mirtazapine Average,mycophenolate mofetil Average,neomycin-polymyxin-bacitracin Average,nitroglycerin sl Average,olanzapine Average,olanzapine (disintegrating tablet) Average,ondansetron Average,ondansetron odt Average,oxycodone sr (oxycontin) Average,pneumococcal 23-valent polysaccharide vaccine Average,pneumococcal vac polyvalent Average,potassium chl 20 meq / 1000 ml d5 1/2 ns Average,potassium chloride (powder) Average,potassium chloride replacement (critical care and oncology) Average,potassium chloride replacement (oncology) Average,pravastatin Average,prochlorperazine Average,quetiapine fumarate Average,ramelteon Average,ranitidine Average,rosuvastatin calcium Average,sodium chloride 0.9% flush Average,soln Average,sterile water Average,sw Average,tamsulosin Average,torsemide Average,tramadol Average,trazodone Average,valsartan Average,zolpidem tartrate Average,"Acute kidney failure, unspecified",Acute posthemorrhagic anemia,"Alcohol abuse, unspecified","Anemia, unspecified","Anxiety disorder, unspecified","Anxiety state, unspecified",Aortocoronary bypass status,"Asthma, unspecified type, unspecified",Atherosclerotic heart disease of native coronary artery without angina pectoris,Atrial fibrillation,"Chronic airway obstruction, not elsewhere classified","Chronic kidney disease, unspecified","Chronic obstructive pulmonary disease, unspecified","Congestive heart failure, unspecified",Coronary atherosclerosis of native coronary artery,"Coronary atherosclerosis of unspecified type of vessel, native or graft",Dehydration,"Depressive disorder, not elsewhere classified","Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled",Do not resuscitate,Do not resuscitate status,Esophageal reflux,Essential (primary) hypertension,Gastro-esophageal reflux disease without esophagitis,"Gout, unspecified","Hyperlipidemia, unspecified","Hypertensive chronic kidney disease with stage 1 through stage 4 chronic kidney disease, or unspecified chronic kidney disease","Hypertensive chronic kidney disease, unspecified, with chronic kidney disease stage I through stage IV, or unspecified",Hyposmolality and/or hyponatremia,"Hypothyroidism, unspecified",Long term (current) use of anticoagulants,Long term (current) use of antithrombotics/antiplatelets,Long term (current) use of insulin,Long-term (current) use of anticoagulants,Long-term (current) use of aspirin,Long-term (current) use of insulin,"Major depressive disorder, single episode, unspecified","Nicotine dependence, cigarettes, uncomplicated","Obesity, unspecified",Obstructive sleep apnea (adult) (pediatric),Obstructive sleep apnea (adult)(pediatric),Old myocardial infarction,"Osteoporosis, unspecified",Other and unspecified hyperlipidemia,Other chronic pain,"Outcome of delivery, single liveborn",Percutaneous transluminal coronary angioplasty status,Personal history of nicotine dependence,Personal history of tobacco use,"Personal history of transient ischemic attack (TIA), and cerebral infarction without residual deficits",Personal history of venous thrombosis and embolism,"Pneumonia, organism unspecified",Pure hypercholesterolemia,Tobacco use disorder,Type 2 diabetes mellitus with diabetic chronic kidney disease,Type 2 diabetes mellitus without complications,Unspecified acquired hypothyroidism,"Unspecified asthma, uncomplicated",Unspecified atrial fibrillation,Unspecified essential hypertension,Unspecified place or not applicable,"Urinary tract infection, site not specified"
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,F,48,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


31386


In [52]:
final_df.shape  

(62772, 218)

### Additional Clinical Data Integration

In [33]:
# add admissions.admission_type

In [34]:
conn = sqlite3.connect(db_path)
cursor = conn.cursor()

**Integrate Discharge Note Text from the discharge table of the first admission**

In [35]:
# Create a temporary table from final_df for subject_id and hadm_id
final_df[['subject_id', 'hadm_id', 'admittime']].to_sql('temp_los', conn, if_exists='replace', index=False)


62772

In [36]:
# Get the discharge notes from the discharge table ONLY for the first admisison
discharge_text = pd.read_sql_query("""
    SELECT d.subject_id, d.hadm_id, d.text
    FROM discharge d
    JOIN (
        SELECT subject_id, hadm_id
        FROM temp_los
        WHERE admittime = (SELECT MIN(admittime) 
                           FROM temp_los t 
                           WHERE t.subject_id = temp_los.subject_id)
    ) first_admissions 
    ON d.subject_id = first_admissions.subject_id 
    AND d.hadm_id = first_admissions.hadm_id
""", conn)
discharge_text.drop(columns='hadm_id', inplace=True)

In [37]:
# Merge discharge note text into final_df 
final_df = final_df.merge(discharge_text, on=['subject_id'], how='left')
final_df.head(2)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age,lengths_of_1st_admission,lengths_of_2nd_admission,0.45% sodium chloride Sum,1/2 ns Sum,allopurinol Sum,alprazolam Sum,amiodarone Sum,amlodipine Sum,aspirin Sum,aspirin ec Sum,benzonatate Sum,bisacodyl Sum,captopril Sum,carvedilol Sum,ciprofloxacin iv Sum,clonazepam Sum,d5 1/2ns Sum,d5ns Sum,diltiazem Sum,diltiazem extended-release Sum,famotidine Sum,fluoxetine Sum,furosemide Sum,glucagon Sum,hydrochlorothiazide Sum,ibuprofen Sum,influenza vaccine quadrivalent Sum,influenza virus vaccine Sum,isosorbide mononitrate (extended release) Sum,labetalol Sum,lamotrigine Sum,lidocaine 1% (for picc/midline insertions) Sum,lisinopril Sum,losartan potassium Sum,lr Sum,magnesium oxide Sum,metformin (glucophage) Sum,metoprolol succinate xl Sum,midazolam Sum,mirtazapine Sum,mycophenolate mofetil Sum,neomycin-polymyxin-bacitracin Sum,nitroglycerin sl Sum,olanzapine Sum,olanzapine (disintegrating tablet) Sum,ondansetron Sum,ondansetron odt Sum,oxycodone sr (oxycontin) Sum,pneumococcal 23-valent polysaccharide vaccine Sum,pneumococcal vac polyvalent Sum,potassium chl 20 meq / 1000 ml d5 1/2 ns Sum,potassium chloride (powder) Sum,potassium chloride replacement (critical care and oncology) Sum,potassium chloride replacement (oncology) Sum,pravastatin Sum,prochlorperazine Sum,quetiapine fumarate Sum,ramelteon Sum,ranitidine Sum,rosuvastatin calcium Sum,sodium chloride 0.9% flush Sum,soln Sum,sterile water Sum,sw Sum,tamsulosin Sum,torsemide Sum,tramadol Sum,trazodone Sum,valsartan Sum,zolpidem tartrate Sum,0.45% sodium chloride Average,1/2 ns Average,allopurinol Average,alprazolam Average,amiodarone Average,amlodipine Average,aspirin Average,aspirin ec Average,benzonatate Average,bisacodyl Average,captopril Average,carvedilol Average,ciprofloxacin iv Average,clonazepam Average,d5 1/2ns Average,d5ns Average,diltiazem Average,diltiazem extended-release Average,famotidine Average,fluoxetine Average,furosemide Average,glucagon Average,hydrochlorothiazide Average,ibuprofen Average,influenza vaccine quadrivalent Average,influenza virus vaccine Average,isosorbide mononitrate (extended release) Average,labetalol Average,lamotrigine Average,lidocaine 1% (for picc/midline insertions) Average,lisinopril Average,losartan potassium Average,lr Average,magnesium oxide Average,metformin (glucophage) Average,metoprolol succinate xl Average,midazolam Average,mirtazapine Average,mycophenolate mofetil Average,neomycin-polymyxin-bacitracin Average,nitroglycerin sl Average,olanzapine Average,olanzapine (disintegrating tablet) Average,ondansetron Average,ondansetron odt Average,oxycodone sr (oxycontin) Average,pneumococcal 23-valent polysaccharide vaccine Average,pneumococcal vac polyvalent Average,potassium chl 20 meq / 1000 ml d5 1/2 ns Average,potassium chloride (powder) Average,potassium chloride replacement (critical care and oncology) Average,potassium chloride replacement (oncology) Average,pravastatin Average,prochlorperazine Average,quetiapine fumarate Average,ramelteon Average,ranitidine Average,rosuvastatin calcium Average,sodium chloride 0.9% flush Average,soln Average,sterile water Average,sw Average,tamsulosin Average,torsemide Average,tramadol Average,trazodone Average,valsartan Average,zolpidem tartrate Average,"Acute kidney failure, unspecified",Acute posthemorrhagic anemia,"Alcohol abuse, unspecified","Anemia, unspecified","Anxiety disorder, unspecified","Anxiety state, unspecified",Aortocoronary bypass status,"Asthma, unspecified type, unspecified",Atherosclerotic heart disease of native coronary artery without angina pectoris,Atrial fibrillation,"Chronic airway obstruction, not elsewhere classified","Chronic kidney disease, unspecified","Chronic obstructive pulmonary disease, unspecified","Congestive heart failure, unspecified",Coronary atherosclerosis of native coronary artery,"Coronary atherosclerosis of unspecified type of vessel, native or graft",Dehydration,"Depressive disorder, not elsewhere classified","Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled",Do not resuscitate,Do not resuscitate status,Esophageal reflux,Essential (primary) hypertension,Gastro-esophageal reflux disease without esophagitis,"Gout, unspecified","Hyperlipidemia, unspecified","Hypertensive chronic kidney disease with stage 1 through stage 4 chronic kidney disease, or unspecified chronic kidney disease","Hypertensive chronic kidney disease, unspecified, with chronic kidney disease stage I through stage IV, or unspecified",Hyposmolality and/or hyponatremia,"Hypothyroidism, unspecified",Long term (current) use of anticoagulants,Long term (current) use of antithrombotics/antiplatelets,Long term (current) use of insulin,Long-term (current) use of anticoagulants,Long-term (current) use of aspirin,Long-term (current) use of insulin,"Major depressive disorder, single episode, unspecified","Nicotine dependence, cigarettes, uncomplicated","Obesity, unspecified",Obstructive sleep apnea (adult) (pediatric),Obstructive sleep apnea (adult)(pediatric),Old myocardial infarction,"Osteoporosis, unspecified",Other and unspecified hyperlipidemia,Other chronic pain,"Outcome of delivery, single liveborn",Percutaneous transluminal coronary angioplasty status,Personal history of nicotine dependence,Personal history of tobacco use,"Personal history of transient ischemic attack (TIA), and cerebral infarction without residual deficits",Personal history of venous thrombosis and embolism,"Pneumonia, organism unspecified",Pure hypercholesterolemia,Tobacco use disorder,Type 2 diabetes mellitus with diabetic chronic kidney disease,Type 2 diabetes mellitus without complications,Unspecified acquired hypothyroidism,"Unspecified asthma, uncomplicated",Unspecified atrial fibrillation,Unspecified essential hypertension,Unspecified place or not applicable,"Urinary tract infection, site not specified",text
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: __...
1,10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: __...


In [53]:
final_df.shape  # check if the shape remained the same

(62772, 218)

**Integrate admission type from the admissions table**

In [39]:
adm_type = pd.read_sql_query("""
    SELECT subject_id, hadm_id,
           admission_type 
    FROM admissions
    GROUP BY subject_id, hadm_id
""", conn)


In [40]:
# Merge the DRG data into final_df
admtype_pivot =  adm_type.pivot_table(index='subject_id', columns = 'admission_type', aggfunc='size', fill_value=0)
admtype_pivot = (admtype_pivot > 0).astype(int)
final_df = final_df.merge(admtype_pivot, on=['subject_id'], how='left')


**Integrate DRG Severity and Mortality from the drgcodes table**

In [41]:
drg_sums = pd.read_sql_query("""
    SELECT subject_id, hadm_id,
           SUM(drg_severity) AS sum_drg_severity,
           SUM(drg_mortality) AS sum_drg_mortality
    FROM drgcodes
    GROUP BY subject_id, hadm_id
""", conn)


In [42]:
# Create a df with the DRG values for the admission dates
first_hadm = final_df.groupby('subject_id')['hadm_id'].first().reset_index()
drg_sums.drop(columns='subject_id', inplace=True)
merged_severity = first_hadm.merge(drg_sums, on='hadm_id', how='left')
merged_severity.head(2)


Unnamed: 0,subject_id,hadm_id,sum_drg_severity,sum_drg_mortality
0,10000084,23052089,1.0,1.0
1,10000117,22927623,,


In [43]:
# Merge the values with the final dataset
final_df = final_df.merge(merged_severity[['subject_id', 'sum_drg_severity', 'sum_drg_mortality']], on='subject_id', how='left')

# fill null values with 0 (scores from 1 to 4)
final_df['sum_drg_severity'] = final_df['sum_drg_severity'].fillna(0)
final_df['sum_drg_mortality'] = final_df['sum_drg_mortality'].fillna(0)

**Integrate Patient Weight from inputevents and supplement from omr**    

In [55]:
input_weights = pd.read_sql_query("""
    SELECT i.subject_id, i.hadm_id, i.patientweight
    FROM inputevents i
""", conn)

In [58]:
# Calculate the average standard deviation for weight across all patients
std_per_patient = input_weights.groupby('subject_id')['patientweight'].std()
average_std = std_per_patient.mean()
average_std

1.5729332240450333

An average standard deviation across patients, indicates that on average, patient's weight fluctuates of 1.57kg. 
Since the standard deviation for weight is quite low, we can take the average between the first and second admittime weight. 

In [62]:
# Aggregate the dataset to have only one row per subject_id
df_aggregated = input_weights.groupby('subject_id')["patientweight"].mean().reset_index()

# Now you can merge it with your other dataset
final_df = final_df.merge(df_aggregated, on='subject_id', how='left')
final_df.head(2)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age,lengths_of_1st_admission,lengths_of_2nd_admission,0.45% sodium chloride Sum,1/2 ns Sum,allopurinol Sum,alprazolam Sum,amiodarone Sum,amlodipine Sum,aspirin Sum,aspirin ec Sum,benzonatate Sum,bisacodyl Sum,captopril Sum,carvedilol Sum,ciprofloxacin iv Sum,clonazepam Sum,d5 1/2ns Sum,d5ns Sum,diltiazem Sum,diltiazem extended-release Sum,famotidine Sum,fluoxetine Sum,furosemide Sum,glucagon Sum,hydrochlorothiazide Sum,ibuprofen Sum,influenza vaccine quadrivalent Sum,influenza virus vaccine Sum,isosorbide mononitrate (extended release) Sum,labetalol Sum,lamotrigine Sum,lidocaine 1% (for picc/midline insertions) Sum,lisinopril Sum,losartan potassium Sum,lr Sum,magnesium oxide Sum,metformin (glucophage) Sum,metoprolol succinate xl Sum,midazolam Sum,mirtazapine Sum,mycophenolate mofetil Sum,neomycin-polymyxin-bacitracin Sum,nitroglycerin sl Sum,olanzapine Sum,olanzapine (disintegrating tablet) Sum,ondansetron Sum,ondansetron odt Sum,oxycodone sr (oxycontin) Sum,pneumococcal 23-valent polysaccharide vaccine Sum,pneumococcal vac polyvalent Sum,potassium chl 20 meq / 1000 ml d5 1/2 ns Sum,potassium chloride (powder) Sum,potassium chloride replacement (critical care and oncology) Sum,potassium chloride replacement (oncology) Sum,pravastatin Sum,prochlorperazine Sum,quetiapine fumarate Sum,ramelteon Sum,ranitidine Sum,rosuvastatin calcium Sum,sodium chloride 0.9% flush Sum,soln Sum,sterile water Sum,sw Sum,tamsulosin Sum,torsemide Sum,tramadol Sum,trazodone Sum,valsartan Sum,zolpidem tartrate Sum,0.45% sodium chloride Average,1/2 ns Average,allopurinol Average,alprazolam Average,amiodarone Average,amlodipine Average,aspirin Average,aspirin ec Average,benzonatate Average,bisacodyl Average,captopril Average,carvedilol Average,ciprofloxacin iv Average,clonazepam Average,d5 1/2ns Average,d5ns Average,diltiazem Average,diltiazem extended-release Average,famotidine Average,fluoxetine Average,furosemide Average,glucagon Average,hydrochlorothiazide Average,ibuprofen Average,influenza vaccine quadrivalent Average,influenza virus vaccine Average,isosorbide mononitrate (extended release) Average,labetalol Average,lamotrigine Average,lidocaine 1% (for picc/midline insertions) Average,lisinopril Average,losartan potassium Average,lr Average,magnesium oxide Average,metformin (glucophage) Average,metoprolol succinate xl Average,midazolam Average,mirtazapine Average,mycophenolate mofetil Average,neomycin-polymyxin-bacitracin Average,nitroglycerin sl Average,olanzapine Average,olanzapine (disintegrating tablet) Average,ondansetron Average,ondansetron odt Average,oxycodone sr (oxycontin) Average,pneumococcal 23-valent polysaccharide vaccine Average,pneumococcal vac polyvalent Average,potassium chl 20 meq / 1000 ml d5 1/2 ns Average,potassium chloride (powder) Average,potassium chloride replacement (critical care and oncology) Average,potassium chloride replacement (oncology) Average,pravastatin Average,prochlorperazine Average,quetiapine fumarate Average,ramelteon Average,ranitidine Average,rosuvastatin calcium Average,sodium chloride 0.9% flush Average,soln Average,sterile water Average,sw Average,tamsulosin Average,torsemide Average,tramadol Average,trazodone Average,valsartan Average,zolpidem tartrate Average,"Acute kidney failure, unspecified",Acute posthemorrhagic anemia,"Alcohol abuse, unspecified","Anemia, unspecified","Anxiety disorder, unspecified","Anxiety state, unspecified",Aortocoronary bypass status,"Asthma, unspecified type, unspecified",Atherosclerotic heart disease of native coronary artery without angina pectoris,Atrial fibrillation,"Chronic airway obstruction, not elsewhere classified","Chronic kidney disease, unspecified","Chronic obstructive pulmonary disease, unspecified","Congestive heart failure, unspecified",Coronary atherosclerosis of native coronary artery,"Coronary atherosclerosis of unspecified type of vessel, native or graft",Dehydration,"Depressive disorder, not elsewhere classified","Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled",Do not resuscitate,Do not resuscitate status,Esophageal reflux,Essential (primary) hypertension,Gastro-esophageal reflux disease without esophagitis,"Gout, unspecified","Hyperlipidemia, unspecified","Hypertensive chronic kidney disease with stage 1 through stage 4 chronic kidney disease, or unspecified chronic kidney disease","Hypertensive chronic kidney disease, unspecified, with chronic kidney disease stage I through stage IV, or unspecified",Hyposmolality and/or hyponatremia,"Hypothyroidism, unspecified",Long term (current) use of anticoagulants,Long term (current) use of antithrombotics/antiplatelets,Long term (current) use of insulin,Long-term (current) use of anticoagulants,Long-term (current) use of aspirin,Long-term (current) use of insulin,"Major depressive disorder, single episode, unspecified","Nicotine dependence, cigarettes, uncomplicated","Obesity, unspecified",Obstructive sleep apnea (adult) (pediatric),Obstructive sleep apnea (adult)(pediatric),Old myocardial infarction,"Osteoporosis, unspecified",Other and unspecified hyperlipidemia,Other chronic pain,"Outcome of delivery, single liveborn",Percutaneous transluminal coronary angioplasty status,Personal history of nicotine dependence,Personal history of tobacco use,"Personal history of transient ischemic attack (TIA), and cerebral infarction without residual deficits",Personal history of venous thrombosis and embolism,"Pneumonia, organism unspecified",Pure hypercholesterolemia,Tobacco use disorder,Type 2 diabetes mellitus with diabetic chronic kidney disease,Type 2 diabetes mellitus without complications,Unspecified acquired hypothyroidism,"Unspecified asthma, uncomplicated",Unspecified atrial fibrillation,Unspecified essential hypertension,Unspecified place or not applicable,"Urinary tract infection, site not specified",text,AMBULATORY OBSERVATION,DIRECT EMER.,DIRECT OBSERVATION,ELECTIVE,EU OBSERVATION,EW EMER.,OBSERVATION ADMIT,SURGICAL SAME DAY ADMISSION,URGENT,sum_drg_severity,sum_drg_mortality,patientweight
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: __...,0,0,0,0,1,1,0,0,0,1.0,1.0,
1,10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: __...,0,0,0,0,1,1,0,0,0,1.0,1.0,


In [64]:
# Integrate with the weight from omr table
omr_weights = pd.read_sql_query("""
    SELECT o.subject_id, o.result_value AS patientweight_omr
    FROM omr o
    WHERE o.result_name = 'Weight (Lbs)'
""", conn)
omr_weights.head()

Unnamed: 0,subject_id,patientweight_omr
0,10000032,94.0
1,10000032,92.15
2,10000032,92.15
3,10000032,92.15
4,10000032,92.15


In [83]:
omr_weights['patientweight_omr'] = pd.to_numeric(omr_weights['patientweight_omr'], errors='coerce')


In [84]:
# Calculate the average standard deviation for weight across all patients
std_per_patient_omr = omr_weights.groupby('subject_id')['patientweight_omr'].std()
average_std_omr = std_per_patient_omr.mean()
average_std_omr

267.36344398376895

Since the standard deviation is quite high, we're first merging it with the subject_ids from our table to see whether it decreases to noral values. 

In [85]:
unique_subjects = final_df[['subject_id']].drop_duplicates()
merged_weights = unique_subjects.merge(omr_weights, on='subject_id', how='left')
merged_weights.head(2)

Unnamed: 0,subject_id,patientweight_omr
0,10000084,170.0
1,10000117,121.0


In [86]:
# Calculate the average standard deviation for weight across all patients
std_per_patient_omr = merged_weights.groupby('subject_id')['patientweight_omr'].std()
average_std_omr = std_per_patient_omr.mean()
average_std_omr


23.212074637522228

The standard deviation is more normal, so we can integrate it with our final dataset.

In [88]:
# Aggregate the dataset to have only one row per subject_id
avg_weight = omr_weights.groupby('subject_id', as_index=False)['patientweight_omr'].mean()
avg_weight.head()

Unnamed: 0,subject_id,patientweight_omr
0,10000032,93.104
1,10000084,170.0
2,10000117,111.129643
3,10000248,168.0
4,10000280,172.25


In [91]:
# Now you can merge it with your other dataset
final_df = final_df.merge(avg_weight, on='subject_id', how='left')
final_df.head(2)

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age,lengths_of_1st_admission,lengths_of_2nd_admission,0.45% sodium chloride Sum,1/2 ns Sum,allopurinol Sum,alprazolam Sum,amiodarone Sum,amlodipine Sum,aspirin Sum,aspirin ec Sum,benzonatate Sum,bisacodyl Sum,captopril Sum,carvedilol Sum,ciprofloxacin iv Sum,clonazepam Sum,d5 1/2ns Sum,d5ns Sum,diltiazem Sum,diltiazem extended-release Sum,famotidine Sum,fluoxetine Sum,furosemide Sum,glucagon Sum,hydrochlorothiazide Sum,ibuprofen Sum,influenza vaccine quadrivalent Sum,influenza virus vaccine Sum,isosorbide mononitrate (extended release) Sum,labetalol Sum,lamotrigine Sum,lidocaine 1% (for picc/midline insertions) Sum,lisinopril Sum,losartan potassium Sum,lr Sum,magnesium oxide Sum,metformin (glucophage) Sum,metoprolol succinate xl Sum,midazolam Sum,mirtazapine Sum,mycophenolate mofetil Sum,neomycin-polymyxin-bacitracin Sum,nitroglycerin sl Sum,olanzapine Sum,olanzapine (disintegrating tablet) Sum,ondansetron Sum,ondansetron odt Sum,oxycodone sr (oxycontin) Sum,pneumococcal 23-valent polysaccharide vaccine Sum,pneumococcal vac polyvalent Sum,potassium chl 20 meq / 1000 ml d5 1/2 ns Sum,potassium chloride (powder) Sum,potassium chloride replacement (critical care and oncology) Sum,potassium chloride replacement (oncology) Sum,pravastatin Sum,prochlorperazine Sum,quetiapine fumarate Sum,ramelteon Sum,ranitidine Sum,rosuvastatin calcium Sum,sodium chloride 0.9% flush Sum,soln Sum,sterile water Sum,sw Sum,tamsulosin Sum,torsemide Sum,tramadol Sum,trazodone Sum,valsartan Sum,zolpidem tartrate Sum,0.45% sodium chloride Average,1/2 ns Average,allopurinol Average,alprazolam Average,amiodarone Average,amlodipine Average,aspirin Average,aspirin ec Average,benzonatate Average,bisacodyl Average,captopril Average,carvedilol Average,ciprofloxacin iv Average,clonazepam Average,d5 1/2ns Average,d5ns Average,diltiazem Average,diltiazem extended-release Average,famotidine Average,fluoxetine Average,furosemide Average,glucagon Average,hydrochlorothiazide Average,ibuprofen Average,influenza vaccine quadrivalent Average,influenza virus vaccine Average,isosorbide mononitrate (extended release) Average,labetalol Average,lamotrigine Average,lidocaine 1% (for picc/midline insertions) Average,lisinopril Average,losartan potassium Average,lr Average,magnesium oxide Average,metformin (glucophage) Average,metoprolol succinate xl Average,midazolam Average,mirtazapine Average,mycophenolate mofetil Average,neomycin-polymyxin-bacitracin Average,nitroglycerin sl Average,olanzapine Average,olanzapine (disintegrating tablet) Average,ondansetron Average,ondansetron odt Average,oxycodone sr (oxycontin) Average,pneumococcal 23-valent polysaccharide vaccine Average,pneumococcal vac polyvalent Average,potassium chl 20 meq / 1000 ml d5 1/2 ns Average,potassium chloride (powder) Average,potassium chloride replacement (critical care and oncology) Average,potassium chloride replacement (oncology) Average,pravastatin Average,prochlorperazine Average,quetiapine fumarate Average,ramelteon Average,ranitidine Average,rosuvastatin calcium Average,sodium chloride 0.9% flush Average,soln Average,sterile water Average,sw Average,tamsulosin Average,torsemide Average,tramadol Average,trazodone Average,valsartan Average,zolpidem tartrate Average,"Acute kidney failure, unspecified",Acute posthemorrhagic anemia,"Alcohol abuse, unspecified","Anemia, unspecified","Anxiety disorder, unspecified","Anxiety state, unspecified",Aortocoronary bypass status,"Asthma, unspecified type, unspecified",Atherosclerotic heart disease of native coronary artery without angina pectoris,Atrial fibrillation,"Chronic airway obstruction, not elsewhere classified","Chronic kidney disease, unspecified","Chronic obstructive pulmonary disease, unspecified","Congestive heart failure, unspecified",Coronary atherosclerosis of native coronary artery,"Coronary atherosclerosis of unspecified type of vessel, native or graft",Dehydration,"Depressive disorder, not elsewhere classified","Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled",Do not resuscitate,Do not resuscitate status,Esophageal reflux,Essential (primary) hypertension,Gastro-esophageal reflux disease without esophagitis,"Gout, unspecified","Hyperlipidemia, unspecified","Hypertensive chronic kidney disease with stage 1 through stage 4 chronic kidney disease, or unspecified chronic kidney disease","Hypertensive chronic kidney disease, unspecified, with chronic kidney disease stage I through stage IV, or unspecified",Hyposmolality and/or hyponatremia,"Hypothyroidism, unspecified",Long term (current) use of anticoagulants,Long term (current) use of antithrombotics/antiplatelets,Long term (current) use of insulin,Long-term (current) use of anticoagulants,Long-term (current) use of aspirin,Long-term (current) use of insulin,"Major depressive disorder, single episode, unspecified","Nicotine dependence, cigarettes, uncomplicated","Obesity, unspecified",Obstructive sleep apnea (adult) (pediatric),Obstructive sleep apnea (adult)(pediatric),Old myocardial infarction,"Osteoporosis, unspecified",Other and unspecified hyperlipidemia,Other chronic pain,"Outcome of delivery, single liveborn",Percutaneous transluminal coronary angioplasty status,Personal history of nicotine dependence,Personal history of tobacco use,"Personal history of transient ischemic attack (TIA), and cerebral infarction without residual deficits",Personal history of venous thrombosis and embolism,"Pneumonia, organism unspecified",Pure hypercholesterolemia,Tobacco use disorder,Type 2 diabetes mellitus with diabetic chronic kidney disease,Type 2 diabetes mellitus without complications,Unspecified acquired hypothyroidism,"Unspecified asthma, uncomplicated",Unspecified atrial fibrillation,Unspecified essential hypertension,Unspecified place or not applicable,"Urinary tract infection, site not specified",text,AMBULATORY OBSERVATION,DIRECT EMER.,DIRECT OBSERVATION,ELECTIVE,EU OBSERVATION,EW EMER.,OBSERVATION ADMIT,SURGICAL SAME DAY ADMISSION,URGENT,sum_drg_severity,sum_drg_mortality,patientweight,patientweight_omr
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: __...,0,0,0,0,1,1,0,0,0,1.0,1.0,,170.0
1,10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: __...,0,0,0,0,1,1,0,0,0,1.0,1.0,,170.0


In [93]:
# Combine the two weight sources: use omr weight if available
final_df['patientweight'] = final_df['patientweight_omr'].combine_first(final_df['patientweight'])


In [94]:
# Drop the helper column
final_df.drop(columns=['patientweight_omr'], inplace=True)

In [95]:
# Closing connection
conn.close()

In [96]:
final_df.head()

Unnamed: 0,subject_id,hadm_id,admittime,dischtime,gender,anchor_age,lengths_of_1st_admission,lengths_of_2nd_admission,0.45% sodium chloride Sum,1/2 ns Sum,allopurinol Sum,alprazolam Sum,amiodarone Sum,amlodipine Sum,aspirin Sum,aspirin ec Sum,benzonatate Sum,bisacodyl Sum,captopril Sum,carvedilol Sum,ciprofloxacin iv Sum,clonazepam Sum,d5 1/2ns Sum,d5ns Sum,diltiazem Sum,diltiazem extended-release Sum,famotidine Sum,fluoxetine Sum,furosemide Sum,glucagon Sum,hydrochlorothiazide Sum,ibuprofen Sum,influenza vaccine quadrivalent Sum,influenza virus vaccine Sum,isosorbide mononitrate (extended release) Sum,labetalol Sum,lamotrigine Sum,lidocaine 1% (for picc/midline insertions) Sum,lisinopril Sum,losartan potassium Sum,lr Sum,magnesium oxide Sum,metformin (glucophage) Sum,metoprolol succinate xl Sum,midazolam Sum,mirtazapine Sum,mycophenolate mofetil Sum,neomycin-polymyxin-bacitracin Sum,nitroglycerin sl Sum,olanzapine Sum,olanzapine (disintegrating tablet) Sum,ondansetron Sum,ondansetron odt Sum,oxycodone sr (oxycontin) Sum,pneumococcal 23-valent polysaccharide vaccine Sum,pneumococcal vac polyvalent Sum,potassium chl 20 meq / 1000 ml d5 1/2 ns Sum,potassium chloride (powder) Sum,potassium chloride replacement (critical care and oncology) Sum,potassium chloride replacement (oncology) Sum,pravastatin Sum,prochlorperazine Sum,quetiapine fumarate Sum,ramelteon Sum,ranitidine Sum,rosuvastatin calcium Sum,sodium chloride 0.9% flush Sum,soln Sum,sterile water Sum,sw Sum,tamsulosin Sum,torsemide Sum,tramadol Sum,trazodone Sum,valsartan Sum,zolpidem tartrate Sum,0.45% sodium chloride Average,1/2 ns Average,allopurinol Average,alprazolam Average,amiodarone Average,amlodipine Average,aspirin Average,aspirin ec Average,benzonatate Average,bisacodyl Average,captopril Average,carvedilol Average,ciprofloxacin iv Average,clonazepam Average,d5 1/2ns Average,d5ns Average,diltiazem Average,diltiazem extended-release Average,famotidine Average,fluoxetine Average,furosemide Average,glucagon Average,hydrochlorothiazide Average,ibuprofen Average,influenza vaccine quadrivalent Average,influenza virus vaccine Average,isosorbide mononitrate (extended release) Average,labetalol Average,lamotrigine Average,lidocaine 1% (for picc/midline insertions) Average,lisinopril Average,losartan potassium Average,lr Average,magnesium oxide Average,metformin (glucophage) Average,metoprolol succinate xl Average,midazolam Average,mirtazapine Average,mycophenolate mofetil Average,neomycin-polymyxin-bacitracin Average,nitroglycerin sl Average,olanzapine Average,olanzapine (disintegrating tablet) Average,ondansetron Average,ondansetron odt Average,oxycodone sr (oxycontin) Average,pneumococcal 23-valent polysaccharide vaccine Average,pneumococcal vac polyvalent Average,potassium chl 20 meq / 1000 ml d5 1/2 ns Average,potassium chloride (powder) Average,potassium chloride replacement (critical care and oncology) Average,potassium chloride replacement (oncology) Average,pravastatin Average,prochlorperazine Average,quetiapine fumarate Average,ramelteon Average,ranitidine Average,rosuvastatin calcium Average,sodium chloride 0.9% flush Average,soln Average,sterile water Average,sw Average,tamsulosin Average,torsemide Average,tramadol Average,trazodone Average,valsartan Average,zolpidem tartrate Average,"Acute kidney failure, unspecified",Acute posthemorrhagic anemia,"Alcohol abuse, unspecified","Anemia, unspecified","Anxiety disorder, unspecified","Anxiety state, unspecified",Aortocoronary bypass status,"Asthma, unspecified type, unspecified",Atherosclerotic heart disease of native coronary artery without angina pectoris,Atrial fibrillation,"Chronic airway obstruction, not elsewhere classified","Chronic kidney disease, unspecified","Chronic obstructive pulmonary disease, unspecified","Congestive heart failure, unspecified",Coronary atherosclerosis of native coronary artery,"Coronary atherosclerosis of unspecified type of vessel, native or graft",Dehydration,"Depressive disorder, not elsewhere classified","Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled",Do not resuscitate,Do not resuscitate status,Esophageal reflux,Essential (primary) hypertension,Gastro-esophageal reflux disease without esophagitis,"Gout, unspecified","Hyperlipidemia, unspecified","Hypertensive chronic kidney disease with stage 1 through stage 4 chronic kidney disease, or unspecified chronic kidney disease","Hypertensive chronic kidney disease, unspecified, with chronic kidney disease stage I through stage IV, or unspecified",Hyposmolality and/or hyponatremia,"Hypothyroidism, unspecified",Long term (current) use of anticoagulants,Long term (current) use of antithrombotics/antiplatelets,Long term (current) use of insulin,Long-term (current) use of anticoagulants,Long-term (current) use of aspirin,Long-term (current) use of insulin,"Major depressive disorder, single episode, unspecified","Nicotine dependence, cigarettes, uncomplicated","Obesity, unspecified",Obstructive sleep apnea (adult) (pediatric),Obstructive sleep apnea (adult)(pediatric),Old myocardial infarction,"Osteoporosis, unspecified",Other and unspecified hyperlipidemia,Other chronic pain,"Outcome of delivery, single liveborn",Percutaneous transluminal coronary angioplasty status,Personal history of nicotine dependence,Personal history of tobacco use,"Personal history of transient ischemic attack (TIA), and cerebral infarction without residual deficits",Personal history of venous thrombosis and embolism,"Pneumonia, organism unspecified",Pure hypercholesterolemia,Tobacco use disorder,Type 2 diabetes mellitus with diabetic chronic kidney disease,Type 2 diabetes mellitus without complications,Unspecified acquired hypothyroidism,"Unspecified asthma, uncomplicated",Unspecified atrial fibrillation,Unspecified essential hypertension,Unspecified place or not applicable,"Urinary tract infection, site not specified",text,AMBULATORY OBSERVATION,DIRECT EMER.,DIRECT OBSERVATION,ELECTIVE,EU OBSERVATION,EW EMER.,OBSERVATION ADMIT,SURGICAL SAME DAY ADMISSION,URGENT,sum_drg_severity,sum_drg_mortality,patientweight
0,10000084,23052089,2160-11-21 01:56:00,2160-11-25 14:52:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: __...,0,0,0,0,1,1,0,0,0,1.0,1.0,170.0
1,10000084,29888819,2160-12-28 05:11:00,2160-12-28 16:07:00,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: __...,0,0,0,0,1,1,0,0,0,1.0,1.0,170.0
2,10000117,22927623,2181-11-15 02:05:00,2181-11-15 14:52:00,F,48,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: ___\n...,0,0,0,0,1,0,1,0,0,0.0,0.0,111.129643
3,10000117,27988844,2183-09-18 18:10:00,2183-09-21 16:30:00,F,48,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: ___\n...,0,0,0,0,1,0,1,0,0,0.0,0.0,111.129643
4,10000883,25221576,2124-05-14 21:11:00,2124-05-22 10:40:00,M,20,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,1,0,0,1,0,0,0,0,1.0,1.0,


In [97]:
# Remove the column hadm_id from admissions table and drop duplicated rows
final_df.drop(columns=['hadm_id', 'admittime', 'dischtime'], inplace=True)
final_df.drop_duplicates(inplace=True)

In [98]:
final_df.head()

Unnamed: 0,subject_id,gender,anchor_age,lengths_of_1st_admission,lengths_of_2nd_admission,0.45% sodium chloride Sum,1/2 ns Sum,allopurinol Sum,alprazolam Sum,amiodarone Sum,amlodipine Sum,aspirin Sum,aspirin ec Sum,benzonatate Sum,bisacodyl Sum,captopril Sum,carvedilol Sum,ciprofloxacin iv Sum,clonazepam Sum,d5 1/2ns Sum,d5ns Sum,diltiazem Sum,diltiazem extended-release Sum,famotidine Sum,fluoxetine Sum,furosemide Sum,glucagon Sum,hydrochlorothiazide Sum,ibuprofen Sum,influenza vaccine quadrivalent Sum,influenza virus vaccine Sum,isosorbide mononitrate (extended release) Sum,labetalol Sum,lamotrigine Sum,lidocaine 1% (for picc/midline insertions) Sum,lisinopril Sum,losartan potassium Sum,lr Sum,magnesium oxide Sum,metformin (glucophage) Sum,metoprolol succinate xl Sum,midazolam Sum,mirtazapine Sum,mycophenolate mofetil Sum,neomycin-polymyxin-bacitracin Sum,nitroglycerin sl Sum,olanzapine Sum,olanzapine (disintegrating tablet) Sum,ondansetron Sum,ondansetron odt Sum,oxycodone sr (oxycontin) Sum,pneumococcal 23-valent polysaccharide vaccine Sum,pneumococcal vac polyvalent Sum,potassium chl 20 meq / 1000 ml d5 1/2 ns Sum,potassium chloride (powder) Sum,potassium chloride replacement (critical care and oncology) Sum,potassium chloride replacement (oncology) Sum,pravastatin Sum,prochlorperazine Sum,quetiapine fumarate Sum,ramelteon Sum,ranitidine Sum,rosuvastatin calcium Sum,sodium chloride 0.9% flush Sum,soln Sum,sterile water Sum,sw Sum,tamsulosin Sum,torsemide Sum,tramadol Sum,trazodone Sum,valsartan Sum,zolpidem tartrate Sum,0.45% sodium chloride Average,1/2 ns Average,allopurinol Average,alprazolam Average,amiodarone Average,amlodipine Average,aspirin Average,aspirin ec Average,benzonatate Average,bisacodyl Average,captopril Average,carvedilol Average,ciprofloxacin iv Average,clonazepam Average,d5 1/2ns Average,d5ns Average,diltiazem Average,diltiazem extended-release Average,famotidine Average,fluoxetine Average,furosemide Average,glucagon Average,hydrochlorothiazide Average,ibuprofen Average,influenza vaccine quadrivalent Average,influenza virus vaccine Average,isosorbide mononitrate (extended release) Average,labetalol Average,lamotrigine Average,lidocaine 1% (for picc/midline insertions) Average,lisinopril Average,losartan potassium Average,lr Average,magnesium oxide Average,metformin (glucophage) Average,metoprolol succinate xl Average,midazolam Average,mirtazapine Average,mycophenolate mofetil Average,neomycin-polymyxin-bacitracin Average,nitroglycerin sl Average,olanzapine Average,olanzapine (disintegrating tablet) Average,ondansetron Average,ondansetron odt Average,oxycodone sr (oxycontin) Average,pneumococcal 23-valent polysaccharide vaccine Average,pneumococcal vac polyvalent Average,potassium chl 20 meq / 1000 ml d5 1/2 ns Average,potassium chloride (powder) Average,potassium chloride replacement (critical care and oncology) Average,potassium chloride replacement (oncology) Average,pravastatin Average,prochlorperazine Average,quetiapine fumarate Average,ramelteon Average,ranitidine Average,rosuvastatin calcium Average,sodium chloride 0.9% flush Average,soln Average,sterile water Average,sw Average,tamsulosin Average,torsemide Average,tramadol Average,trazodone Average,valsartan Average,zolpidem tartrate Average,"Acute kidney failure, unspecified",Acute posthemorrhagic anemia,"Alcohol abuse, unspecified","Anemia, unspecified","Anxiety disorder, unspecified","Anxiety state, unspecified",Aortocoronary bypass status,"Asthma, unspecified type, unspecified",Atherosclerotic heart disease of native coronary artery without angina pectoris,Atrial fibrillation,"Chronic airway obstruction, not elsewhere classified","Chronic kidney disease, unspecified","Chronic obstructive pulmonary disease, unspecified","Congestive heart failure, unspecified",Coronary atherosclerosis of native coronary artery,"Coronary atherosclerosis of unspecified type of vessel, native or graft",Dehydration,"Depressive disorder, not elsewhere classified","Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled",Do not resuscitate,Do not resuscitate status,Esophageal reflux,Essential (primary) hypertension,Gastro-esophageal reflux disease without esophagitis,"Gout, unspecified","Hyperlipidemia, unspecified","Hypertensive chronic kidney disease with stage 1 through stage 4 chronic kidney disease, or unspecified chronic kidney disease","Hypertensive chronic kidney disease, unspecified, with chronic kidney disease stage I through stage IV, or unspecified",Hyposmolality and/or hyponatremia,"Hypothyroidism, unspecified",Long term (current) use of anticoagulants,Long term (current) use of antithrombotics/antiplatelets,Long term (current) use of insulin,Long-term (current) use of anticoagulants,Long-term (current) use of aspirin,Long-term (current) use of insulin,"Major depressive disorder, single episode, unspecified","Nicotine dependence, cigarettes, uncomplicated","Obesity, unspecified",Obstructive sleep apnea (adult) (pediatric),Obstructive sleep apnea (adult)(pediatric),Old myocardial infarction,"Osteoporosis, unspecified",Other and unspecified hyperlipidemia,Other chronic pain,"Outcome of delivery, single liveborn",Percutaneous transluminal coronary angioplasty status,Personal history of nicotine dependence,Personal history of tobacco use,"Personal history of transient ischemic attack (TIA), and cerebral infarction without residual deficits",Personal history of venous thrombosis and embolism,"Pneumonia, organism unspecified",Pure hypercholesterolemia,Tobacco use disorder,Type 2 diabetes mellitus with diabetic chronic kidney disease,Type 2 diabetes mellitus without complications,Unspecified acquired hypothyroidism,"Unspecified asthma, uncomplicated",Unspecified atrial fibrillation,Unspecified essential hypertension,Unspecified place or not applicable,"Urinary tract infection, site not specified",text,AMBULATORY OBSERVATION,DIRECT EMER.,DIRECT OBSERVATION,ELECTIVE,EU OBSERVATION,EW EMER.,OBSERVATION ADMIT,SURGICAL SAME DAY ADMISSION,URGENT,sum_drg_severity,sum_drg_mortality,patientweight
0,10000084,M,72,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,37.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,18.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: __...,0,0,0,0,1,1,0,0,0,1.0,1.0,170.0
2,10000117,F,48,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,\nName: ___ Unit No: ___\n...,0,0,0,0,1,0,1,0,0,0.0,0.0,111.129643
4,10000883,M,20,7,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.75,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.4375,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,,0,1,0,0,1,0,0,0,0,1.0,1.0,
6,10001217,F,55,6,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,32.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,\nName: ___ Unit No: ___\n \n...,0,1,0,0,0,1,0,0,0,3.0,4.0,149.15
8,10001877,M,89,1,1,0.0,0.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,15.0,0.0,25.0,0.0,0.0,0.0,0.0,100.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,7.5,0.0,25.0,0.0,0.0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,\nName: ___ ___ No: ___\n \...,0,0,0,0,1,1,0,0,0,0.0,0.0,


In [99]:
final_df.shape

(31386, 216)

In [100]:
# Store the final dataset as a csv
final_df.to_csv('../data/data_final.csv', index=False)