In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
# To train the medication recommender we load the MIMIC-III demo
# diagnosis and prescription tables.
# DATA_DIR is simply the folder where the CSV files are stored.
DATA_DIR = './incomplete_mimic-iii-clinical-database-demo-1.4'

# ICD-9 codes and drug names are identifiers, not numbers, so they are read
# explicitly as strings. If pandas were left to infer the dtype, a file that
# happens to contain only digit codes would be parsed as integers and
# silently lose leading zeros (e.g. '0389' would become 389).
diagnoses = pd.read_csv(f'{DATA_DIR}/DIAGNOSES_ICD.csv', dtype={'icd9_code': str})
prescriptions = pd.read_csv(f'{DATA_DIR}/PRESCRIPTIONS.csv', dtype={'drug': str})

In [3]:
# Patients are identified by their subject_id.
# We collect every diagnosis code and every prescribed drug of a patient
# into one list per subject_id, so that a model can learn to predict
# the medications from the diagnoses.
#
# Rows with a missing code / drug name are dropped first: a NaN inside the
# lists would be a float mixed with strings, which makes the later
# MultiLabelBinarizer step fail when it sorts its classes.
diagnoses_by_patient = (
    diagnoses.dropna(subset=['icd9_code'])
    .groupby('subject_id')['icd9_code']
    .apply(list)
    .reset_index()
)
prescriptions_by_patient = (
    prescriptions.dropna(subset=['drug'])
    .groupby('subject_id')['drug']
    .apply(list)
    .reset_index()
)

In [4]:
# Join the per-patient diagnosis lists with the per-patient drug lists
# on their shared subject_id column. With pandas' default inner join,
# only patients that appear in both tables are kept.
data = diagnoses_by_patient.merge(prescriptions_by_patient, on='subject_id')

In [5]:
# By convention in machine learning / statistics, X holds the features
# and y holds the target variable.
# Here the model learns from the diagnoses (X) to predict the prescriptions (y).
X, y = data['icd9_code'], data['drug']

In [6]:
# We will now prepare the data for machine learning.
# This step is known as feature engineering.
# MultiLabelBinarizer (mlb) converts each patient's list of diagnosis codes
# into a row of a binary matrix (0's and 1's), one column per distinct code.
# This is necessary because machine learning models require numerical data.
mlb_diagnosis = MultiLabelBinarizer()

In [7]:
# fit_transform() first learns the set of distinct diagnosis codes in X
# (the "fit" step) and then encodes X as a binary matrix
# (the "transform" step).
X_encoded = mlb_diagnosis.fit_transform(X)

In [8]:
# The prescription lists are encoded into a binary matrix in the same way,
# using a separate MultiLabelBinarizer so that the drug names get their own
# set of columns.
# This is the code from the previous two cells combined (create the
# binarizer, then fit_transform), applied to the prescription data.
mlb_prescription = MultiLabelBinarizer()
y_encoded = mlb_prescription.fit_transform(y)

In [9]:
# Split the encoded data into a training part (80%) and a test part (20%).
# Fixing random_state makes the split identical on every run,
# which keeps the notebook reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_encoded,
    test_size=0.2,
    random_state=7,
)

In [10]:
# Next we set up the machine learning model.
# It will learn patterns in the diagnosis data in order to predict
# the prescriptions.
# The model is a Random Forest (rf) classifier, which combines the votes
# of multiple decision trees to make its predictions.
# We use 100 decision trees, and fix random_state = 7 so that the model
# is trained in the same way each time.
rf_classifier = RandomForestClassifier(
    n_estimators=100,
    random_state=7,
)

In [11]:
# The Random Forest classifier is wrapped in a MultiOutputClassifier,
# which fits one copy of the classifier per output column. This allows
# several prescriptions to be predicted for each set of patient diagnoses.
multi_target_rf = MultiOutputClassifier(estimator=rf_classifier)
# fit() trains the model on the training data:
#   X_train - the inputs  (encoded patient diagnoses)
#   y_train - the outputs (encoded prescriptions)
multi_target_rf.fit(X_train, y_train)

In [13]:
# The following recommend_medications() function takes in
# a list of diagnosis codes and returns the top 5 recommended medications
# based on the trained model

def recommend_medications(diagnosis_codes, top_n=5):
    """Return the medications the trained model ranks highest for one patient.

    Parameters
    ----------
    diagnosis_codes : iterable
        Diagnosis codes of a single patient. A string code that is not in the
        fitted diagnosis vocabulary is retried with its '.' characters removed
        (e.g. '571.5' -> '5715'). Codes that are still unknown are passed
        through unchanged, so MultiLabelBinarizer warns about them and
        ignores them.
    top_n : int, default 5
        Maximum number of medications to return.

    Returns
    -------
    numpy.ndarray
        1-D array of at most ``top_n`` drug names taken from
        ``mlb_prescription.classes_``, ordered from most to least probable.
    """
    # Accept dotted ICD-9 notation when only the undotted form was seen
    # during training; leave every other code untouched.
    known_codes = set(mlb_diagnosis.classes_)
    normalized_codes = []
    for code in diagnosis_codes:
        if isinstance(code, str) and code not in known_codes:
            undotted = code.strip().replace('.', '')
            if undotted in known_codes:
                code = undotted
        normalized_codes.append(code)

    # The enclosing list makes this a batch containing a single patient;
    # transform() encodes it as one row of the binary diagnosis matrix.
    patient_features = mlb_diagnosis.transform([normalized_codes])

    # MultiOutputClassifier.predict_proba() returns a list with one array per
    # medication, each of shape (n_samples, n_classes) - NOT one row of
    # per-medication scores. We therefore pull out, for every medication,
    # the probability of the positive class (label 1).
    pred_probas = multi_target_rf.predict_proba(patient_features)
    positive_probas = np.zeros(len(pred_probas))
    per_medication = zip(multi_target_rf.estimators_, pred_probas)
    for med_idx, (estimator, probas) in enumerate(per_medication):
        # A medication that was never (or always) prescribed in the training
        # split yields a single-class estimator, so look the column up via
        # classes_ instead of assuming that column 1 exists.
        positive_col = np.flatnonzero(estimator.classes_ == 1)
        if positive_col.size:
            positive_probas[med_idx] = probas[0, positive_col[0]]

    # Rank medications by descending probability; the stable sort keeps ties
    # in the order of mlb_prescription.classes_, so results are deterministic.
    ranked = np.argsort(-positive_probas, kind='stable')[:max(top_n, 0)]
    recommended_meds = mlb_prescription.classes_[ranked]

    return recommended_meds

In [None]:
# As an example, let us use the machine learning model to
# recommend medications for a patient with the following diagnoses:
#   571.5  - Cirrhosis of liver without mention of alcohol
#   250.00 - Diabetes mellitus without mention of complication,
#            type II or unspecified type, not stated as uncontrolled
# NOTE: MIMIC-III stores ICD-9 codes with the decimal point implied
# (571.5 is stored as '5715', 250.00 as '25000'). The codes must be written
# in that stored form; otherwise they are unknown to the diagnosis encoder,
# get ignored, and the prediction is made from an empty diagnosis list.
# Use the recommend_medications() function to get the recommended medication(s)
patient_diagnoses = ['5715', '25000']
recommended_medications = recommend_medications(patient_diagnoses)
print("Recommended medications:", recommended_medications)