In [1]:
import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import tensorflow
import keras
import numpy as np
np.random.seed(123)
import matplotlib
from keras.models import * 
from keras.layers import *
from keras.optimizers import RMSprop
import pandas as pd

# Import both datasets, change to local path when running
patients = pd.read_excel(r"./model_dir/PATIENTS_Nov_3_2023_V4_sfm-data.xlsx", sheet_name="Level2_AI_Patient Traits")

allergies = pd.read_excel(r"./model_dir/PATIENTS_Nov_3_2023_V4_sfm-data.xlsx", sheet_name="Level1_Patient Allergens")


In [2]:


# Merge columns by ID if needed
patientAllergies = patients.merge(allergies, on = "SFM Id")
# Comment line before committing
# patientAllergies.head()

patientsTrimmed = patients.drop(['SFM Id', 'City', 'State', 'Country'], axis = 1)

patientsTrimmed['Gender'] = pd.Categorical(patientsTrimmed['Gender'].str.strip())
gender_onehot = pd.get_dummies(patientsTrimmed['Gender'], prefix = "Gender",
                                    prefix_sep = "-", dtype = int)
patientsTrimmed = patientsTrimmed.drop('Gender', axis = 1)
patientsTrimmed = patientsTrimmed.join(gender_onehot)

patientsTrimmed['SkinTone'] = pd.Categorical(patientsTrimmed['SkinTone'].str.strip())
skintone_onehot = pd.get_dummies(patientsTrimmed['SkinTone'], prefix = "SkinTone",
                                    prefix_sep = "-", dtype = int)
patientsTrimmed = patientsTrimmed.drop('SkinTone', axis = 1)
patientsTrimmed = patientsTrimmed.join(skintone_onehot)

patientsTrimmed['FitzpatrickSkinPhotoType'] = pd.Categorical(patientsTrimmed['FitzpatrickSkinPhotoType'].str.strip())
# Dropping first here since it is a blank variable in the column
fitzpatrick_onehot = pd.get_dummies(patientsTrimmed['FitzpatrickSkinPhotoType'], prefix = "Fitzpatrick",
                                    prefix_sep = "-", drop_first = True, dtype = int)
patientsTrimmed = patientsTrimmed.drop('FitzpatrickSkinPhotoType', axis = 1)
patientsTrimmed = patientsTrimmed.join(fitzpatrick_onehot)


from keras.layers import TextVectorization
from sklearn.preprocessing import MultiLabelBinarizer
mlb_1 = MultiLabelBinarizer()

mlb_1 = MultiLabelBinarizer()

# Replace commas with whitespace
patientsTrimmed['SkinConditions'] = patientsTrimmed['SkinConditions'].str.replace(',', ' ')
# Set the max length based on whitespace characters
max_len = patientsTrimmed['SkinConditions'].str.count(' ').max()

patientsTrimmed['SkinConditions'] = patientsTrimmed['SkinConditions'].str.split()

# Remove items containing the word "Other" from the list
patientsTrimmed['SkinConditions'] = patientsTrimmed['SkinConditions'].apply(lambda x: [item for item in x if "Other" not in item])

# One-hot encode data
skinConditions = mlb_1.fit_transform(patientsTrimmed['SkinConditions'])


# remove rare conditions 
skinConditions = np.delete(skinConditions,[8,9], axis=1)

# Get the list of encoded classes
encoded_classes = list(mlb_1.classes_)

patientsTrimmed = patientsTrimmed.drop('SkinConditions', axis = 1)
patientsArr = patientsTrimmed.values

# input_data = skinConditions

print(patientsArr)

input_data = np.concatenate((patientsArr, skinConditions), axis = 1)

# Drop ID column for preprocessing - ID should have no effect on prediction
allergiesNoId = allergies.drop('SFM Id', axis = 1)
# Remove all non-digit characters, then replace empty cells with NaN
allergiesNoId = allergiesNoId.replace(r'\D+', '', regex = True).replace('', np.nan)
# Set all NaN cells to 0
allergiesNoId = allergiesNoId.fillna(0)
# Convert entire dataframe to integer
allergiesNoId = allergiesNoId.astype(int)


allergiesNoId['AllergiesList'] = allergiesNoId.astype(str).apply(' '.join, axis=1)
allergiesNoId['AllergiesList'] = allergiesNoId['AllergiesList'].str.split()


[[2000    0    1 ...    0    0    0]
 [1946    1    0 ...    0    0    0]
 [1950    1    0 ...    0    0    0]
 ...
 [1977    1    0 ...    0    0    0]
 [1981    1    0 ...    0    0    0]
 [1960    1    0 ...    0    0    0]]


In [3]:
# Create MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

allergiesArray = np.array(allergiesNoId['AllergiesList'])
# One-hot encode data
allergiesArray = mlb.fit_transform(allergiesArray)

print (list(mlb.classes_))

transposed_array = list(map(list, zip(*allergiesArray)))

# Count the number of 1s in each column
column_counts = [column.count(1) for column in transposed_array]

# Create a list of (column_index, count) tuples and rank them
ranked_columns = sorted(enumerate(column_counts, 1), key=lambda x: x[1], reverse=True)

# print the results
# for rank, (column_index, count) in enumerate(ranked_columns, 1):
#     print(f"Rank {rank}: Column {column_index} - Number of 1s: {count}")

top_rank_n = 10

top_column_list = [row[0] for row in ranked_columns]

top_column_list = top_column_list[0:top_rank_n]

top_rank_allergies = [[row[col -1] for col in top_column_list] for row in allergiesArray]

top_rank_allergies


allergiesArray = np.array(top_rank_allergies)

# Debug only getting one Colum 
# allergiesArray = np.array([row[1] for row in allergiesArray])

# Debug for removing first 1s col 
allergiesArray = allergiesArray[:, 1:]

['0', '100612', '100613', '100702', '100857', '102', '10260', '103637', '104', '1043', '104630', '105017', '10537', '10538', '10539', '10541', '10544', '10546', '10552', '105611', '106344', '106518', '107036', '1107', '111858', '112', '112022', '1126', '113', '1130', '1131', '114559', '1149', '1153', '11661', '11662', '11686', '117', '1177', '11790', '119', '11996', '121146', '121641', '121642', '121652', '122055', '12344', '123717', '124', '124537', '124737', '12522', '128', '128564', '128565', '129742', '129747', '13', '130118', '13018', '130735', '131994', '131995', '131996', '131997', '132762', '133', '133255', '133260', '133266', '133268', '133269', '133270', '133271', '133272', '133273', '133274', '133275', '133628', '133629', '133714', '134', '134256', '134795', '134800', '135153', '136', '136640', '136703', '136704', '13691', '13708', '137787', '137788', '137789', '137790', '137791', '137792', '137793', '137794', '137795', '1385', '1387', '13881', '138878', '13891', '139001', '

In [13]:
from sklearn.model_selection import train_test_split
from imblearn.ensemble import BalancedRandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Assuming input_data and allergiesArray are defined previously in your code
X_train, X_test, y_train, y_test = train_test_split(input_data, allergiesArray, test_size=0.2, random_state=42, shuffle=True)

# Use the Balanced Random Forest in the MultiOutputClassifier
# -------------------- Baseline -------------------- 
brf_classifier = MultiOutputClassifier(RandomForestClassifier(random_state=0, n_estimators=1000))

brf_classifier.fit(X_train, y_train)

In [5]:
brf_predictions = brf_classifier.predict(X_test)

svm_accuracy = accuracy_score(y_test, brf_predictions)
print(f"Accuracy: {svm_accuracy}")

print(classification_report(y_test, brf_predictions))

Accuracy: 0.012121212121212121
              precision    recall  f1-score   support

           0       0.36      0.52      0.42       664
           1       0.28      0.58      0.38       456
           2       0.25      0.56      0.34       426
           3       0.19      0.54      0.29       328
           4       0.14      0.53      0.23       249
           5       0.14      0.47      0.22       276
           6       0.11      0.44      0.18       225
           7       0.15      0.56      0.23       235
           8       0.12      0.54      0.20       218

   micro avg       0.20      0.53      0.29      3077
   macro avg       0.19      0.53      0.28      3077
weighted avg       0.23      0.53      0.31      3077
 samples avg       0.19      0.40      0.23      3077



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# -------------------- Upweight the Minority Class -------------------- 
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.utils import class_weight

counts = np.sum(y_train == 1, axis=0)

target_value = np.sum(counts) / len(counts)

# Calculate weights for each column
weights = target_value / counts

class_weights = {i: weights[i] for i in range(len(weights))}



In [14]:
brf_classifier_oversample_weights = MultiOutputClassifier(BalancedRandomForestClassifier(random_state=0, n_estimators=1000, class_weight=class_weights))

# Continue with training and evaluation as usual
brf_classifier_oversample_weights.fit(X_train, y_train)

  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [15]:
svm_predictions = brf_classifier_oversample_weights.predict(X_test)

svm_accuracy = accuracy_score(y_test, svm_predictions)
print(f"Accuracy: {svm_accuracy}")

print(classification_report(y_test, svm_predictions,zero_division=0))

Accuracy: 0.013131313131313131
              precision    recall  f1-score   support

           0       0.36      0.52      0.42       664
           1       0.28      0.57      0.37       456
           2       0.25      0.55      0.34       426
           3       0.20      0.54      0.29       328
           4       0.14      0.53      0.22       249
           5       0.14      0.47      0.22       276
           6       0.11      0.43      0.17       225
           7       0.15      0.55      0.23       235
           8       0.12      0.52      0.20       218

   micro avg       0.19      0.53      0.28      3077
   macro avg       0.19      0.52      0.27      3077
weighted avg       0.23      0.53      0.31      3077
 samples avg       0.19      0.40      0.23      3077



In [9]:
# Deep Learning model 
complex1NNmodel = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.5),  # Dropout for regularization
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Dropout for regularization
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Dropout for regularization
    tf.keras.layers.Dense(9, activation='sigmoid')  # Sigmoid activation for multi-label classification
])

complex1NNmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'],sample_weight_mode=class_weights)


complex1NNmodel.fit(X_train, y_train, epochs=20, batch_size=32, validation_split=0.2)

2024-04-27 14:55:34.800176: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x47b7d0c90>

In [10]:
y_pred = complex1NNmodel.predict(X_test)
y_pred_binary = (y_pred > 0.2).astype(int)


print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

           0       0.34      1.00      0.50       664
           1       0.23      1.00      0.37       456
           2       0.22      1.00      0.35       426
           3       0.00      0.00      0.00       328
           4       0.00      0.00      0.00       249
           5       0.00      0.00      0.00       276
           6       0.00      0.00      0.00       225
           7       0.00      0.00      0.00       235
           8       0.00      0.00      0.00       218

   micro avg       0.26      0.50      0.34      3077
   macro avg       0.09      0.33      0.14      3077
weighted avg       0.14      0.50      0.21      3077
 samples avg       0.26      0.40      0.29      3077



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
