In [489]:
# Code to create a model used to predict whether the individual patient has an allergy
# Use different notebook to load the model and return a prediction

import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import tensorflow
import keras

import numpy as np
np.random.seed(123)

In [490]:
import matplotlib
from keras.models import * 
from keras.layers import *
from keras.optimizers import RMSprop
import pandas as pd

# Import both datasets, change to local path when running
patients = pd.read_excel(r"./model_dir/PATIENTS_Nov_3_2023_V4_sfm-data.xlsx", sheet_name="Level2_AI_Patient Traits")

allergies = pd.read_excel(r"./model_dir/PATIENTS_Nov_3_2023_V4_sfm-data.xlsx", sheet_name="Level1_Patient Allergens")

In [491]:
# Merge columns by ID if needed
patientAllergies = patients.merge(allergies, on = "SFM Id")
# Comment line before committing
# patientAllergies.head()

In [492]:
# Drop ID and location columns from dataframe
patientsTrimmed = patients.drop(['SFM Id', 'City', 'State', 'Country'], axis = 1)

In [493]:
# One-hot encode Gender column
patientsTrimmed['Gender'] = pd.Categorical(patientsTrimmed['Gender'].str.strip())
gender_onehot = pd.get_dummies(patientsTrimmed['Gender'], prefix = "Gender",
                                    prefix_sep = "-", dtype = int)
patientsTrimmed = patientsTrimmed.drop('Gender', axis = 1)
patientsTrimmed = patientsTrimmed.join(gender_onehot)

In [494]:
# One-hot encode SkinTone column
patientsTrimmed['SkinTone'] = pd.Categorical(patientsTrimmed['SkinTone'].str.strip())
skintone_onehot = pd.get_dummies(patientsTrimmed['SkinTone'], prefix = "SkinTone",
                                    prefix_sep = "-", dtype = int)
patientsTrimmed = patientsTrimmed.drop('SkinTone', axis = 1)
patientsTrimmed = patientsTrimmed.join(skintone_onehot)

In [495]:
# One-hot encode FitzPatrickSkinPhotoType column
patientsTrimmed['FitzpatrickSkinPhotoType'] = pd.Categorical(patientsTrimmed['FitzpatrickSkinPhotoType'].str.strip())
# Dropping first here since it is a blank variable in the column
fitzpatrick_onehot = pd.get_dummies(patientsTrimmed['FitzpatrickSkinPhotoType'], prefix = "Fitzpatrick",
                                    prefix_sep = "-", drop_first = True, dtype = int)
patientsTrimmed = patientsTrimmed.drop('FitzpatrickSkinPhotoType', axis = 1)
patientsTrimmed = patientsTrimmed.join(fitzpatrick_onehot)

In [496]:
# Switching to TextVectorization (Tokenizer is deprecated)
from keras.layers import TextVectorization
from sklearn.preprocessing import MultiLabelBinarizer
mlb_1 = MultiLabelBinarizer()

mlb_1 = MultiLabelBinarizer()

# Replace commas with whitespace
patientsTrimmed['SkinConditions'] = patientsTrimmed['SkinConditions'].str.replace(',', ' ')
# Set the max length based on whitespace characters
max_len = patientsTrimmed['SkinConditions'].str.count(' ').max()

patientsTrimmed['SkinConditions'] = patientsTrimmed['SkinConditions'].str.split()

# Remove items containing the word "Other" from the list
patientsTrimmed['SkinConditions'] = patientsTrimmed['SkinConditions'].apply(lambda x: [item for item in x if "Other" not in item])

# One-hot encode data
skinConditions = mlb_1.fit_transform(patientsTrimmed['SkinConditions'])


# remove rare conditions 
skinConditions = np.delete(skinConditions,[8,9], axis=1)

# Get the list of encoded classes
encoded_classes = list(mlb_1.classes_)


In [497]:
patientsTrimmed = patientsTrimmed.drop('SkinConditions', axis = 1)
patientsArr = patientsTrimmed.values

# input_data = skinConditions

input_data = np.concatenate((patientsArr, skinConditions), axis = 1)

# input_data = input_data[:, 1:]

In [498]:
# Drop ID column for preprocessing - ID should have no effect on prediction
allergiesNoId = allergies.drop('SFM Id', axis = 1)
# Remove all non-digit characters, then replace empty cells with NaN
allergiesNoId = allergiesNoId.replace(r'\D+', '', regex = True).replace('', np.nan)
# Set all NaN cells to 0
allergiesNoId = allergiesNoId.fillna(0)
# Convert entire dataframe to integer
allergiesNoId = allergiesNoId.astype(int)

In [499]:
allergiesNoId['AllergiesList'] = allergiesNoId.astype(str).apply(' '.join, axis=1)
allergiesNoId['AllergiesList'] = allergiesNoId['AllergiesList'].str.split()

In [500]:

# Create MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

allergiesArray = np.array(allergiesNoId['AllergiesList'])
# One-hot encode data
allergiesArray = mlb.fit_transform(allergiesArray)

print (list(mlb.classes_))

transposed_array = list(map(list, zip(*allergiesArray)))

# Count the number of 1s in each column
column_counts = [column.count(1) for column in transposed_array]

# Create a list of (column_index, count) tuples and rank them
ranked_columns = sorted(enumerate(column_counts, 1), key=lambda x: x[1], reverse=True)

# print the results
# for rank, (column_index, count) in enumerate(ranked_columns, 1):
#     print(f"Rank {rank}: Column {column_index} - Number of 1s: {count}")

top_rank_n = 10

top_column_list = [row[0] for row in ranked_columns]

top_column_list = top_column_list[0:top_rank_n]

top_rank_allergies = [[row[col -1] for col in top_column_list] for row in allergiesArray]

top_rank_allergies


allergiesArray = np.array(top_rank_allergies)

# Debug only getting one Colum 
# allergiesArray = np.array([row[1] for row in allergiesArray])

# Debug for removing first 1s col 
allergiesArray = allergiesArray[:, 1:]


['0', '100612', '100613', '100702', '100857', '102', '10260', '103637', '104', '1043', '104630', '105017', '10537', '10538', '10539', '10541', '10544', '10546', '10552', '105611', '106344', '106518', '107036', '1107', '111858', '112', '112022', '1126', '113', '1130', '1131', '114559', '1149', '1153', '11661', '11662', '11686', '117', '1177', '11790', '119', '11996', '121146', '121641', '121642', '121652', '122055', '12344', '123717', '124', '124537', '124737', '12522', '128', '128564', '128565', '129742', '129747', '13', '130118', '13018', '130735', '131994', '131995', '131996', '131997', '132762', '133', '133255', '133260', '133266', '133268', '133269', '133270', '133271', '133272', '133273', '133274', '133275', '133628', '133629', '133714', '134', '134256', '134795', '134800', '135153', '136', '136640', '136703', '136704', '13691', '13708', '137787', '137788', '137789', '137790', '137791', '137792', '137793', '137794', '137795', '1385', '1387', '13881', '138878', '13891', '139001', '

In [501]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from keras.losses import categorical_crossentropy
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import classification_report
from sklearn.datasets import make_multilabel_classification

# SVC Model
# x_1, y_1 = make_multilabel_classification(n_classes=3, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(input_data, allergiesArray, test_size=0.2, random_state=42, shuffle=True)

# X_train, X_test, y_train, y_test = train_test_split(x_1, y_1, test_size=0.2, random_state=42, shuffle=True)

svm_classifier = MultiOutputClassifier(SVC(kernel="rbf"))

svm_classifier.fit(X_train, y_train)

svm_predictions = svm_classifier.predict(X_test)

svm_accuracy = accuracy_score(y_test, svm_predictions)
print(f"Accuracy: {svm_accuracy}")

print(classification_report(y_test, svm_predictions))


Accuracy: 0.24141414141414141
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       664
           1       0.00      0.00      0.00       456
           2       0.00      0.00      0.00       426
           3       0.00      0.00      0.00       328
           4       0.00      0.00      0.00       249
           5       0.00      0.00      0.00       276
           6       0.00      0.00      0.00       225
           7       0.00      0.00      0.00       235
           8       0.00      0.00      0.00       218

   micro avg       0.00      0.00      0.00      3077
   macro avg       0.00      0.00      0.00      3077
weighted avg       0.00      0.00      0.00      3077
 samples avg       0.00      0.00      0.00      3077



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [504]:
# Deep Learning model 
complex1NNmodel = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dropout(0.5),  # Dropout for regularization
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Dropout for regularization
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Dropout for regularization
    tf.keras.layers.Dense(9, activation='sigmoid')  # Sigmoid activation for multi-label classification
])

complex1NNmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


complex1NNmodel.fit(X_train, y_train, epochs=1000, batch_size=32, validation_split=0.2)


Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
E

<keras.callbacks.History at 0x2daa9fb90>

In [505]:
y_pred = complex1NNmodel.predict(X_test)
y_pred_binary = (y_pred > 0.2).astype(int)


print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

           0       0.34      1.00      0.50       664
           1       0.23      1.00      0.37       456
           2       0.22      1.00      0.35       426
           3       0.00      0.00      0.00       328
           4       0.00      0.00      0.00       249
           5       0.00      0.00      0.00       276
           6       0.00      0.00      0.00       225
           7       0.00      0.00      0.00       235
           8       0.00      0.00      0.00       218

   micro avg       0.26      0.50      0.34      3077
   macro avg       0.09      0.33      0.14      3077
weighted avg       0.14      0.50      0.21      3077
 samples avg       0.26      0.40      0.29      3077



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [503]:
# Random Forest Model
from sklearn.ensemble import RandomForestClassifier

# SVC Model
# x_1, y_1 = make_multilabel_classification(n_classes=3, random_state=0)

X_train, X_test, y_train, y_test = train_test_split(input_data, allergiesArray, test_size=0.2, random_state=42, shuffle=True)

# X_train, X_test, y_train, y_test = train_test_split(x_1, y_1, test_size=0.2, random_state=42, shuffle=True)

svm_classifier = MultiOutputClassifier(RandomForestClassifier(random_state=0,n_estimators=1000))

svm_classifier.fit(X_train, y_train)

svm_predictions = svm_classifier.predict(X_test)

svm_accuracy = accuracy_score(y_test, svm_predictions)
print(f"Accuracy: {svm_accuracy}")

print(classification_report(y_test, svm_predictions))

Accuracy: 0.15252525252525254
              precision    recall  f1-score   support

           0       0.37      0.26      0.30       664
           1       0.28      0.17      0.21       456
           2       0.24      0.15      0.18       426
           3       0.22      0.10      0.14       328
           4       0.17      0.07      0.10       249
           5       0.17      0.06      0.09       276
           6       0.16      0.05      0.08       225
           7       0.19      0.08      0.11       235
           8       0.08      0.03      0.04       218

   micro avg       0.26      0.13      0.18      3077
   macro avg       0.21      0.11      0.14      3077
weighted avg       0.24      0.13      0.17      3077
 samples avg       0.14      0.11      0.11      3077



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [425]:
# Evaluate the model on the test set
from sklearn.datasets import make_multilabel_classification
from sklearn.linear_model import LogisticRegression

# X, y = make_multilabel_classification(n_classes=30, random_state=0)
# clf = MultiOutputClassifier(LogisticRegression()).fit(X, y)
# clf.predict(X[-2:])

In [192]:

# Print classification report


In [193]:
# Calculate accuracy for each class
svm1accuracies = []
for i in range(y_test.shape[1]):
    svm1class_accuracy = np.mean((y_test[:, i] == (y_pred[:, i] > 0.5).astype(int)))
    svm1accuracies.append(svm1class_accuracy)

# Print the accuracy for each class
for i, acc in enumerate(svm1accuracies):
    print(f"Accuracy for Class {i}: {acc:.2%}")

Accuracy for Class 0: 66.36%
Accuracy for Class 1: 76.87%
Accuracy for Class 2: 78.54%
Accuracy for Class 3: 83.43%
