In [21]:
# Code to create a model used to predict whether the individual patient has an allergy
# Use different notebook to load the model and return a prediction

import os
# Select the Keras backend through an environment variable *before* keras is imported below.
# NOTE(review): Keras presumably reads KERAS_BACKEND at import time, so this line must stay
# above `import keras` — confirm against the installed Keras version.
os.environ["KERAS_BACKEND"] = "tensorflow"
import tensorflow
import keras

import numpy as np
# Seed NumPy's global RNG only.
# NOTE(review): nothing in this cell seeds TensorFlow/Keras, so weight initialisation and
# batch shuffling are likely still non-deterministic between runs — confirm if reproducibility matters.
np.random.seed(123)

In [22]:
import matplotlib
from keras.models import * 
from keras.layers import *
from keras.optimizers import RMSprop
import pandas as pd

# Import both datasets from the same workbook.
# Change PATIENT_WORKBOOK to the local path when running; it is defined once so the
# two sheets can never be read from different files by accident.
PATIENT_WORKBOOK = r"C:\Users\me\OneDrive\Desktop\Senior Design\Allergy_SanFrancisco\PATIENTS_Nov_3_2023_V4_sfm-data.xlsx"
PATIENT_SHEET = "Level2_AI_Patient Traits"
ALLERGEN_SHEET = "Level1_Patient Allergens"

# Passing a list of sheet names opens and parses the workbook a single time and
# returns a dict of DataFrames keyed by sheet name.
_workbook_sheets = pd.read_excel(PATIENT_WORKBOOK, sheet_name=[PATIENT_SHEET, ALLERGEN_SHEET])
patients = _workbook_sheets[PATIENT_SHEET]
allergies = _workbook_sheets[ALLERGEN_SHEET]

In [23]:
# Confirm that patient sheet imported correctly
# Comment out the line below before committing
# patients.head()

In [24]:
# Confirm allergy sheet imported correctly
# Comment out the line below before committing
# allergies.head()

In [25]:
# Join the patient traits with the allergen sheet on the shared "SFM Id" key
# (pandas' default join type, i.e. only IDs present in both sheets are kept).
patientAllergies = pd.merge(patients, allergies, on="SFM Id")
# Uncomment to inspect the merged frame; keep it commented out when committing
# patientAllergies.head()

In [26]:
# Strip the identifier and the location fields from the patient traits
id_and_location_cols = ['SFM Id', 'City', 'State', 'Country']
patientsTrimmed = patients.drop(columns=id_and_location_cols)

In [27]:
# One-hot encode Gender: strip surrounding whitespace, treat the result as a
# categorical, and expand it into one integer indicator column per category.
gender_categorical = patientsTrimmed['Gender'].str.strip().astype('category')
gender_onehot = pd.get_dummies(
    gender_categorical,
    prefix="Gender",
    prefix_sep="-",
    dtype=int,
)
# Replace the raw text column with its indicator columns ("Gender-<value>")
patientsTrimmed = patientsTrimmed.drop(columns='Gender').join(gender_onehot)

In [28]:
# One-hot encode SkinTone: strip surrounding whitespace, treat the result as a
# categorical, and expand it into one integer indicator column per category.
skintone_categorical = patientsTrimmed['SkinTone'].str.strip().astype('category')
skintone_onehot = pd.get_dummies(
    skintone_categorical,
    prefix="SkinTone",
    prefix_sep="-",
    dtype=int,
)
# Replace the raw text column with its indicator columns ("SkinTone-<value>")
patientsTrimmed = patientsTrimmed.drop(columns='SkinTone').join(skintone_onehot)

In [29]:
# One-hot encode FitzpatrickSkinPhotoType: strip surrounding whitespace, treat the
# result as a categorical, and expand it into integer indicator columns.
fitzpatrick_categorical = (
    patientsTrimmed['FitzpatrickSkinPhotoType'].str.strip().astype('category')
)
# drop_first discards the first category, which the original author expects to be
# the blank entry in this column.
# NOTE(review): drop_first removes whichever category comes first; that is the blank
# only if blanks are empty strings (not NaN) — confirm against the sheet.
fitzpatrick_onehot = pd.get_dummies(
    fitzpatrick_categorical,
    prefix="Fitzpatrick",
    prefix_sep="-",
    drop_first=True,
    dtype=int,
)
# Replace the raw text column with its indicator columns ("Fitzpatrick-<value>")
patientsTrimmed = patientsTrimmed.drop(columns='FitzpatrickSkinPhotoType').join(fitzpatrick_onehot)

In [30]:
# Switching to TextVectorization (Tokenizer is deprecated)
from keras.layers import TextVectorization

# Turn the comma-separated condition list into a whitespace-separated one and
# store it back on the frame (later cells read the column again).
skin_conditions_text = patientsTrimmed['SkinConditions'].str.replace(',', ' ')
patientsTrimmed['SkinConditions'] = skin_conditions_text

# Output length = the largest number of space characters found in any row.
# NOTE(review): this counts spaces, not tokens; a row of N single-spaced tokens has
# only N-1 spaces — confirm the longest row is not being truncated.
max_len = skin_conditions_text.str.count(' ').max()

# Vectorizer that splits on whitespace and pads/truncates every row to max_len ids
vectorizer = TextVectorization(output_sequence_length=max_len, split='whitespace')

In [31]:
# Fit the vectorizer's vocabulary on the SkinConditions text
skin_conditions_column = patientsTrimmed['SkinConditions']
vectorizer.adapt(skin_conditions_column.values)

# Encode every row as a fixed-length sequence of token ids, held as a NumPy array
skinConditions = vectorizer(skin_conditions_column).numpy()

# The text column now lives in skinConditions, so remove it from the frame
patientsTrimmed = patientsTrimmed.drop(columns='SkinConditions')

In [32]:
# Place the tabular patient features and the vectorized skin conditions side by
# side (column-wise) to form the model's input matrix.
patientsArr = patientsTrimmed.to_numpy()
input_data = np.concatenate([patientsArr, skinConditions], axis=1)

In [None]:
# Disabled legacy approach, kept for reference only. The whole cell is a bare
# triple-quoted string literal, so none of the code inside it runs and it does not
# change any variable; it superseded by the TextVectorization cells above.
''' Old code for SkinConditions conversion - leaving in case we need later
# Code to preprocess the SkinConditions column, tokenizing should be more recognizable to keras than regular text
## Delete if this idea does not work
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Flatten entries into actual lists
patientsTrimmed['SkinConditions'] = patientsTrimmed['SkinConditions'].apply(lambda x: ' '.join(x))
# Tokenize the characters of the columns
tokenizer = Tokenizer()
tokenizer.fit_on_texts(patientsTrimmed['SkinConditions'])
# Create and pad sequences of the tokens
patientsTrimmed['SkinConditions'] = tokenizer.texts_to_sequences(patientsTrimmed['SkinConditions'])
max_length = max(len(seq) for seq in patientsTrimmed['SkinConditions'])
testVar = pad_sequences(patientsTrimmed['SkinConditions'], maxlen = max_length, padding = "post")
patientsTrimmed['SkinConditions'] = testVar.tolist()
'''

In [34]:
# Build the label frame in one pass. The ID is dropped because it should have no
# effect on prediction.
allergiesNoId = (
    allergies
    .drop(columns='SFM Id')
    .replace(r'\D+', '', regex=True)   # remove every run of non-digit characters
    .replace('', np.nan)                # cells left empty become NaN
    .fillna(0)                          # NaN cells become 0
    .astype(int)                        # whole frame as integers
)

In [35]:
# Collapse each row into a list of its cell values as strings: stringify every
# cell, join the row with spaces, then split it back into tokens.
# NOTE(review): cells that were filled with 0 upstream appear here as the token '0'
# and will be treated like any other label downstream — confirm that is intended.
row_label_tokens = allergiesNoId.astype(str).apply(' '.join, axis=1).str.split()
allergiesNoId['AllergiesList'] = row_label_tokens

In [37]:
from sklearn.preprocessing import MultiLabelBinarizer

# Binarizer that turns each row's list of labels into a 0/1 indicator row
mlb = MultiLabelBinarizer()

# Fit on every row's label list and encode it in the same step
allergy_label_lists = np.array(allergiesNoId['AllergiesList'])
allergiesArray = mlb.fit_transform(allergy_label_lists)

In [38]:
# Map the binarized label matrix back to per-row label collections.
# NOTE(review): allergiesNew is not referenced again in the visible cells — presumably
# a round-trip sanity check; confirm before relying on or removing it.
allergiesNew = mlb.inverse_transform(allergiesArray)

In [None]:
from statistics import mean

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score


def _build_model(n_features, n_classes):
    """Return a freshly initialised, compiled multilabel classifier.

    A new model is built for every fold so that no weights carry over between folds.

    Args:
        n_features: Number of columns in the input matrix.
        n_classes: Number of output labels (columns of the binarized target).

    Returns:
        A compiled ``keras.Model`` with two ReLU hidden layers and a sigmoid output,
        tracking AUC, precision and recall (in that order) after the loss.
    """
    # Input layer
    inputs = keras.Input((n_features,))

    # Hidden layers
    hidden = Dense(128, activation='relu')(inputs)
    hidden = Dense(1024, activation='relu')(hidden)

    # Output layer - sigmoid + binary cross-entropy: each label is an independent yes/no
    predictions = Dense(n_classes, activation='sigmoid')(hidden)

    # Tie model together
    model = keras.Model(inputs=inputs, outputs=predictions)
    model.compile(
        loss='binary_crossentropy',
        optimizer='sgd',
        metrics=[keras.metrics.AUC(), keras.metrics.Precision(), keras.metrics.Recall()],
    )
    return model


cv = KFold(n_splits=20, shuffle=True, random_state=1023)

# Take the layer sizes from the data instead of hard-coding them (previously 40 and
# 731), so the model keeps matching the preprocessing if the sheets change.
n_features = input_data.shape[1]
n_classes = allergiesArray.shape[1]

# Per-fold metrics, all stored as percentages
# AUC (Area Under Curve): how well the model can classify into the classes (high numbers better)
auc_per_fold = []
# Precision: how well model predicts target class (high numbers better)
prec_per_fold = []
# Recall: how many objects model can find (high numbers better)
rec_per_fold = []
# Exact-match accuracy: share of test patients whose full label row is predicted correctly
acc_per_fold = []

# Train and evaluate a fresh model for each split
for fold_count, (train, test) in enumerate(cv.split(input_data, allergiesArray), start=1):
    model = _build_model(n_features, n_classes)

    print("Running fold #: ", fold_count)

    fold_train_x = input_data[train]

    history = model.fit(
        fold_train_x, allergiesArray[train],
        epochs=20,
        verbose=1,
        validation_split=0.2,
    )

    # TODO Save the model

    fold_test_x = input_data[test]
    fold_test_y = allergiesArray[test]

    # evaluate() returns [loss, auc, precision, recall] in the order given to compile()
    scores = model.evaluate(x=fold_test_x, y=fold_test_y, verbose=0)
    auc_per_fold.append(scores[1] * 100)
    prec_per_fold.append(scores[2] * 100)
    rec_per_fold.append(scores[3] * 100)

    # Threshold the predicted probabilities at 0.5 to get hard 0/1 labels
    raw_probs = model.predict(fold_test_x, verbose=1)
    probs = (raw_probs >= 0.5).astype(int)

    # Make one example human readable
    print("X values for test: ", fold_test_x[0])
    yList = mlb.inverse_transform(fold_test_y)
    print("Actual y values as list: ", yList[0])
    probsList = mlb.inverse_transform(probs)
    print("Predicted values as list: ", probsList[0])

    # Exact-match accuracy: a sample counts only if every label in its row matches.
    # Reduce over the label axis only (axis=1); reducing over both axes collapses the
    # whole fold into a single True/False and makes the mean meaningless.
    accuracy = float((fold_test_y == probs).all(axis=1).mean())
    acc_per_fold.append(accuracy * 100)

print("\n\n\n")
for fold_auc, fold_prec, fold_rec, fold_acc in zip(auc_per_fold, prec_per_fold, rec_per_fold, acc_per_fold):
    print("The area under curve for this fold is: ", fold_auc)
    print("The precision for this fold is: ", fold_prec)
    print("The recall for this fold is: ", fold_rec)
    print("The exact-match accuracy for this fold is: ", fold_acc)

print("Mean area under curve: ", mean(auc_per_fold))
print("Mean precision: ", mean(prec_per_fold))
print("Mean recall: ", mean(rec_per_fold))
print("Mean exact-match accuracy: ", mean(acc_per_fold))

In [None]:
# TODO: Save Keras model as separate file