In [1]:
# Code to create a model used to predict whether the individual patient has an allergy
# Use different notebook to load the model and return a prediction

import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import tensorflow as tf
import keras

import numpy as np
np.random.seed(123)




In [2]:
import matplotlib
from keras.models import * 
from keras.layers import *
from keras.optimizers import RMSprop
import pandas as pd

# Import both datasets, change to local path when running
patients = pd.read_excel(r"C:\Users\me\OneDrive\Desktop\Senior Design\Allergy_SanFrancisco\PATIENTS_Nov_3_2023_V4_sfm-data.xlsx", sheet_name="Level2_AI_Patient Traits")

allergies = pd.read_excel(r"C:\Users\me\OneDrive\Desktop\Senior Design\Allergy_SanFrancisco\PATIENTS_Nov_3_2023_V4_sfm-data.xlsx", sheet_name="Level1_Patient Allergens")

In [3]:
# Confirm that patient sheet imported correctly
# Comment below line before committing
# patients['SkinConditions'].head()

In [4]:
# Confirm allergy sheet imported correctly
# Comment line before committing
# allergies.head()

In [5]:
# Merge columns by ID if needed
patientAllergies = patients.merge(allergies, on = "SFM Id")
# Comment line before committing
# patientAllergies.head()

In [6]:
# Drop ID and location columns from dataframe
patientsTrimmed = patients.drop(['SFM Id', 'City', 'State', 'Country'], axis = 1)

In [7]:
# One-hot encode Gender column
patientsTrimmed['Gender'] = pd.Categorical(patientsTrimmed['Gender'].str.strip())
gender_onehot = pd.get_dummies(patientsTrimmed['Gender'], prefix = "Gender",
                                    prefix_sep = "-", dtype = int)
patientsTrimmed = patientsTrimmed.drop('Gender', axis = 1)
patientsTrimmed = patientsTrimmed.join(gender_onehot)

In [8]:
# One-hot encode SkinTone column
patientsTrimmed['SkinTone'] = pd.Categorical(patientsTrimmed['SkinTone'].str.strip())
skintone_onehot = pd.get_dummies(patientsTrimmed['SkinTone'], prefix = "SkinTone",
                                    prefix_sep = "-", dtype = int)
patientsTrimmed = patientsTrimmed.drop('SkinTone', axis = 1)
patientsTrimmed = patientsTrimmed.join(skintone_onehot)

In [9]:
# One-hot encode FitzPatrickSkinPhotoType column
patientsTrimmed['FitzpatrickSkinPhotoType'] = pd.Categorical(patientsTrimmed['FitzpatrickSkinPhotoType'].str.strip())
# Dropping first here since it is a blank variable in the column
fitzpatrick_onehot = pd.get_dummies(patientsTrimmed['FitzpatrickSkinPhotoType'], prefix = "Fitzpatrick",
                                    prefix_sep = "-", drop_first = True, dtype = int)
patientsTrimmed = patientsTrimmed.drop('FitzpatrickSkinPhotoType', axis = 1)
patientsTrimmed = patientsTrimmed.join(fitzpatrick_onehot)

In [10]:
# Switching to TextVectorization (Tokenizer is deprecated)
from keras.layers import TextVectorization
from urllib.parse import unquote

# decode from url-encoding to regular text
def decode_url(url):
    return unquote(url)

# Replace commas with whitespace
patientsTrimmed['SkinConditions'] = patientsTrimmed['SkinConditions'].apply(decode_url)
patientsTrimmed['SkinConditions'] = patientsTrimmed['SkinConditions'].str.replace(',', ' ')
patientsTrimmed['SkinConditions'] = patientsTrimmed['SkinConditions'].str.replace(':', ' ')

# Set the max length based on whitespace characters
max_len = patientsTrimmed['SkinConditions'].str.count(' ').max()
# Create TextVectorization object, separating on whitespace and using the max_len from earlier
vectorizer = TextVectorization(split = 'whitespace', output_sequence_length = max_len)




In [11]:
# Adapt using the column we want to convert
vectorizer.adapt(patientsTrimmed['SkinConditions'].values)
# Reset the column after converting values to vector and placing in array
skinConditions = vectorizer(patientsTrimmed['SkinConditions']).numpy()
patientsTrimmed = patientsTrimmed.drop('SkinConditions', axis = 1)
vectorizer.get_vocabulary()




['',
 '[UNK]',
 'allergiccontactdermatitis',
 'sensitiveskinallergistdiagnosed',
 'skinallergies',
 'eczemaatopicskin',
 'drychappedskin',
 'sensitiveskinselfdiagnosed',
 'finelineswrinkles',
 'acnepimples',
 'blackheadswhiteheads',
 'discolorationhyperpigmentation',
 'rosacea',
 'psoriasis',
 'other',
 'none',
 'dermatitis',
 'and',
 'allergy',
 'lichen',
 'hives',
 'to',
 'skin',
 'seborrheic',
 'allergies',
 'scalp',
 'rash',
 'of',
 'allergic',
 'dry',
 'nickel',
 'itchy',
 'i',
 'planus',
 'perioral',
 'on',
 'my',
 'contact',
 'balsam',
 'red',
 'pilaris',
 'itching',
 'wheat',
 'prurigo',
 'peru',
 'mix',
 'lupus',
 'keratosis',
 'gluten',
 'fragrance',
 'eye',
 'eczema',
 'dermatographia',
 'vitiligo',
 'soy',
 'sensitive',
 'lips',
 'face',
 'eyelid',
 'disease',
 'coconut',
 'cocamidopropyl',
 'celiac',
 'betaine',
 'urticaria',
 'the',
 'syndrome',
 'swelling',
 'sclerosis',
 'rashes',
 'propolis',
 'products',
 'ocular',
 'nodularis',
 'mouth',
 'legs',
 'latex',
 'inflamma

In [12]:
patientsArr = patientsTrimmed.values
input_data = np.concatenate((patientsArr, skinConditions), axis = 1)

In [13]:
# Drop ID column for preprocessing - ID should have no effect on prediction
allergiesNoId = allergies.drop('SFM Id', axis = 1)
# Remove all non-digit characters, then replace empty cells with NaN
allergiesNoId = allergiesNoId.replace(r'\D+', '', regex = True).replace('', np.nan)
# Set all NaN cells to 0
allergiesNoId = allergiesNoId.fillna(0)
# Convert entire dataframe to integer
allergiesNoId = allergiesNoId.astype(int)

In [14]:
allergiesNoId['AllergiesList'] = allergiesNoId.astype(str).apply(' '.join, axis=1)
allergiesNoId['AllergiesList'] = allergiesNoId['AllergiesList'].str.split()

In [15]:
from sklearn.preprocessing import MultiLabelBinarizer
# Create MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

allergiesArr = np.array(allergiesNoId['AllergiesList'])
# Multi-hot encode data
allergiesArray = mlb.fit_transform(allergiesArr)

In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, X_train_text, X_test_text, y_train, y_test = train_test_split(patientsArr, skinConditions, allergiesArray)

In [17]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, multilabel_confusion_matrix
from keras.layers import Average

cv = KFold(n_splits=5, shuffle=True, random_state=7869)

fold_count = 1
# Train the model for each split
# Define the model inside the for loop
for train, test in cv.split(input_data,allergiesArray):

    n_classes = 731

    # Input layer
    input_shape = (77,)
    inputs = keras.Input(input_shape)

    x1 = Dense(256, activation = 'sigmoid')(inputs)
    x1 = Dense(512, activation = 'sigmoid')(x1)
    predictions1 = Dense(n_classes, activation='sigmoid')(x1)
    
    model1 = keras.Model(inputs = inputs, outputs = predictions1)
    model1.compile(loss=keras.losses.BinaryFocalCrossentropy(apply_class_balancing=True, alpha = 0.35, gamma = 14), optimizer=keras.optimizers.Adam(learning_rate=0.1))
    
    x2 = Dense(128, activation = "sigmoid")(inputs)
    predictions2 = Dense(731, activation  ="sigmoid")(x2)
    model2 = keras.Model(inputs = inputs, outputs = predictions2)
    model2.compile(loss=keras.losses.BinaryFocalCrossentropy(apply_class_balancing=True, alpha = 0.2, gamma = 10), optimizer=keras.optimizers.Adam(learning_rate=0.001))
    
    x3 = Dense(385, activation = "sigmoid")(inputs)
    x3 = Dropout(0.4)(x3)
    x3 = Dense(233, activation = "sigmoid")(x3)
    predictions3 = Dense(731, activation  ="sigmoid")(x3)
    model3 = keras.Model(inputs = inputs, outputs = predictions3)
    model3.compile(loss=keras.losses.BinaryFocalCrossentropy(apply_class_balancing=True, alpha = 0.40, gamma = 13), optimizer=keras.optimizers.RMSprop(learning_rate=0.1))
    
    x4 = Dense(385, activation = "sigmoid")(inputs)
    predictions4 = Dense(731, activation  ="sigmoid")(x4)
    model4 = keras.Model(inputs = inputs, outputs = predictions4)
    model4.compile(loss=keras.losses.BinaryFocalCrossentropy(apply_class_balancing=True, alpha = 0.35, gamma = 17), optimizer=keras.optimizers.Adam(learning_rate=0.01))
    
    print("Running fold #: ", fold_count)

    fold_train_x = input_data[train]
    
    history1 = model1.fit(
    fold_train_x, allergiesArray[train],
    epochs = 20, 
    verbose = 0, 
    batch_size = 20, 
    validation_split=0.1
    )
    
    history2 = model2.fit(
    fold_train_x, allergiesArray[train],
    epochs = 20, 
    verbose = 0, 
    batch_size = 4, 
    validation_split=0.3
    )
        
    history3 = model3.fit(
    fold_train_x, allergiesArray[train],
    epochs = 20, 
    verbose = 0, 
    batch_size = 20, 
    validation_split=0.1
    )
            
    history4 = model4.fit(
    fold_train_x, allergiesArray[train],
    epochs = 20, 
    verbose = 0, 
    batch_size = 20, 
    validation_split=0.2
    )

    fold_test_x = input_data[test]
    y_true = allergiesArray[test]
    
    # pred_y = model.predict_classes(fold_test_x, verbose = 1)
    probs1 = model1.predict(fold_test_x, verbose = 0)
    probs2 = model2.predict(fold_test_x, verbose = 0)
    probs3 = model3.predict(fold_test_x, verbose = 0)
    probs4 = model4.predict(fold_test_x, verbose = 0)
    
    probs = (probs1 + probs2 + probs3 + probs4) / 4.0
    threshold = 0.425
    probs1 = (probs1 > threshold).astype(int)
    probs2 = (probs2 > threshold).astype(int)
    probs3 = (probs3 > threshold).astype(int)
    probs4 = (probs4 > threshold).astype(int)
    probs = (probs > threshold).astype(int)
    
    # AUC (Area Under Curve): how well the model can classify into the classes (high numbers better)
    auc = keras.metrics.AUC(multi_label = True, num_labels = 731, from_logits = False)
    # Precision: how well model predicts target class (high numbers better)\
    prec = keras.metrics.Precision()
    # Recall: how many objects model can find (high numbers better)
    rec = keras.metrics.Recall()
    # F1Score: mean between precision and recall
    f1 = keras.metrics.F1Score(average = 'weighted')

    auc.update_state(y_true, probs1)
    print("1 AUC: ", auc.result().numpy())
    prec.update_state(y_true, probs1)
    print("1 prec: ", prec.result().numpy())
    rec.update_state(y_true, probs1)
    print("1 rec: ", rec.result().numpy())
    f1.update_state(y_true, probs1)
    print("1 f1: ", f1.result().numpy())
    
    # AUC (Area Under Curve): how well the model can classify into the classes (high numbers better)
    auc = keras.metrics.AUC(multi_label = True, num_labels = 731, from_logits = False)
    # Precision: how well model predicts target class (high numbers better)\
    prec = keras.metrics.Precision()
    # Recall: how many objects model can find (high numbers better)
    rec = keras.metrics.Recall()
    # F1Score: mean between precision and recall
    f1 = keras.metrics.F1Score(average = 'weighted')

    auc.update_state(y_true, probs2)
    print("2 AUC: ", auc.result().numpy())
    prec.update_state(y_true, probs2)
    print("2 prec: ", prec.result().numpy())
    rec.update_state(y_true, probs2)
    print("2 rec: ", rec.result().numpy())
    f1.update_state(y_true, probs2)
    print("2 f1: ", f1.result().numpy())
    
    # AUC (Area Under Curve): how well the model can classify into the classes (high numbers better)
    auc = keras.metrics.AUC(multi_label = True, num_labels = 731, from_logits = False)
    # Precision: how well model predicts target class (high numbers better)
    prec = keras.metrics.Precision()
    # Recall: how many objects model can find (high numbers better)
    rec = keras.metrics.Recall()
    # F1Score: mean between precision and recall
    f1 = keras.metrics.F1Score(average = 'weighted')

    auc.update_state(y_true, probs3)
    print("3 AUC: ", auc.result().numpy())
    prec.update_state(y_true, probs3)
    print("3 prec: ", prec.result().numpy())
    rec.update_state(y_true, probs3)
    print("3 rec: ", rec.result().numpy())
    f1.update_state(y_true, probs3)
    print("3 f1: ", f1.result().numpy())
    
    # AUC (Area Under Curve): how well the model can classify into the classes (high numbers better)
    auc = keras.metrics.AUC(multi_label = True, num_labels = 731, from_logits = False)
    # Precision: how well model predicts target class (high numbers better)\
    prec = keras.metrics.Precision()
    # Recall: how many objects model can find (high numbers better)
    rec = keras.metrics.Recall()
    # F1Score: mean between precision and recall
    f1 = keras.metrics.F1Score(average = 'weighted')

    auc.update_state(y_true, probs4)
    print("4 AUC: ", auc.result().numpy())
    prec.update_state(y_true, probs4)
    print("4 prec: ", prec.result().numpy())
    rec.update_state(y_true, probs4)
    print("4 rec: ", rec.result().numpy())
    f1.update_state(y_true, probs4)
    print("4 f1: ", f1.result().numpy())
    
    
    # AUC (Area Under Curve): how well the model can classify into the classes (high numbers better)
    auc = keras.metrics.AUC(multi_label = True, num_labels = 731, from_logits = False)
    # Precision: how well model predicts target class (high numbers better)\
    prec = keras.metrics.Precision()
    # Recall: how many objects model can find (high numbers better)
    rec = keras.metrics.Recall()
    # F1Score: mean between precision and recall
    f1 = keras.metrics.F1Score(average = 'weighted')

    # print("Predicted y values: ", pred_y[0])
    print("X values for test: ", fold_test_x[0])
    yList = mlb.inverse_transform(y_true)
    print("Actual y values as list: ", yList[0])
    probsList = mlb.inverse_transform(probs)
    print("Predicted values as list: ", probsList[0])
    
    auc.update_state(y_true, probs)
    print("Mean AUC: ", auc.result().numpy())
    prec.update_state(y_true, probs)
    print("Mean prec: ", prec.result().numpy())
    rec.update_state(y_true, probs)
    print("Mean rec: ", rec.result().numpy())
    f1.update_state(y_true, probs)
    print("Mean f1: ", f1.result().numpy())
    
    # print("Accuracy score sklearn: ", accuracy_score(allergiesArray[test], probs))
    accuracy = (allergiesArray[test] == probs).all(axis=(0,1)).mean()
    # print("Accuracy score equation: ", accuracy)
    print("-------------------------")
    fold_count = fold_count + 1

Running fold #:  1
1 AUC:  0.3488372
1 prec:  0.061417546
1 rec:  0.7218702
1 f1:  0.2373232
2 AUC:  0.34890535
2 prec:  0.14540192
2 rec:  0.34516108
2 f1:  0.19692571
3 AUC:  0.34884405
3 prec:  0.07994891
3 rec:  0.7389448
3 f1:  0.2457467
4 AUC:  0.34883663
4 prec:  0.04390226
4 rec:  0.8812779
4 f1:  0.25075114
X values for test:  [1950    1    0    0    0    0    0    1    0    0    0    0    0    0
    0    0    0    3    5    2    6    4    8    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0]
Actual y values as list:  ('0', '124', '144962', '1462', '1590', '1600', '1607', '1827', '188', '5468', '5986', '9300')
Predicted values as list:  ('0', '10541', '124', '12522', '13891', '144962', '148092', '1586', '1589', '1590', '1592', '1598', '1604', '1605', '1607',

In [18]:
from sklearn.ensemble import RandomForestClassifier
forest = RandomForestClassifier()
X_train, X_test, y_train, y_test = train_test_split(input_data, allergiesArray)
forest.fit(X_train, y_train)
yPred = forest.predict(X_test)
# AUC (Area Under Curve): how well the model can classify into the classes (high numbers better)
auc = keras.metrics.AUC(multi_label = True, num_labels = 731, from_logits = False)
# Precision: how well model predicts target class (high numbers better)\
prec = keras.metrics.Precision()
# Recall: how many objects model can find (high numbers better)
rec = keras.metrics.Recall()
# F1Score: mean between precision and recall
f1 = keras.metrics.F1Score(average = 'weighted')

yPred = np.array(yPred)

yPred = (yPred > 0.425).astype(int)

# print("Predicted y values: ", pred_y[0])
print("X values for test: ", X_test[0])
yList = mlb.inverse_transform(y_test)
print("Actual y values as list: ", yList[0])
probsList = mlb.inverse_transform(yPred)
print("Predicted values as list: ", probsList[0])

auc.update_state(y_test, yPred)
print("Mean AUC: ", auc.result().numpy())
prec.update_state(y_test, yPred)
print("Mean prec: ", prec.result().numpy())
rec.update_state(y_test, yPred)
print("Mean rec: ", rec.result().numpy())
f1.update_state(y_test, yPred)
print("Mean f1: ", f1.result().numpy())

X values for test:  [2007    1    0    0    0    0    0    0    0    1    0    0    0    0
    0    0    0    5    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0]
Actual y values as list:  ('0', '9308')
Predicted values as list:  ('0', '201')
Mean AUC:  0.38168034
Mean prec:  0.39150134
Mean rec:  0.17884314
Mean f1:  0.19304252


In [23]:
# TODO: Save Keras model as separate file
model4.save("test_model.h5")

  saving_api.save_model(


In [21]:
print(tf.__version__)
print(keras.__version__)

2.15.0
2.15.0
