In [1]:
# Builds the model used to predict a patient's allergies.
# A different notebook loads the saved model and returns a prediction.

import os
os.environ["KERAS_BACKEND"] = "tensorflow"  # set before keras is imported below
import tensorflow
import keras

import numpy as np

# Seed NumPy *and* the Keras backend: seeding NumPy alone leaves the network's
# weight initialisation and batch shuffling unseeded, so runs would not repeat.
SEED = 123
np.random.seed(SEED)
keras.utils.set_random_seed(SEED)




In [2]:
import matplotlib
from keras.models import * 
from keras.layers import *
from keras.optimizers import RMSprop
import pandas as pd

# Import both datasets from the patient workbook.
# The path defaults to the original author's location; set the ALLERGY_DATA_PATH
# environment variable to point at a local copy instead of editing this cell.
DATA_PATH = os.environ.get(
    "ALLERGY_DATA_PATH",
    r"C:\Users\me\OneDrive\Desktop\Senior Design\Allergy_SanFrancisco\PATIENTS_Nov_3_2023_V4_sfm-data.xlsx",
)
PATIENT_SHEET = "Level2_AI_Patient Traits"
ALLERGY_SHEET = "Level1_Patient Allergens"

# A list of sheet names makes read_excel return a dict of DataFrames keyed by
# sheet name, so the workbook is read with a single call instead of one per sheet.
sheets = pd.read_excel(DATA_PATH, sheet_name=[PATIENT_SHEET, ALLERGY_SHEET])
patients = sheets[PATIENT_SHEET]

allergies = sheets[ALLERGY_SHEET]

In [3]:
# Confirm that patient sheet imported correctly
# Comment out the line below before committing
# patients['SkinConditions'].head()

In [4]:
# Confirm allergy sheet imported correctly
# Comment out the line below before committing
# allergies.head()

In [5]:
# Join patient traits with their allergen rows on the shared "SFM Id" key (if needed)
patientAllergies = pd.merge(patients, allergies, on="SFM Id")
# Comment out the line below before committing
# patientAllergies.head()

In [6]:
# Remove the identifier and the location fields from the patient frame
patientsTrimmed = patients.drop(columns=['SFM Id', 'City', 'State', 'Country'])

In [7]:
# One-hot encode the Gender column
gender_values = patientsTrimmed['Gender'].str.strip()  # trim surrounding whitespace first
patientsTrimmed['Gender'] = pd.Categorical(gender_values)
gender_onehot = pd.get_dummies(
    patientsTrimmed['Gender'],
    prefix="Gender",
    prefix_sep="-",
    dtype=int,
)
print(patientsTrimmed.head())
# Swap the original column for its indicator columns
patientsTrimmed = patientsTrimmed.drop(columns='Gender').join(gender_onehot)
print(patientsTrimmed.head())

  Gender  BirthYear FitzpatrickSkinPhotoType SkinTone  \
0      M       2000                              dark   
1      F       1946                            medium   
2      F       1950                              fair   
3      M       1953                              fair   
4      F       1985                              fair   

                                      SkinConditions  
0                                               none  
1   sensitive-skin-allergist-diagnosed,sensitive-...  
2   sensitive-skin-allergist-diagnosed, eczema-at...  
3   sensitive-skin-allergist-diagnosed,skin-aller...  
4   sensitive-skin-allergist-diagnosed, sensitive...  
   BirthYear FitzpatrickSkinPhotoType SkinTone  \
0       2000                              dark   
1       1946                            medium   
2       1950                              fair   
3       1953                              fair   
4       1985                              fair   

                          

In [8]:
# One-hot encode the SkinTone column
skintone_values = patientsTrimmed['SkinTone'].str.strip()  # trim surrounding whitespace first
patientsTrimmed['SkinTone'] = pd.Categorical(skintone_values)
skintone_onehot = pd.get_dummies(
    patientsTrimmed['SkinTone'],
    prefix="SkinTone",
    prefix_sep="-",
    dtype=int,
)
print(patientsTrimmed.head())
# Swap the original column for its indicator columns
patientsTrimmed = patientsTrimmed.drop(columns='SkinTone').join(skintone_onehot)
print(patientsTrimmed.head())

   BirthYear FitzpatrickSkinPhotoType SkinTone  \
0       2000                              dark   
1       1946                            medium   
2       1950                              fair   
3       1953                              fair   
4       1985                              fair   

                                      SkinConditions  Gender-F  Gender-M  \
0                                               none         0         1   
1   sensitive-skin-allergist-diagnosed,sensitive-...         1         0   
2   sensitive-skin-allergist-diagnosed, eczema-at...         1         0   
3   sensitive-skin-allergist-diagnosed,skin-aller...         0         1   
4   sensitive-skin-allergist-diagnosed, sensitive...         1         0   

   Gender-Other  Gender-Undisclosed  
0             0                   0  
1             0                   0  
2             0                   0  
3             0                   0  
4             0                   0  
   BirthYear F

In [9]:
# One-hot encode the FitzpatrickSkinPhotoType column
fitzpatrick_values = patientsTrimmed['FitzpatrickSkinPhotoType'].str.strip()
patientsTrimmed['FitzpatrickSkinPhotoType'] = pd.Categorical(fitzpatrick_values)
# drop_first=True discards the first category's indicator column; the intent is to
# remove the blank entry, which relies on the blank category being ordered first.
fitzpatrick_onehot = pd.get_dummies(
    patientsTrimmed['FitzpatrickSkinPhotoType'],
    prefix="Fitzpatrick",
    prefix_sep="-",
    drop_first=True,
    dtype=int,
)
print(patientsTrimmed.head())
# Swap the original column for its indicator columns
patientsTrimmed = patientsTrimmed.drop(columns='FitzpatrickSkinPhotoType').join(fitzpatrick_onehot)
print(patientsTrimmed.head())

   BirthYear FitzpatrickSkinPhotoType  \
0       2000                            
1       1946                            
2       1950                            
3       1953                            
4       1985                            

                                      SkinConditions  Gender-F  Gender-M  \
0                                               none         0         1   
1   sensitive-skin-allergist-diagnosed,sensitive-...         1         0   
2   sensitive-skin-allergist-diagnosed, eczema-at...         1         0   
3   sensitive-skin-allergist-diagnosed,skin-aller...         0         1   
4   sensitive-skin-allergist-diagnosed, sensitive...         1         0   

   Gender-Other  Gender-Undisclosed  SkinTone-brown  SkinTone-dark  \
0             0                   0               0              1   
1             0                   0               0              0   
2             0                   0               0              0   
3             0 

In [10]:
# TextVectorization is used here because Tokenizer is deprecated
from keras.layers import TextVectorization
# Turn the comma-separated condition list into whitespace-separated text
skin_condition_text = patientsTrimmed['SkinConditions'].str.replace(',', ' ')
patientsTrimmed['SkinConditions'] = skin_condition_text
# Sequence length = the largest number of space characters found in any entry.
# NOTE(review): this counts spaces rather than tokens, so leading or doubled spaces
# inflate it; the model's sequence input size downstream is tied to this value.
max_len = patientsTrimmed['SkinConditions'].str.count(' ').max()
# Vectorizer that splits on whitespace and pads/truncates every row to max_len
vectorizer = TextVectorization(split='whitespace', output_sequence_length=max_len)




In [11]:
# Fit the vectorizer on the skin-condition text
print(patientsTrimmed['SkinConditions'].head())
skin_condition_column = patientsTrimmed['SkinConditions']
vectorizer.adapt(skin_condition_column.values)
print(patientsTrimmed['SkinConditions'].head())
# Convert the text to integer sequences held in a separate array, then remove the
# text column from the frame
skinConditions = vectorizer(skin_condition_column).numpy()
patientsTrimmed = patientsTrimmed.drop(columns='SkinConditions')

0                                                 none
1     sensitive-skin-allergist-diagnosed sensitive-...
2     sensitive-skin-allergist-diagnosed  eczema-at...
3     sensitive-skin-allergist-diagnosed skin-aller...
4     sensitive-skin-allergist-diagnosed  sensitive...
Name: SkinConditions, dtype: object

0                                                 none
1     sensitive-skin-allergist-diagnosed sensitive-...
2     sensitive-skin-allergist-diagnosed  eczema-at...
3     sensitive-skin-allergist-diagnosed skin-aller...
4     sensitive-skin-allergist-diagnosed  sensitive...
Name: SkinConditions, dtype: object


In [12]:
# Feature matrix: the encoded patient columns followed by the vectorised skin conditions
patientsArr = patientsTrimmed.to_numpy()
input_data = np.hstack((patientsArr, skinConditions))

In [13]:
# Preprocess the allergy sheet; the ID is dropped because it should have no effect
# on the prediction.
allergiesNoId = (
    allergies
    .drop(columns='SFM Id')
    # Strip every run of non-digit characters, then turn emptied cells into NaN
    .replace(r'\D+', '', regex=True)
    .replace('', np.nan)
    # NaN cells become 0
    .fillna(0)
    # Cast the whole frame to integer
    .astype(int)
)

In [14]:
# Collapse each row into one space-separated string of its values, then split it
# back into a list so every patient ends up with a list of value strings.
joined_rows = allergiesNoId.astype(str).apply(' '.join, axis=1)
allergiesNoId['AllergiesList'] = joined_rows.str.split()

In [15]:
from collections import Counter

# Parse each patient's list of value strings into integers
allergiesArr = [
    [int(num) for num in codes]
    for codes in np.array(allergiesNoId['AllergiesList'])
]

# Flatten the 2D list so every value can be counted
flattened_array = [code for codes in allergiesArr for code in codes]

# Frequency of each value across all patients
value_counts = Counter(flattened_array)

# The 11 most frequent values (the 0 fill value is counted like any other)
top_11_values = [value for value, _ in value_counts.most_common(11)]
print(top_11_values)

# Zero out every value that is not among the top 11, then keep only the first
# 11 entries of each row
frequent_codes = set(top_11_values)
allergiesArr = [
    [code if code in frequent_codes else 0 for code in codes][:11]
    for codes in allergiesArr
]


[0, 1605, 9314, 7086, 201, 3793, 9308, 2023, 61707, 28559, 44465]


In [16]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the set of labels present in the rows ...
mlb = MultiLabelBinarizer()
mlb.fit(allergiesArr)

# ... then multi-hot encode every row against that label set
allergiesArray = mlb.transform(allergiesArr)

In [17]:
# Decode the multi-hot matrix back into per-row label tuples.
# NOTE(review): allergiesNew is not referenced again in the visible notebook.
allergiesNew = mlb.inverse_transform(allergiesArray)

In [18]:
# Wrap the multi-hot encoding in a DataFrame whose columns are the binarizer's classes
multi_hot = mlb.transform(allergiesArr)
allergiesDF = pd.DataFrame(multi_hot, columns=mlb.classes_)
allergiesDF.dtypes

0        int32
201      int32
1605     int32
2023     int32
3793     int32
7086     int32
9308     int32
9314     int32
28559    int32
44465    int32
61707    int32
dtype: object

In [19]:
from sklearn.model_selection import KFold
from sklearn.metrics import multilabel_confusion_matrix, precision_score, recall_score, f1_score
from keras.layers import (Dense, Dropout, BatchNormalization, Embedding, Attention, Input,
                          GlobalMaxPooling1D, Concatenate)
import keras
import numpy as np
from keras import regularizers
from keras.callbacks import EarlyStopping

# 5-fold cross-validation of a two-input multi-label classifier:
#   * input 1: the fixed-order patient columns (patientsArr)
#   * input 2: the vectorised skin-condition sequence (skinConditions)
#   * target : the multi-hot allergy matrix (allergiesArray)
# A new model is built for every fold; after the loop `model` is the last fold's model.

# Define hyperparameters
learning_rate = 0.01
epochs = 20
batch_size = 32
dropout_rate = 0.3  # NOTE(review): not referenced anywhere in this cell

# Layer sizes are read from the arrays so they cannot drift from the preprocessing
n_fixed_features = patientsArr.shape[1]
n_sequence_steps = skinConditions.shape[1]
n_classes = allergiesArray.shape[1]

# NOTE(review): hard-coded; must be at least the vectorizer's vocabulary size — confirm
embedding_vocab_size = 282
embedding_dim = 64

# Number of test rows echoed per fold for eyeballing predictions
n_preview_rows = 5

cv = KFold(n_splits=5, shuffle=True, random_state=345)

early_stopping = EarlyStopping(monitor='val_loss', patience = 5, verbose=1)

fold_count = 1
# Train the model for each split.
# KFold only uses the number of samples, so only the feature matrix is passed
# (the labels are not a `groups` argument).
for train, test in cv.split(patientsArr):

    # Define input layers
    fixed_order_input = Input(shape=(n_fixed_features,))
    variable_order_input = Input(shape=(n_sequence_steps,))

    # Embed the token sequence and max-pool over the sequence axis
    embedded_variable_order = Embedding(input_dim=embedding_vocab_size, output_dim=embedding_dim)(variable_order_input)
    pooled_variable_order = GlobalMaxPooling1D()(embedded_variable_order)

    # Dense layer for the fixed-order part
    fixed_order_output = Dense(64, activation='sigmoid')(fixed_order_input)

    # Concatenate both branches
    concatenated = Concatenate()([fixed_order_output, pooled_variable_order])

    # Additional dense layer
    dense_layer = Dense(128, activation='sigmoid')(concatenated)

    # Output layer: one sigmoid unit per label (multi-label output)
    predictions = Dense(n_classes, activation='sigmoid')(dense_layer)

    # Compile the model; the loss is class-balanced binary focal cross-entropy
    model = keras.Model(inputs=[fixed_order_input, variable_order_input], outputs=predictions)
    model.compile(loss=keras.losses.BinaryFocalCrossentropy(apply_class_balancing=True, alpha = 0.4, gamma = 1),
                  optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
                  metrics=[keras.metrics.Precision(), keras.metrics.Recall()])

    print("Running fold #: ", fold_count)

    # Train the model on this fold's training rows
    history = model.fit(
        [patientsArr[train], skinConditions[train]], allergiesArray[train],
        epochs=epochs,
        batch_size=batch_size,
        verbose=1,
        validation_split=0.2,
        callbacks=[early_stopping]
    )

    # Evaluate the model on the held-out rows
    probs = model.predict([patientsArr[test], skinConditions[test]], verbose=0)
    y_true = allergiesArray[test]

    # Dynamic threshold optimization based on the samples-averaged F1 score.
    # best_f1 starts below any possible score so a threshold is always selected.
    # NOTE(review): the threshold is tuned on the same fold it is evaluated on,
    # so the reported metrics are optimistic.
    best_f1 = -1.0
    optimal_threshold = None
    for t in np.arange(0.1, 1, 0.1):
        y_pred = (probs > t).astype(int)
        f1 = f1_score(y_true, y_pred, average='samples', zero_division=1)
        if f1 > best_f1:
            best_f1 = f1
            optimal_threshold = t

    print("F1 score:", best_f1)
    print("Optimal threshold:", optimal_threshold)

    # Echo a few test rows; guard against folds smaller than the preview size
    test_inputs = input_data[test]
    n_preview = min(n_preview_rows, len(test))
    for i in range(n_preview):
        print(test_inputs[i])

    # Apply the threshold selected above (previously a fixed 0.2 was used here,
    # which made precision/recall inconsistent with the reported F1 score)
    y_pred = (probs > optimal_threshold).astype(int)
    prod_convert = mlb.inverse_transform(y_pred)
    print("Test prod")
    for i in range(n_preview):
        print(prod_convert[i])
    allConvert = mlb.inverse_transform(y_true)
    print("Actual Allergies")
    for i in range(n_preview):
        print(allConvert[i])

    # Compute samples-averaged precision and recall
    precision = precision_score(y_true, y_pred, average='samples', zero_division=1)
    recall = recall_score(y_true, y_pred, average='samples', zero_division=1)

    print("Precision:", precision)
    print("Recall:", recall)

    # Compute multilabel confusion matrix (one 2x2 matrix per label)
    matrix = multilabel_confusion_matrix(y_true, y_pred)
    print(matrix)

    fold_count += 1


Running fold #:  1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 10: early stopping
F1 score: 0.6467877184543851
Optimal threshold: 0.9
[1950    1    0    0    0    0    0    1    0    0    0    0    0    0
    0    0    0    3    5    2    6    4    8    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[1977    1    0    0    0    0    0    0    0    0    1    0    0    0
    0    0    0    3    9    5    2    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[1982    1    0    0    0    0    0    1    0    0    0    0    0    0
    0    0    0    3    5    2    6    4    8    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[1985    1    0    0    0    0    0    0    1    0    0    0    0    0
    0    0    0    3    2    4    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    

Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 14: early stopping
F1 score: 0.6520574795574796
Optimal threshold: 0.8
[1953    0    1    0    0    0    0    1    0    0    0    0    0    0
    0    0    0    3    4    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[1984    1    0    0    0    0    0    0    0    0    1    0    0    0
    0    0    0    3    7    5    2    6    4    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[1981    1    0    0    0    0    0    0    1    0    0    0    0    0
    0    0    0    3    9    2   11   12    6    8    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[1946    1    0    0    0    0    0    1    0    0    0    0    0    0
    0    0    0    3    2   13    8    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[1979    1    0    0    0    0    0 

Epoch 10/20
Epoch 10: early stopping
F1 score: 0.6488305782141052
Optimal threshold: 0.8
[1985    1    0    0    0    0    0    1    0    0    0    0    0    0
    0    0    0    3    7    5    2    4    8    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[1948    1    0    0    0    0    0    1    0    0    0    0    0    0
    0    0    0    3    7    5    2    6    4    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[1958    1    0    0    0    0    0    1    0    0    0    0    0    0
    0    0    0    3   10    2   12    4    8    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[1995    1    0    0    0    0    0    0    1    0    0    0    0    0
    0    0    0    3    7    9    5    2    6    4    8    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]
[1944    1    0    0    0    0    0    1    0    0    0    0    0    0
    0    0    0    5    

In [20]:
# Persist the artifacts for later use.
# `model` here is whatever the training loop assigned last, i.e. the model from
# the final cross-validation fold.
model.save("model.h5")
# Store the vectorizer's vocabulary alongside the model
vocab = vectorizer.get_vocabulary()
np.save("vocab.npy", vocab)

  saving_api.save_model(


In [21]:
# Show the raw (unstripped) category values of each encoded column
for column_name in ('FitzpatrickSkinPhotoType', 'Gender', 'SkinTone'):
    print(patients[column_name].unique())

[' ' ' darker-white' ' white-fair' ' light-pale-white' ' brown'
 ' dark-brown-black' ' light-brown']
[' M' ' F' ' Undisclosed' ' Other']
[' dark' ' medium' ' fair' ' olive' ' light' ' brown']


In [22]:
# Show the first row of the combined feature matrix as a sanity check
print(input_data[0])

[2000    0    1    0    0    0    1    0    0    0    0    0    0    0
    0    0    0   14    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0]


In [23]:
import joblib
# Persist the fitted MultiLabelBinarizer to mlb.pkl
joblib.dump(mlb, "mlb.pkl")

['mlb.pkl']

In [24]:
# Reload the binarizer that was just written and display its classes to confirm
# the round trip. Only load pickle files from a trusted source.
mlb_new = joblib.load("mlb.pkl")
mlb_new.classes_

array([    0,   201,  1605,  2023,  3793,  7086,  9308,  9314, 28559,
       44465, 61707])