In [4]:
# Code to create a model used to predict whether the individual patient has an allergy
# Use different notebook to load the model and return a prediction

import os
os.environ["KERAS_BACKEND"] = "tensorflow"
import tensorflow
import keras

import numpy as np
np.random.seed(123)

In [5]:
import matplotlib
from keras.models import * 
from keras.layers import *
from keras.optimizers import RMSprop
import pandas as pd

# Import both datasets, change to local path when running
patients = pd.read_excel(r"C:\Users\me\OneDrive\Desktop\Senior Design\Allergy_SanFrancisco\PATIENTS_Nov_3_2023_V4_sfm-data.xlsx", sheet_name="Level2_AI_Patient Traits")

allergies = pd.read_excel(r"C:\Users\me\OneDrive\Desktop\Senior Design\Allergy_SanFrancisco\PATIENTS_Nov_3_2023_V4_sfm-data.xlsx", sheet_name="Level1_Patient Allergens")

In [6]:
# Confirm that patient sheet imported correctly
# Comment below line before committing
# patients['SkinConditions'].head()

In [7]:
# Confirm allergy sheet imported correctly
# Comment line before committing
# allergies.head()

In [8]:
# Merge columns by ID if needed
patientAllergies = patients.merge(allergies, on = "SFM Id")
# Comment line before committing
# patientAllergies.head()

In [9]:
# Drop ID and location columns from dataframe
patientsTrimmed = patients.drop(['SFM Id', 'City', 'State', 'Country'], axis = 1)

In [10]:
# One-hot encode Gender column
patientsTrimmed['Gender'] = pd.Categorical(patientsTrimmed['Gender'].str.strip())
gender_onehot = pd.get_dummies(patientsTrimmed['Gender'], prefix = "Gender",
                                    prefix_sep = "-", dtype = int)
patientsTrimmed = patientsTrimmed.drop('Gender', axis = 1)
patientsTrimmed = patientsTrimmed.join(gender_onehot)

In [11]:
# One-hot encode SkinTone column
patientsTrimmed['SkinTone'] = pd.Categorical(patientsTrimmed['SkinTone'].str.strip())
skintone_onehot = pd.get_dummies(patientsTrimmed['SkinTone'], prefix = "SkinTone",
                                    prefix_sep = "-", dtype = int)
patientsTrimmed = patientsTrimmed.drop('SkinTone', axis = 1)
patientsTrimmed = patientsTrimmed.join(skintone_onehot)

In [12]:
# One-hot encode FitzPatrickSkinPhotoType column
patientsTrimmed['FitzpatrickSkinPhotoType'] = pd.Categorical(patientsTrimmed['FitzpatrickSkinPhotoType'].str.strip())
# Dropping first here since it is a blank variable in the column
fitzpatrick_onehot = pd.get_dummies(patientsTrimmed['FitzpatrickSkinPhotoType'], prefix = "Fitzpatrick",
                                    prefix_sep = "-", drop_first = True, dtype = int)
patientsTrimmed = patientsTrimmed.drop('FitzpatrickSkinPhotoType', axis = 1)
patientsTrimmed = patientsTrimmed.join(fitzpatrick_onehot)

In [13]:
# Switching to TextVectorization (Tokenizer is deprecated)
from keras.layers import TextVectorization
# Replace commas with whitespace
patientsTrimmed['SkinConditions'] = patientsTrimmed['SkinConditions'].str.replace(',', ' ')
# Set the max length based on whitespace characters
max_len = patientsTrimmed['SkinConditions'].str.count(' ').max()
# Create TextVectorization object, separating on whitespace and using the max_len from earlier
vectorizer = TextVectorization(split = 'whitespace', output_sequence_length = max_len)




In [14]:
# Adapt using the column we want to convert
vectorizer.adapt(patientsTrimmed['SkinConditions'].values)
# Reset the column after converting values to vector and placing in array
skinConditions = vectorizer(patientsTrimmed['SkinConditions']).numpy()
patientsTrimmed = patientsTrimmed.drop('SkinConditions', axis = 1)




In [15]:
patientsArr = patientsTrimmed.values
input_data = np.concatenate((patientsArr, skinConditions), axis = 1)

In [16]:
# Drop ID column for preprocessing - ID should have no effect on prediction
allergiesNoId = allergies.drop('SFM Id', axis = 1)
# Remove all non-digit characters, then replace empty cells with NaN
allergiesNoId = allergiesNoId.replace(r'\D+', '', regex = True).replace('', np.nan)
# Set all NaN cells to 0
allergiesNoId = allergiesNoId.fillna(0)
# Convert entire dataframe to integer
allergiesNoId = allergiesNoId.astype(int)

In [17]:
allergiesNoId['AllergiesList'] = allergiesNoId.astype(str).apply(' '.join, axis=1)
allergiesNoId['AllergiesList'] = allergiesNoId['AllergiesList'].str.split()

In [18]:
from sklearn.preprocessing import MultiLabelBinarizer
# Create MultiLabelBinarizer object
mlb = MultiLabelBinarizer()

allergiesArr = np.array(allergiesNoId['AllergiesList'])
# Multi-hot encode data
allergiesArray = mlb.fit_transform(allergiesArr)

In [19]:
allergiesNew = mlb.inverse_transform(allergiesArray)

In [20]:
allergiesDF = pd.DataFrame(mlb.transform(allergiesArr), columns = mlb.classes_)
allergiesDF.dtypes

0         int32
100612    int32
100613    int32
100702    int32
100857    int32
          ...  
9804      int32
98288     int32
99100     int32
9926      int32
99356     int32
Length: 731, dtype: object

In [21]:
from sklearn.model_selection import KFold
from sklearn.metrics import multilabel_confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score
from keras.layers import Dense
import keras
import numpy as np
    
cv = KFold(n_splits=5, shuffle=True, random_state=7869)

fold_count = 1
# Train the model for each split
# Define the model inside the for loop
for train, test in cv.split(input_data, allergiesArray):

    n_classes = 731

    # Input layer
    input_shape = (40,)
    inputs = keras.Input(input_shape)

    # Hidden layers
    x = Dense(256, activation='sigmoid')(inputs)
    x = Dense(512, activation='sigmoid')(x)

    # Output layer - use multilabel classification
    predictions = Dense(n_classes, activation='sigmoid')(x)

    model = keras.Model(inputs=inputs, outputs=predictions)
    model.compile(loss=keras.losses.BinaryFocalCrossentropy(apply_class_balancing=True, alpha=0.35, gamma=14),
                  optimizer=keras.optimizers.Adam(learning_rate=0.1),
                  metrics=[keras.metrics.Precision(), keras.metrics.Recall()])

    print("Running fold #: ", fold_count)

    fold_train_x = input_data[train]

    history = model.fit(
        fold_train_x, allergiesArray[train],
        epochs=20,
        verbose=1,
        validation_split=0.2
    )

    fold_test_x = input_data[test]
    y_true = allergiesArray[test]

    # Predict probabilities
    probs = model.predict(fold_test_x, verbose=1)

    # Dynamic threshold optimization based on F1 score
    best_f1 = 0
    optimal_threshold = 0
    for t in np.arange(0.1, 1, 0.1):
        y_pred = (probs > t).astype(int)
        f1 = f1_score(y_true, y_pred, average='weighted', zero_division=1)
        if f1 > best_f1:
            best_f1 = f1
            optimal_threshold = t

    print("F1 score:", best_f1)
    print("Optimal threshold:", optimal_threshold)

    # Apply threshold
    y_pred = (probs > optimal_threshold).astype(int)

    # Compute precision and recall
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=1)

    print("Precision:", precision)
    print("Recall:", recall)

    # Calculate the AUC
    auc = keras.metrics.AUC(multi_label=True, num_labels=731, from_logits=False)
    auc.update_state(y_true, probs)
    print("AUC:", auc.result().numpy())
    
    # Compute multilabel confusion matrix
    matrix = multilabel_confusion_matrix(y_true, y_pred)
    print(matrix)


    fold_count += 1

Running fold #:  1
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
F1 score: 0.25243675085463935
Optimal threshold: 0.1
Precision: 0.22149592368694615
Recall: 0.9802574534782899
AUC: 0.3488372
[[[   0    0]
  [   0 1980]]

 [[   0 1980]
  [   0    0]]

 [[1980    0]
  [   0    0]]

 ...

 [[   0 1978]
  [   0    2]]

 [[1979    0]
  [   1    0]]

 [[1980    0]
  [   0    0]]]
Running fold #:  2
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
F1 score: 0.24996813532770537
Optimal threshold: 0.1
Precision: 0.22158639971139973
Recall: 0.9791219119226638
AUC: 0.3508892
[[[   0    0]
  [   0 1980]]

 [[1978    0]
  [   2    0]]

 [

Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
F1 score: 0.2551875100690555
Optimal threshold: 0.1
Precision: 0.22308004342426233
Recall: 0.9813972435331658
AUC: 0.3248974
[[[   0    0]
  [   0 1980]]

 [[   0 1980]
  [   0    0]]

 [[   0 1978]
  [   0    2]]

 ...

 [[   0 1979]
  [   0    1]]

 [[1978    0]
  [   2    0]]

 [[   0 1980]
  [   0    0]]]
Running fold #:  4
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20


Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
F1 score: 0.25074299643038894
Optimal threshold: 0.1
Precision: 0.23237252076910592
Recall: 0.970342522974102
AUC: 0.34336525
[[[   0    0]
  [   0 1979]]

 [[   0 1979]
  [   0    0]]

 [[   0 1977]
  [   0    2]]

 ...

 [[   0 1976]
  [   0    3]]

 [[   0 1979]
  [   0    0]]

 [[   0 1979]
  [   0    0]]]
Running fold #:  5
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
F1 score: 0.24946138127407438
Optimal threshold: 0.1
Precision: 0.22033214829544764
Recall: 0.9799919159256265
AUC: 0.35430917
[[[   0    0]
  [   0 1979]]

 [[1978    0]
  [   1    0]]

 [[   0 1977]
  [   0    2]]

 ...

 [[   0 1979]
  [   0    0]]

 [[   0 1979]
  [   0    0]]

 [[   0 1979]
  [   0    0]]]


In [23]:
# TODO: Save Keras model as separate file
model.save("model.h5")
vocab = vectorizer.get_vocabulary()
np.save("vocab.npy", vocab)

  saving_api.save_model(
