In [274]:
# Builds and trains a Keras model intended to predict whether an individual patient has an allergy.
# Loading the saved model and returning a prediction is meant to happen in a separate notebook.

import os
# Select the TensorFlow backend. This is set before `import keras` below.
# NOTE(review): Keras is understood to read this variable at import time, so the
# ordering of these lines matters — confirm for the installed Keras version.
os.environ["KERAS_BACKEND"] = "tensorflow"
import tensorflow
import keras

import numpy as np
# Seed NumPy's global random number generator.
# NOTE(review): this call only seeds NumPy; whether the backend's own weight
# initialisation becomes deterministic is not established by this cell.
np.random.seed(123)

In [275]:
import matplotlib
from keras.models import * 
from keras.layers import *
from keras.optimizers import RMSprop
import pandas as pd

# Import both datasets from the same Excel workbook.
# The workbook location can be overridden with the ALLERGY_DATA_PATH environment
# variable; when it is unset, the original hard-coded local path is used, so
# existing behaviour is unchanged.
DATA_PATH = os.environ.get(
    "ALLERGY_DATA_PATH",
    r"C:\Users\me\OneDrive\Desktop\Senior Design\Allergy_SanFrancisco\PATIENTS_Nov_3_2023_V4_sfm-data.xlsx",
)
PATIENTS_SHEET = "Level2_AI_Patient Traits"
ALLERGIES_SHEET = "Level1_Patient Allergens"

# Passing a list of sheet names makes pandas open the workbook once and return
# a dict keyed by sheet name, instead of parsing the same file twice.
_sheets = pd.read_excel(DATA_PATH, sheet_name=[PATIENTS_SHEET, ALLERGIES_SHEET])
patients = _sheets[PATIENTS_SHEET]
allergies = _sheets[ALLERGIES_SHEET]

In [276]:
# Confirm that patient sheet imported correctly
# Comment out the line below before committing
# patients.head()

In [277]:
# Confirm allergy sheet imported correctly
# Comment out the line below before committing
# allergies.head()

In [278]:
# Combine patient traits with their allergen rows, matching on the shared "SFM Id" key.
# No `how` is given, so pandas' default join type applies.
patientAllergies = pd.merge(patients, allergies, on="SFM Id")
# Comment line before committing
# patientAllergies.head()

In [279]:
# Remove the columns that should not feed the model, in a single call:
#   'SFM Id' - the ID is arbitrary, so any effect on the result would be pure coincidence
#   'City'   - likely too niche to have an effect on the results
patientsNoId = patients.drop(columns=['SFM Id', 'City'])

In [280]:
# One-hot encode the Gender column.
# Surrounding whitespace is stripped first so that e.g. "F" and "F " share one category.
gender_clean = patientsNoId['Gender'].str.strip().astype('category')
gender_onehot = pd.get_dummies(
    gender_clean,
    prefix="Gender",
    prefix_sep="-",
    dtype=int,
)
# Swap the original text column for its indicator columns (aligned on the row index).
patientsNoId = patientsNoId.drop(columns='Gender').join(gender_onehot)

In [281]:
# One-hot encode the SkinTone column.
# Surrounding whitespace is stripped first so equal labels collapse into one category.
skintone_clean = patientsNoId['SkinTone'].str.strip().astype('category')
skintone_onehot = pd.get_dummies(
    skintone_clean,
    prefix="SkinTone",
    prefix_sep="-",
    dtype=int,
)
# Swap the original text column for its indicator columns (aligned on the row index).
patientsNoId = patientsNoId.drop(columns='SkinTone').join(skintone_onehot)

In [282]:
# One-hot encode the FitzpatrickSkinPhotoType column.
# Surrounding whitespace is stripped first, which also turns whitespace-only
# cells into the empty string.
fitzpatrick_clean = patientsNoId['FitzpatrickSkinPhotoType'].str.strip().astype('category')
fitzpatrick_onehot = pd.get_dummies(
    fitzpatrick_clean,
    prefix="Fitzpatrick",
    prefix_sep="-",
    dtype=int,
)
# The column contains a blank value that should not become a feature. Remove
# its indicator column ("Fitzpatrick-" = prefix + separator + empty label) by
# name rather than via drop_first=True: drop_first removes whichever category
# sorts first, which would silently discard a real skin type if the data ever
# arrives without a blank entry. errors="ignore" keeps this a no-op in that case.
fitzpatrick_onehot = fitzpatrick_onehot.drop(columns="Fitzpatrick-", errors="ignore")
# Swap the original text column for its indicator columns (aligned on the row index).
patientsNoId = patientsNoId.drop(columns='FitzpatrickSkinPhotoType').join(fitzpatrick_onehot)

In [283]:
# TODO: Figure out preprocessing for State and Country columns
# For now both columns are only cast to the pandas categorical dtype; they are
# not yet turned into numeric features.
patientsNoId = patientsNoId.astype({'State': 'category', 'Country': 'category'})

In [284]:
# Encode the SkinConditions column as equal-length integer sequences, on the idea that
# token ids should be more recognizable to keras than regular text
## Delete if this idea does not work
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Put a space between the elements of every entry.
# NOTE(review): if the cells are plain strings rather than lists, this separates
# individual characters, so the tokenizer below sees one "word" per character —
# confirm that this is the intended granularity.
spaced_conditions = patientsNoId['SkinConditions'].map(' '.join)

# Learn the vocabulary, then translate every entry into a list of token ids
tokenizer = Tokenizer()
tokenizer.fit_on_texts(spaced_conditions)
condition_sequences = tokenizer.texts_to_sequences(spaced_conditions)

# Pad every sequence at the end so all rows reach the length of the longest one
max_length = max(map(len, condition_sequences))
testVar = pad_sequences(condition_sequences, maxlen=max_length, padding="post")

# Store the padded sequences back in the frame, one Python list per row
patientsNoId['SkinConditions'] = testVar.tolist()

In [287]:
# Remove the ID column before preprocessing - the ID should have no effect on prediction
# TODO: complete preprocessing for allergies columns to have valid output
allergiesNoId = allergies.drop(columns='SFM Id')

In [288]:
# TODO: create a training/testing split for the data

In [289]:
# TODO: Refine layers/sequences to actually make the model once we get it loading in correctly
# Total number of outputs (1 for each ingredient)
n_classes = 164687

# Fully connected network declared as a single layer list.
model = Sequential([
    # Input layer: expects 8 features per sample
    Dense(128, input_shape=(8,), activation='relu'),
    # Hidden layers
    Dense(2048, activation="relu"),
    Dense(256, activation="relu"),
    # Output layer: one unit per class.
    # NOTE(review): softmax with sparse categorical cross-entropy models exactly one
    # label per sample; if a patient can have several allergens at once, confirm this
    # matches the format of the targets.
    Dense(n_classes, activation='softmax'),
])

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# TODO: Change hyperparameters of the model to train/test
# The import is kept exactly as in the original cell; nothing in this cell uses it.
import torch

# Train on the preprocessed patient features against the allergy targets.
# The returned History object is kept in `history`.
# The earlier `torch.tensor(patientsNoId.values)` conversion was dropped: its result
# was never used, and the frame holds categorical and list-valued columns, so the
# conversion fails before training can start.
history = model.fit(
    patientsNoId,
    allergiesNoId,
    batch_size=128,
    epochs=20,
    verbose=1,
)

In [None]:
# Compute the loss and the compiled metrics of the model on x_test / y_test.
# NOTE(review): x_test and y_test are not defined in the cells visible here
# (the train/test split is still a TODO) — confirm they exist before running.
score = model.evaluate(x=x_test, y=y_test, verbose=0)
score  # First number is loss, second is accuracy

In [None]:
# TODO: Save Keras model as separate file