# Predicting Patient Diagnosis from Natural Language Symptoms
## AAI-501 Team 3 Final Project

Team 3 Members:  Tyler Foreman, Christi Moncrief, Tewfik Istanbooly, Mayank Bhatt

Date:  August 14, 2023

GitHub Repository: https://github.com/t4ai/AAI-501-Team3

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import random
import statistics
import spacy
import pickle
from pprint import pprint
import statsmodels.api as sm
from scipy.stats import uniform

# Modeling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Ridge
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, make_scorer
from scipy.stats import randint

import tensorflow_hub as hub
import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

## Exploratory Data Analysis
 
 - Load data into dataframe
 - Generate and review descriptive statistics of the dataset/variables
 - Plot visualization of data spread for each variable


In [None]:
# Load dataset from the CSV in the working directory into a dataframe.
# Later cells read its 'text' column (symptom descriptions) and its
# 'label' column (diagnoses).
symptoms_disease_df = pd.read_csv('./Symptom2Disease.csv')
# Preview the first rows as a quick check that the load worked
symptoms_disease_df.head()

In [None]:
# Bar chart showing how many records each diagnosis label has
fig, ax = plt.subplots(figsize=(10, 6))

# Count the labels once and reuse the result for both axes
diagnosis_counts = symptoms_disease_df['label'].value_counts()
categories = diagnosis_counts.index
counts = diagnosis_counts.values
plt.bar(categories, counts, width=0.5)

# Axis labels; x ticks are rotated so the diagnosis names do not overlap
plt.xlabel('Diagnosis', fontsize=14)
plt.ylabel('Count', fontsize=14)
plt.xticks(fontsize=10, rotation=80)
plt.yticks(fontsize=10)

# Title, then render the figure
plt.title('Diagnosis Distribution', fontsize=12)
plt.show()

## Data Cleanup
-  Perform routine cleanup on data:
    - remove punctuation marks
    - convert to lowercase
    - remove numbers
    - normalize whitespace (collapse repeated whitespace into single spaces)
- Lemmatize the text
    - Normalize to base words

In [None]:
# Routine cleanup of the 'text' column: punctuation, case, digits, whitespace.

# Remove punctuation marks. Apostrophes, commas and periods are not in this
# list, so they stay in the text. The lookup set is built once instead of
# once per character.
punctuation = '!"#$%&()*+-/:;<=>?@[\\]^_`{|}~'
punctuation_chars = set(punctuation)
symptoms_disease_df['text'] = symptoms_disease_df['text'].apply(
    lambda x: ''.join(ch for ch in x if ch not in punctuation_chars)
)

# convert text to lowercase
symptoms_disease_df['text'] = symptoms_disease_df['text'].str.lower()

# Remove numbers by replacing every digit with a space. regex=True must be
# explicit: pandas >= 2.0 treats the pattern as a literal string by default,
# which would search for the text "[0-9]" and leave all digits in place.
symptoms_disease_df['text'] = symptoms_disease_df['text'].str.replace(r"[0-9]", " ", regex=True)

# Normalize whitespace: collapse runs of whitespace (including the spaces
# introduced by the digit removal) to a single space and trim both ends
symptoms_disease_df['text'] = symptoms_disease_df['text'].apply(lambda x: ' '.join(x.split()))

In [None]:
# Lemmatize to normalize words - use only for Embeddings below
# Load spaCy's small English pipeline. The parser and NER components are
# disabled; lemmatization() only reads each token's lemma.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def lemmatization(symptoms):
    """Return the lemmatized form of each symptom description.

    Args:
        symptoms: Iterable of text strings.

    Returns:
        A list with one string per input text, built from the spaCy lemma
        of every token in that text, joined by single spaces.
    """
    return [
        ' '.join(token.lemma_ for token in nlp(text))
        for text in symptoms
    ]

## Data Preparation
-  Split data into train/validation/test datasets (80/10/10)
-  Create 3 datasets for experimentation:
    1.  Vectorize natural language text using TF-IDF
    2.  Create embeddings using Word2Vec (older approach)
    3.  Create embeddings using ELMo (Embeddings from Language Models)
- For each of the above, ensure no data leakage by separating train/test
    


In [None]:
# extract symptom description text into X (features); selecting a single
# column yields a pandas Series, and .copy() detaches it from the dataframe
X = symptoms_disease_df['text'].copy()
X.head(10)

# extract diagnosis into y (labels), again as an independent copy
y = symptoms_disease_df['label'].copy()


In [None]:
## TODO: Need to encode y into numeric values

In [None]:
# Split dataset into train, validate, test (80/10/10):
#   1) hold out 20% of the rows; the remaining 80% is the training set
#   2) split the held-out 20% in half into the test and validation sets
# random_state is fixed so the same split is produced on every run.
X_train, X_test_val, y_train, y_test_val = train_test_split(X, y, test_size=0.2, random_state=42)
X_test, X_val, y_test, y_val = train_test_split(X_test_val, y_test_val, test_size=0.5, random_state=42)

### Prepare TF-IDF Vectorized datasets

In [None]:
# Start with a count vectorizer to build the vocabulary - fit on train data first
count_vectorizer = CountVectorizer()

# Vectorize training data to create a bag of words - fit the vectorizer on
# the training set only to avoid data leakage
X_train_count = count_vectorizer.fit_transform(X_train)
X_train_count.shape

# Vectorize validation and test data with the vocabulary learned from the
# training set (transform only, no re-fitting)
X_val_count = count_vectorizer.transform(X_val)
X_test_count = count_vectorizer.transform(X_test)

# Fit the TF-IDF transformer on the training counts only to avoid data leakage
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_count)

# Convert training, validation and test counts to TF-IDF with the same fitted transformer
X_train_tfidf = tf_transformer.transform(X_train_count)
X_val_tfidf = tf_transformer.transform(X_val_count)
X_test_tfidf = tf_transformer.transform(X_test_count)


### Prepare Word2Vec Embeddings datasets

### Setup ELMo Embeddings

In [None]:
# Load pre trained ELMo model (version 3) from TensorFlow Hub
# NOTE(review): trainable=True is set, but no fine-tuning of the ELMo
# weights is visible in the cells below - confirm it is needed.
elmo = hub.Module("https://tfhub.dev/google/elmo/3", trainable=True)

In [None]:
# helper functions for ELMo
def elmo_embeddings(x):
  """Return one averaged ELMo vector per input text.

  Feeds ``x`` to the ELMo module, takes its "elmo" output and averages it
  over axis 1 (presumably the token axis - confirm against the TF Hub ELMo
  documentation). A new session is opened, and variables and tables are
  initialized, on every call.

  Args:
    x: Batch of text strings accepted by the module's "default" signature.

  Returns:
    The evaluated result of the averaged embeddings for the batch.
  """
  token_embeddings = elmo(x, signature="default", as_dict=True)["elmo"]
  # average of ELMo features
  pooled_embeddings = tf.reduce_mean(token_embeddings, 1)

  with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(tf.tables_initializer())
    return sess.run(pooled_embeddings)
  
def get_elmo_batches(dataset, batch_size):
  """Split ``dataset`` into consecutive slices of at most ``batch_size`` items.

  Args:
    dataset: Any sliceable sequence with a length (list, string, ...).
    batch_size: Maximum number of items per batch.

  Returns:
    A list of slices in the original order; the last one may be shorter.
  """
  batch_starts = range(0, len(dataset), batch_size)
  return [dataset[start:start + batch_size] for start in batch_starts]

def save_embedding(embedding, file_name):
  """Pickle ``embedding`` to ``file_name``, overwriting any existing file.

  Args:
    embedding: Object to serialize (here, the ELMo embedding arrays).
    file_name: Path of the pickle file to write.
  """
  # The context manager closes the handle even if pickling raises
  with open(file_name, "wb") as pickle_out:
    pickle.dump(embedding, pickle_out)

def load_saved_embedding(file_name):
  """Load and return the object pickled in ``file_name``.

  Args:
    file_name: Path of a pickle file, e.g. one written by save_embedding().

  Returns:
    The unpickled object.
  """
  # NOTE: unpickling can execute arbitrary code - only load files that this
  # project wrote itself, never files from an untrusted source.
  # The context manager closes the handle once loading finishes or fails.
  with open(file_name, "rb") as pickle_in:
    return pickle.load(pickle_in)

In [None]:
# Lemmatize a copy of each split for ELMo, leaving the original text splits
# (used for the TF-IDF features) untouched
X_train_elmo = lemmatization(X_train.copy())
X_val_elmo = lemmatization(X_val.copy())
X_test_elmo = lemmatization(X_test.copy())


In [None]:
# Chunk each split into fixed-size batches (to not overwhelm compute)
ELMO_BATCH_SIZE = 100
elmo_train_list = get_elmo_batches(X_train_elmo, ELMO_BATCH_SIZE)
elmo_val_list = get_elmo_batches(X_val_elmo, ELMO_BATCH_SIZE)
elmo_test_list = get_elmo_batches(X_test_elmo, ELMO_BATCH_SIZE)

In [None]:
# Extract ELMo embeddings batch by batch; each list holds one result per batch
elmo_train = list(map(elmo_embeddings, elmo_train_list))
elmo_val = list(map(elmo_embeddings, elmo_val_list))
elmo_test = list(map(elmo_embeddings, elmo_test_list))

In [None]:
# Stitch the per-batch results back together along the first axis so each
# split becomes a single array again
elmo_train_new = np.concatenate(elmo_train, axis=0)
elmo_val_new = np.concatenate(elmo_val, axis=0)
elmo_test_new = np.concatenate(elmo_test, axis=0)

In [None]:
# Persist each split's embeddings so the ELMo extraction does not have to be rerun
for split_embedding, pickle_name in (
    (elmo_train_new, "elmo_train_1690610969.592564106.pickle"),
    (elmo_val_new, "elmo_val_1690610969.592564106.pickle"),
    (elmo_test_new, "elmo_test_1690610969.592564106.pickle"),
):
    save_embedding(split_embedding, pickle_name)

## Model Selection
-  Identify 2 models to conduct experiments with (e.g., NBC and ---)
-  For each model:
    -  Train the model on each experimental dataset
    -  Validate against validation dataset
    -  Tune hyperparameters as necessary to optimize performance
    -  Repeat until optimized
    -  Test against test dataset
    -  Measure model performance
- Compare model performance

### Experiment 3: Train Classifier on ELMo Embeddings

In [None]:
# initialize and fit model on ELMo embeddings --- NBC may not work here
# Ridge is a regressor and needs numeric targets, but y_train still holds the
# un-encoded diagnosis labels (the encoding TODO above is not done), so
# fitting it would fail. RidgeClassifier is the classification counterpart
# and accepts the label values directly.
from sklearn.linear_model import RidgeClassifier

clf_elmo = RidgeClassifier()
clf_elmo.fit(elmo_train_new, y_train)