# Predicting Patient Diagnosis from Natural Language Symptoms
## AAI-501 Team 3 Final Project

Team 3 Members:  Tyler Foreman, Christi Moncrief, Tewfik Istanbooly, Mayank Bhatt

Date:  August 14, 2023

GitHub Repository: https://github.com/t4ai/AAI-501-Team3

In [None]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import random
import statistics
from pprint import pprint
import statsmodels.api as sm
from scipy.stats import uniform

# Modeling
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, recall_score, ConfusionMatrixDisplay, make_scorer
from scipy.stats import randint

## Exploratory Data Analysis
 
 - Load data into dataframe
 - Generate and review descriptive statistics of the dataset/variables
 - Plot visualization of data spread for each variable


In [None]:
# Read the Symptom2Disease CSV from the working directory into a dataframe
symptoms_disease_df = pd.read_csv(filepath_or_buffer='./Symptom2Disease.csv')

# Preview the first five rows (shown as the cell output)
symptoms_disease_df.head(n=5)

In [None]:
# Count how many samples carry each diagnosis label
diagnosis_counts = symptoms_disease_df['label'].value_counts()
categories = diagnosis_counts.index
counts = diagnosis_counts.values

# Draw one bar per diagnosis on a fresh 10x6 figure
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(categories, counts, width=0.5)

# Axis labels and tick formatting (x tick labels rotated so long names stay readable)
ax.set_xlabel('Diagnosis', fontsize=14)
ax.set_ylabel('Count', fontsize=14)
plt.xticks(fontsize=10, rotation=80)
plt.yticks(fontsize=10)

# Title, then render the figure
ax.set_title('Diagnosis Distribution', fontsize=12)
plt.show()

## Data Preparation
-  Split data into test/train/validate datasets (80/10/10)
-  Vectorize natrual language text
    -  Create experimental datasets with different vectorization approaches - Bag of Words, TFIDF


In [None]:
# Features (X): independent copy of the free-text symptom descriptions
X = symptoms_disease_df.loc[:, 'text'].copy()
X.head(10)

# Labels (y): independent copy of the diagnosis column
y = symptoms_disease_df.loc[:, 'label'].copy()


In [None]:
# Stage 1: keep 80% of the samples for training and hold out the remaining 20%
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Stage 2: cut the hold-out portion in half -> 10% test and 10% validation overall
# (fixed random_state keeps both splits reproducible)
X_test, X_val, y_test, y_val = train_test_split(
    X_test_val,
    y_test_val,
    test_size=0.5,
    random_state=42,
)

### Setup Bag of Words Vectorizor


In [None]:
# Learn the bag-of-words vocabulary from the training text only, so that no
# information from the validation/test splits leaks into the features
count_vectorizor = CountVectorizer().fit(X_train)

# Encode the training text as a document-term count matrix
X_train_count = count_vectorizor.transform(X_train)
X_train_count.shape

# Encode the held-out splits with the vocabulary learned from training data
X_test_count = count_vectorizor.transform(X_test)
X_val_count = count_vectorizor.transform(X_val)


### Setup TF-IDF Vectorizor

In [None]:
# Fit the TF-IDF weighting (IDF statistics) on the training counts only to
# avoid leaking information from the validation/test splits
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_count)

# Re-weight each split from its OWN count matrix using the training-fitted
# transformer. Validation and test must be derived from X_val_count and
# X_test_count; transforming X_train_count for all three would make the
# validation/test features copies of the training features, misaligned with
# y_val / y_test.
X_train_tfidf = tf_transformer.transform(X_train_count)
X_val_tfidf = tf_transformer.transform(X_val_count)
X_test_tfidf = tf_transformer.transform(X_test_count)

## Model Selection
-  Identify 2 models to conduct experiements with (ie: NBC and ---)
-  For each model:
    -  Train the model on each experimental dataset
    -  Validate against validation dataset
    -  Tune hyperparameters as necessary to optimize performance
    -  Repeat until optimized
    -  Test against test dataset
    -  Measure model performance
- Compare model performance