# BME 231 Final Project - AI Diagnosis Assistant
**Name: Yerin Kang, Hajin Ruy**

In [234]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Data Exploration

In [235]:
# Load the disease / symptom / patient-profile dataset from the project's data folder
csv_path = 'data/Disease_symptom_and_patient_profile_dataset.csv'
df = pd.read_csv(csv_path)

# Preview the first rows to sanity-check the columns
df.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive


In [236]:
# Explore the data: print the row count, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 349 entries, 0 to 348
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Disease               349 non-null    object
 1   Fever                 349 non-null    object
 2   Cough                 349 non-null    object
 3   Fatigue               349 non-null    object
 4   Difficulty Breathing  349 non-null    object
 5   Age                   349 non-null    int64 
 6   Gender                349 non-null    object
 7   Blood Pressure        349 non-null    object
 8   Cholesterol Level     349 non-null    object
 9   Outcome Variable      349 non-null    object
dtypes: int64(1), object(9)
memory usage: 27.4+ KB


In [237]:
# Quick structural overview of the raw frame.
# Per the dtypes: 1 numeric column (Age, int64) and 9 object-typed (categorical) columns.
print(f"shape: {df.shape}")
print(f"columns: {df.columns}")
print(f"dtypes:\n {df.dtypes}")

shape: (349, 10)
columns: Index(['Disease', 'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Age',
       'Gender', 'Blood Pressure', 'Cholesterol Level', 'Outcome Variable'],
      dtype='object')
dtypes:
 Disease                 object
Fever                   object
Cough                   object
Fatigue                 object
Difficulty Breathing    object
Age                      int64
Gender                  object
Blood Pressure          object
Cholesterol Level       object
Outcome Variable        object
dtype: object


## Data Cleaning

In [238]:
# Count missing entries per column (no missing values found)
df.isna().sum()

Disease                 0
Fever                   0
Cough                   0
Fatigue                 0
Difficulty Breathing    0
Age                     0
Gender                  0
Blood Pressure          0
Cholesterol Level       0
Outcome Variable        0
dtype: int64

In [239]:
# Filter positive cases: we only want positive cases for this project, as we are
# interested in predicting the disease. The outcome column is dropped once it has
# been used for filtering.
#
# The membership guard makes this cell safe to re-run: after the first run
# 'Outcome Variable' no longer exists in df, and filtering on it again would
# raise a KeyError.
if 'Outcome Variable' in df.columns:
    df = df[df['Outcome Variable'] == 'Positive'].drop(columns=['Outcome Variable'])
print("new shape after filtering positive cases:", df.shape)

new shape after filtering positive cases: (186, 9)


In [240]:
# Count rows that are exact copies of an earlier row
duplicates = df.duplicated().sum()
print("number of duplciates:", duplicates)

# Keep the first occurrence of each row; reassign instead of mutating in place
df = df.drop_duplicates()
print("new shape after dropping duplicates:", df.shape)

number of duplciates: 29
new shape after dropping duplicates: (157, 9)


In [241]:
# Exclude outliers in the only numerical feature (Age) using the 1.5 * IQR rule
Q1, Q3 = df['Age'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences beyond which an Age value is treated as an outlier
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Identify outliers: rows strictly outside the fences
too_low = df['Age'] < lower_bound
too_high = df['Age'] > upper_bound
outliers = df[too_low | too_high]
print("number of outliers", len(outliers)) # 1 outlier found

# Remove outliers: keep rows whose Age lies within the fences (inclusive)
within_fences = (df['Age'] >= lower_bound) & (df['Age'] <= upper_bound)
df_cleaned = df[within_fences]
print("new shape after removing outliers:", df_cleaned.shape)

number of outliers 1
new shape after removing outliers: (156, 9)


## Feature Selection and Splitting Data

In [None]:
# Set feature and target variables.
# Build them from df_cleaned (the frame with the Age outlier removed), not df:
# using df here would silently discard the outlier filtering done during cleaning.
X = df_cleaned.drop(columns=['Disease'])
y = df_cleaned['Disease']

# Hold out 20% of the rows as a test set; the fixed random_state keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)