In [467]:
import pandas as pd
import numpy as np
import os

symptom_precaution = {row[0].strip(): row[1:] for row in pd.read_csv(os.path.join("datasets", "disease symptom prediction", "symptom_precaution.csv")).to_numpy()}
symptom_desc = {row[0].strip(): row[1] for row in pd.read_csv(os.path.join("datasets", "disease symptom prediction", "symptom_Description.csv")).to_numpy()}
symptoms = pd.read_csv(os.path.join("datasets", "disease symptom prediction", "Symptom-severity.csv")).to_numpy()

In [473]:
# Gets a set of all the inflictions
data = pd.read_csv(os.path.join("datasets", "disease symptom prediction", "dataset.csv"))
arr = data.to_numpy()

allInflictions = set()
for row in arr:
    row = row[1:]
    for item in row:
        if type(item) is str and item not in allInflictions:
            allInflictions.add(item.strip())

In [469]:
# associations every infliction with the column in the data set
counter = 0
indexer = {}
for item in allInflictions:
    indexer[item] = counter
    counter += 1

In [471]:
# converts list of symptoms to be array of 0 and 1s
structured = [] # 2D array of data [[0s/1s], [diseasename]]
for row in arr:
    infliction = row[0].strip()
    row = row[1:]
    newRow = [0 for i in range(counter)]
    for symptom in row:
        if type(symptom) is str:
            newRow[indexer[symptom.strip()]] = 1
    structured.append([newRow, infliction])

df = pd.DataFrame(structured)
df

Unnamed: 0,0,1
0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Healthy
1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Healthy
2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Healthy
3,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Healthy
4,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Healthy
...,...,...
4923,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",(vertigo) Paroymsal Positional Vertigo
4924,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",Acne
4925,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Urinary tract infection
4926,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",Psoriasis


In [415]:
ordered_list = sorted((value, key) for key, value in indexer.items())
ordered_words_only = [item[1].strip() for item in ordered_list]

In [472]:
# maps the illnesses to a unique integer to use for learning
learning_data = df.to_numpy()

unique_int_to_illness = {k: v for k, v in enumerate(symptom_desc.keys())}
illness_to_unique_int = {v: k for k, v in unique_int_to_illness.items()}
map_values = np.vectorize(lambda x : illness_to_unique_int[x])

learning_data[:, 1] = map_values(learning_data[:, 1])

In [417]:
X = list(learning_data[:, 0])
Y = list(learning_data[:, 1])

In [452]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33)

In [419]:
# Or use all of data for training
# Xtrain = X
# Ytrain = Y

In [453]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import zero_one_loss

logistic_classifier = LogisticRegression().fit(Xtrain, Ytrain)
print(f"Training error rate: {zero_one_loss(Ytrain, logistic_classifier.predict(Xtrain))}\
      \nValidation error rate: {zero_one_loss(Ytest, logistic_classifier.predict(Xtest))}")


Training error rate: 0.0      
Validation error rate: 0.0


In [454]:
from sklearn.tree import DecisionTreeClassifier
decisiontree_classifier = DecisionTreeClassifier().fit(Xtrain, Ytrain)
print(f"Training error rate: {zero_one_loss(Ytrain, decisiontree_classifier.predict(Xtrain))}\
      \nValidation error rate: {zero_one_loss(Ytest, decisiontree_classifier.predict(Xtest))}")

Training error rate: 0.0      
Validation error rate: 0.003687768899815591


In [455]:
from sklearn.ensemble import RandomForestClassifier

randomforest_classifier = RandomForestClassifier().fit(Xtrain, Ytrain)
print(f"Training error rate: {zero_one_loss(Ytrain, randomforest_classifier.predict(Xtrain))}\
      \nValidation error rate: {zero_one_loss(Ytest, randomforest_classifier.predict(Xtest))}")

Training error rate: 0.0      
Validation error rate: 0.0


In [474]:
# Generate n models and take the average of their error
sum_logreg_train = 0
sum_dectree_train = 0
sum_ranfor_train = 0
sum_logreg_test = 0
sum_dectree_test = 0
sum_ranfor_test = 0

n = 10

for _ in range(n):
    print(f"Performing iteration {_}")
    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.33)

    logistic_classifier = LogisticRegression().fit(Xtrain, Ytrain)
    sum_logreg_train += zero_one_loss(Ytrain, logistic_classifier.predict(Xtrain))
    sum_logreg_test += zero_one_loss(Ytest, decisiontree_classifier.predict(Xtest))

    decisiontree_classifier = DecisionTreeClassifier().fit(Xtrain, Ytrain)
    sum_dectree_train += zero_one_loss(Ytrain, decisiontree_classifier.predict(Xtrain))
    sum_dectree_test += zero_one_loss(Ytest, decisiontree_classifier.predict(Xtest))

    randomforest_classifier = RandomForestClassifier().fit(Xtrain, Ytrain)
    sum_ranfor_train += zero_one_loss(Ytrain, randomforest_classifier.predict(Xtrain))
    sum_ranfor_test += zero_one_loss(Ytest, randomforest_classifier.predict(Xtest))

print(f"LogReg Training Error: {sum_logreg_train / 1000}\nLogReg Testing Error: {sum_logreg_test / 1000}\n\n")
print(f"DecTree Training Error: {sum_dectree_train / 1000}\nDecTree Testing Error: {sum_dectree_test / 1000}\n\n")
print(f"RanFor Training Error: {sum_ranfor_train / 1000}\nRanFor Testing Error: {sum_ranfor_test / 1000}")

Performing iteration 0
Performing iteration 1
Performing iteration 2
Performing iteration 3
Performing iteration 4
Performing iteration 5
Performing iteration 6
Performing iteration 7
Performing iteration 8
Performing iteration 9
LogReg Training Error: 0.0
LogReg Testing Error: 4.917025199754121e-06


DecTree Training Error: 0.0
DecTree Testing Error: 1.1063306699446773e-05


RanFor Training Error: 0.0
RanFor Testing Error: 3.687768899815591e-06


In [None]:
import ipywidgets as widgets
from IPython.display import display

input_symptoms = [0] * len(indexer)

# Sample options for checkboxes
options = sorted(list(indexer.keys()))

# Number of options per line
options_per_line = 5

# Calculate the number of rows needed
num_rows = len(options) // options_per_line + (len(options) % options_per_line > 0)

# Create a list to store HBoxes for each row
checkbox_rows = []

# Iterate over rows and create an HBox for each row
for i in range(num_rows):
    start_index = i * options_per_line
    end_index = (i + 1) * options_per_line
    checkboxes_in_row = [widgets.Checkbox(value=False, description=option) for option in options[start_index:end_index]]
    row_layout = widgets.HBox(checkboxes_in_row)
    checkbox_rows.append(row_layout)

# Display VBox to arrange rows vertically
checkboxes_layout = widgets.VBox(checkbox_rows)
display(checkboxes_layout)

# Function to handle checkbox value changes
def on_checkbox_change(change):
    checkbox = change['owner']
    checkbox_value = change['new']
    input_symptoms[indexer[checkbox.description]] = 1 if checkbox_value else 0

# Attach the function to each checkbox's value change event
for row_layout in checkbox_rows:
    for checkbox in row_layout.children:
        checkbox.observe(on_checkbox_change, names='value')

VBox(children=(HBox(children=(Checkbox(value=False, description='abdominal_pain'), Checkbox(value=False, descr…

In [468]:
for classifier in [logistic_classifier, decisiontree_classifier, randomforest_classifier]:
    pred = unique_int_to_illness[classifier.predict([input_symptoms])[0]]
    print(f"{type(classifier).__name__}: {pred}")
    print(symptom_desc[pred])
    print("Precautions to take:")
    for prec in symptom_precaution[pred]:
        # check for nan value
        if prec == prec:
            print(f"\t- {prec}")
    print("\n")

LogisticRegression: Diabetes
Diabetes is a disease that occurs when your blood glucose, also called blood sugar, is too high. Blood glucose is your main source of energy and comes from the food you eat. Insulin, a hormone made by the pancreas, helps glucose from food get into your cells to be used for energy.
Precautions to take:
	- have balanced diet
	- exercise
	- consult doctor
	- follow up


DecisionTreeClassifier: Diabetes
Diabetes is a disease that occurs when your blood glucose, also called blood sugar, is too high. Blood glucose is your main source of energy and comes from the food you eat. Insulin, a hormone made by the pancreas, helps glucose from food get into your cells to be used for energy.
Precautions to take:
	- have balanced diet
	- exercise
	- consult doctor
	- follow up


RandomForestClassifier: Diabetes
Diabetes is a disease that occurs when your blood glucose, also called blood sugar, is too high. Blood glucose is your main source of energy and comes from the food 