In [1]:
# Import necessary modules
import pandas as pd
import numpy as np

# Visualisation
import matplotlib.pyplot as plt
import matplotlib as mpl

# For balancing data
from imblearn.over_sampling import SMOTE

In [2]:
diseases_df = pd.read_csv('diseases.csv')
diseases_df.head()

Unnamed: 0,fever,headache,nausea,vomiting,fatigue,joint_pain,skin_rash,cough,weight_loss,yellow_eyes,disease
0,1,0,1,0,1,0,0,1,1,0,Paralysis (brain hemorrhage)
1,0,0,1,0,0,0,1,1,0,0,Paralysis (brain hemorrhage)
2,0,0,1,0,1,0,1,1,0,0,Paralysis (brain hemorrhage)
3,0,0,1,1,1,0,1,1,1,1,Paralysis (brain hemorrhage)
4,1,0,1,0,1,0,0,0,1,1,Paralysis (brain hemorrhage)


In [3]:
X = diseases_df.drop(['disease'], axis=1)
X.head()

Unnamed: 0,fever,headache,nausea,vomiting,fatigue,joint_pain,skin_rash,cough,weight_loss,yellow_eyes
0,1,0,1,0,1,0,0,1,1,0
1,0,0,1,0,0,0,1,1,0,0
2,0,0,1,0,1,0,1,1,0,0
3,0,0,1,1,1,0,1,1,1,1
4,1,0,1,0,1,0,0,0,1,1


In [4]:
y = diseases_df['disease']
y.head()

0    Paralysis (brain hemorrhage)
1    Paralysis (brain hemorrhage)
2    Paralysis (brain hemorrhage)
3    Paralysis (brain hemorrhage)
4    Paralysis (brain hemorrhage)
Name: disease, dtype: object

In [5]:
oversample = SMOTE()
transformed_feature_df, transformed_label_df = oversample.fit_resample(X, y)

In [6]:
transformed_label_df.value_counts()

disease
Paralysis (brain hemorrhage)                      90
Hypertension                                      90
Hepatitis B                                       90
Impetigo                                          90
Chronic cholestasis                               90
Hepatitis C                                       90
Typhoid                                           90
Dimorphic hemorrhoids(piles)                      90
Vertigo (Benign paroxysmal Positional Vertigo)    90
Cervical spondylosis                              90
Tuberculosis                                      90
Hyperthyroidism                                   90
Malaria                                           90
Gastroenteritis                                   90
Osteoarthritis                                    90
Heart attack                                      90
Dengue                                            90
Pneumonia                                         90
Urinary tract infection               

In [7]:
transformed_feature_df.head()

Unnamed: 0,fever,headache,nausea,vomiting,fatigue,joint_pain,skin_rash,cough,weight_loss,yellow_eyes
0,1,0,1,0,1,0,0,1,1,0
1,0,0,1,0,0,0,1,1,0,0
2,0,0,1,0,1,0,1,1,0,0
3,0,0,1,1,1,0,1,1,1,1
4,1,0,1,0,1,0,0,0,1,1


In [8]:
# Split the data
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, confusion_matrix, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [9]:
X_train, X_test, y_train, y_test = train_test_split(transformed_feature_df, transformed_label_df, test_size=0.3)

In [10]:
# Linear SVC, KNeighborsClassifier, SVC, Ensemble Classifiers

In [11]:
C = 10
classifiers = {
    'Linear SVC': SVC(kernel='linear', C=C, probability=True, random_state=0),
    'KNN Classifier': KNeighborsClassifier(10),
    'SVC': SVC(),
    'RFST': RandomForestClassifier(n_estimators=100),
    'ADA': AdaBoostClassifier(n_estimators=100)
}

In [12]:
n_classifiers = len(classifiers)

for index, (name, classifier) in enumerate(classifiers.items()):
    classifier.fit(X_train, y_train)

    print(name)
    y_pred = classifier.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy for %s: %0.1f%% "% (name, accuracy * 100))
    print(classification_report(y_test, y_pred))

Linear SVC
Accuracy for Linear SVC: 48.7% 
                                                precision    recall  f1-score   support

                                          AIDS       0.73      0.53      0.62        30
                                          Acne       0.58      0.50      0.54        22
                           Alcoholic hepatitis       0.62      0.93      0.75        27
                                       Allergy       0.35      0.57      0.43        23
                                     Arthritis       0.40      0.40      0.40        30
                              Bronchial Asthma       0.42      0.58      0.49        31
                          Cervical spondylosis       0.47      0.65      0.55        23
                           Chronic cholestasis       0.40      0.40      0.40        30
                                        Dengue       0.41      0.54      0.47        26
                                      Diabetes       0.74      0.59      0.6

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [13]:
from collections import Counter

possible_symptoms = ['fever', 'headache', 'nausea', 'vomiting', 'fatigue', 'joint_pain', 'skin_rash', 'cough', 'weight_loss', 'yellow_eyes']
def predictDisease(symptoms):
    predictions = []
    input_data = [0] * 10
    
    for i, symptom in enumerate(symptoms.split(',')):
        ind = possible_symptoms.index(symptom)
        input_data[ind] = 1

    print(input_data)
    
    for index, (name, classifier) in enumerate(classifiers.items()):
        classifier.fit(X_train, y_train)
        sample = pd.DataFrame([input_data], columns=X_train.columns)
        prediction = classifier.predict(sample)
        predictions.append(prediction[0])

    print(predictions)
    
    disease = Counter(predictions).most_common(1)[0][0]
    print(f"Most likely disease: {disease}")
    

In [14]:
# 0	1	1	1	0	1	0	1	0	0 - hypoglycemia
# 0	0	0	0	0	0	1	0	1	1 - Dimorphic hemorrhoids(piles)
predictDisease('yellow_eyes,weight_loss,skin_rash')

[0, 0, 0, 0, 0, 0, 1, 0, 1, 1]
['Dimorphic hemorrhoids(piles)', 'Dimorphic hemorrhoids(piles)', 'Dimorphic hemorrhoids(piles)', 'Dimorphic hemorrhoids(piles)', 'Diabetes']
Most likely disease: Dimorphic hemorrhoids(piles)
