In [47]:
# Import Dependencies
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import pickle

In [48]:
# Read Raw Dataset
df = pd.read_excel('./dataset/raw_data.xlsx')

In [49]:
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [50]:
# Fill all NaN with the values above
data = df.fillna(method='ffill')

In [51]:
data.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall


In [52]:
# Process Disease and Symptom Names
def process_data(data):
    data_list = []
    data_name = data.replace('^','_').split('_')
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names)
        n += 1
    return data_list

In [53]:
# Data Cleanup
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in data.iterrows():
    
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_data(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

In [54]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list,
            {'hypertensive disease': ['pain chest',
              'shortness of breath',
              'dizziness',
              'asthenia',
              'fall',
              'syncope',
              'vertigo',
              'sweat',
              'sweating increased',
              'palpitation',
              'nausea',
              'angina pectoris',
              'pressure chest'],
             'diabetes': ['polyuria',
              'polydypsia',
              'shortness of breath',
              'pain chest',
              'asthenia',
              'nausea',
              'orthopnea',
              'rale',
              'sweat',
              'sweating increased',
              'unresponsiveness',
              'mental status changes',
              'vertigo',
              'vomiting',
              'labored breathing'],
             'pneumonia': ['cough',
              'fever',
              'decreased translucency',
              'shortness of breath',
     

In [55]:
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count

{'hypertensive disease': 3363.0,
 'diabetes': 1421.0,
 'pneumonia': 1029.0,
 'failure heart congestive': 963.0,
 'accident\xa0cerebrovascular': 885.0,
 'infection': 630.0,
 'chronic obstructive airway disease': 524.0,
 'deep vein thrombosis': 310.0,
 'carcinoma': 269.0,
 'hepatitis C': 269.0,
 'kidney failure acute': 186.0,
 'osteoporosis': 169.0,
 'transient ischemic attack': 168.0,
 'malignant neoplasm of breast': 152.0,
 'carcinoma breast': 152.0,
 'failure heart': 138.0,
 'upper respiratory infection': 135.0,
 'hepatitis': 133.0,
 'hypertension pulmonary': 128.0,
 'hepatitis B': 111.0,
 'parkinson disease': 108.0,
 "Alzheimer's disease": 101.0,
 'kidney disease': 96.0,
 'malignant tumor of colon': 94.0,
 'carcinoma colon': 94.0,
 'respiratory failure': 90.0,
 'manic disorder': 85.0,
 'obesity morbid': 76.0,
 'chronic alcoholic intoxication': 70.0}

In [56]:
# Save cleaned data as CSV
f = open('./dataset/cleaned_data.csv', 'w')

with f:
    writer = csv.writer(f)
    for key, val in disease_symptom_dict.items():
        for i in range(len(val)):
            writer.writerow([key, val[i], disease_symptom_count[key]])

In [57]:
# Read Cleaned Data as DF
df = pd.read_csv('dataset/cleaned_data.csv',encoding='latin1')
df.columns = ['disease', 'symptom', 'occurence_count']
df.head()

Unnamed: 0,disease,symptom,occurence_count
0,hypertensive disease,shortness of breath,3363.0
1,hypertensive disease,dizziness,3363.0
2,hypertensive disease,asthenia,3363.0
3,hypertensive disease,fall,3363.0
4,hypertensive disease,syncope,3363.0


In [58]:
# Remove any rows with empty values
df.replace(float('nan'), np.nan, inplace=True)
df.dropna(inplace=True)

In [59]:
from sklearn import preprocessing

In [60]:
n_unique = len(df['symptom'].unique())
n_unique

197

In [61]:
df.dtypes

disease             object
symptom             object
occurence_count    float64
dtype: object

In [62]:
# Encode the Labels
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['symptom'])
print(integer_encoded)

[155  38  11  54 173 189 169 170 128 111   6 139 137 135 155 126  11 111
 121 144 169 170 184 106 189 191  97  28  59  33 155 144 140 134 196  18
  23 150  65 117 193  67  37 175 103 114 155 121  96 144  43  28 193  42
  11 163  53  77 184 153 118  59  50  33  79  23 143  34   2 171 124   8
  28 155 193  28  43  37 165  92  84 132  22 171 124  45 155 127  21 144
  50  90  24 117 147 105 124  99 181 176  32  82  62 107 117  27 183 131
  64  34 162  72 111 101   9  36  57  28   4 102  73  69  23  10 132  86
  90  88 120  78  11  91  34  68 184 142  54  80 159  36 189 119  17 124
 173 123   8 185 100 163  42  53  11 112 189 118 100  52   0 190  71 151
 173  35 145  25 104 105 131 149  50  35  99  51  20  43 171  60 104 105
 131 149  50  35  99  51  20  43 171  60 121  55  44  43 155 130  96 174
  90   6  89  28 177 193 155  97  59 167 117 161  94  46  12 136 134 114
  43 140  33 150 146   8 116 110  48  61  30 172  78   9 164 175 125 143
   7 178 141  23  37  59 191 155   0  74  18 112  4

In [63]:
# One Hot Encode the Labels
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [64]:
onehot_encoded[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [65]:
len(onehot_encoded[0])

197

In [66]:
cols = np.asarray(df['symptom'].unique())
cols

array(['shortness of breath', 'dizziness', 'asthenia', 'fall', 'syncope',
       'vertigo', 'sweat', 'sweating increased', 'palpitation', 'nausea',
       'angina pectoris', 'pressure chest', 'polyuria', 'polydypsia',
       'pain chest', 'orthopnea', 'rale', 'unresponsiveness',
       'mental status changes', 'vomiting', 'labored breathing', 'cough',
       'fever', 'decreased translucency', 'productive cough',
       'pleuritic pain', 'yellow sputum', 'breath sounds decreased',
       'chill', 'rhonchus', 'green sputum', 'non-productive cough',
       'wheezing', 'haemoptysis', 'distress respiratory', 'tachypnea',
       'malaise', 'night sweat', 'jugular venous distention', 'dyspnea',
       'dysarthria', 'speech slurred', 'facial paresis', 'hemiplegia',
       'seizure', 'numbness', 'erythema', 'hepatosplenomegaly',
       'pruritus', 'diarrhea', 'abscess bacterial', 'swelling', 'pain',
       'apyrexial', 'sputum purulent', 'hypoxemia', 'hypercapnia',
       'patient non complianc

In [67]:
# Create a new dataframe to save OHE labels
df_ohe = pd.DataFrame(columns = cols)
df_ohe.head()

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,hematocrit decreased,renal angle tenderness,feels hot/feverish,hoard,irritable mood,neologism,homelessness,sleeplessness,unconscious state,panic


In [68]:
for i in range(len(onehot_encoded)):
    df_ohe.loc[i] = onehot_encoded[i]

In [69]:
df_ohe.head()

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,hematocrit decreased,renal angle tenderness,feels hot/feverish,hoard,irritable mood,neologism,homelessness,sleeplessness,unconscious state,panic
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [70]:
len(df_ohe)

389

In [71]:
# Disease Dataframe
df_disease = df['disease']
df_disease.head()

0    hypertensive disease
1    hypertensive disease
2    hypertensive disease
3    hypertensive disease
4    hypertensive disease
Name: disease, dtype: object

In [72]:
# Concatenate OHE Labels with the Disease Column
df_concat = pd.concat([df_disease,df_ohe], axis=1)
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,hematocrit decreased,renal angle tenderness,feels hot/feverish,hoard,irritable mood,neologism,homelessness,sleeplessness,unconscious state,panic
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [73]:
df_concat.drop_duplicates(keep='first',inplace=True)

In [74]:
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,hematocrit decreased,renal angle tenderness,feels hot/feverish,hoard,irritable mood,neologism,homelessness,sleeplessness,unconscious state,panic
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [75]:
len(df_concat)

386

In [76]:
cols = df_concat.columns
cols

Index(['disease', 'shortness of breath', 'dizziness', 'asthenia', 'fall',
       'syncope', 'vertigo', 'sweat', 'sweating increased', 'palpitation',
       ...
       'hematocrit decreased', 'renal angle tenderness', 'feels hot/feverish',
       'hoard', 'irritable mood', 'neologism', 'homelessness', 'sleeplessness',
       'unconscious state', 'panic'],
      dtype='object', length=198)

In [77]:
cols = cols[1:]

In [78]:
# Since, every disease has multiple symptoms, combine all symptoms per disease per row
df_concat = df_concat.groupby('disease').sum()
df_concat = df_concat.reset_index()
df_concat[:5]

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,hematocrit decreased,renal angle tenderness,feels hot/feverish,hoard,irritable mood,neologism,homelessness,sleeplessness,unconscious state,panic
0,Alzheimer's disease,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,accident cerebrovascular,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,carcinoma,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,carcinoma breast,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,carcinoma colon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [79]:
len(df_concat)

29

In [80]:
df_concat.to_csv("./dataset/training_dataset.csv", index=False)

In [81]:
# One Hot Encoded Features
X = df_concat[cols]

# Labels
y = df_concat['disease']

## Model Training

In [82]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [83]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [84]:
len(X_train), len(y_train)

(23, 23)

In [85]:
len(X_test), len(y_test)

(6, 6)

In [86]:
# Initialize the models
clf_dt = RandomForestClassifier(random_state = 1)
clf_dt.fit(X,y)

RandomForestClassifier(random_state=1)

In [87]:
#the accuracy
clf_dt.score(X, y)

0.9310344827586207

In [88]:
disease_pred = clf_dt.predict(X)
print(disease_pred[3])

carcinoma breast


In [89]:
disease_real = y.values
print(disease_real[3])

carcinoma breast


In [90]:
for i in range(0, len(disease_real)):
    if disease_pred[i]!=disease_real[i]:
        print ('Pred: {0}\nActual: {1}\n'.format(disease_pred[i], disease_real[i]))

Pred: malignant tumor of colon
Actual: carcinoma colon

Pred: carcinoma breast
Actual: malignant neoplasm of breast



pickling objects

In [91]:
pickle.dump(clf_dt, open('model.pkl','wb'))

Initialize flask

In [92]:
#import libraries
import numpy as np
from flask import Flask, render_template,request
import pickle#Initialize the flask App
app = Flask(__name__)
models = pickle.load(open('model.pkl', 'rb'))