# Install requirements

In [1]:
%pip install scikit-learn pandas joblib graphviz

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Load machine learning Library

In [2]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load Dataset

In [3]:
df = pd.read_csv('dataset/dataset.csv')

# check columns names

In [4]:
print(df.columns)


Index(['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17'],
      dtype='object')


# Extract the symptoms columns

In [5]:
symptom_cols = [col for col in df.columns if col.startswith('Symptom')]
print(symptom_cols)

['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9', 'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14', 'Symptom_15', 'Symptom_16', 'Symptom_17']


# copy all symptoms names to sypmtom_list

In [6]:
df['symptom_list'] = df[symptom_cols].values.tolist()
print(df['symptom_list'].head())

0    [itching,  skin_rash,  nodal_skin_eruptions,  ...
1    [ skin_rash,  nodal_skin_eruptions,  dischromi...
2    [itching,  nodal_skin_eruptions,  dischromic _...
3    [itching,  skin_rash,  dischromic _patches, na...
4    [itching,  skin_rash,  nodal_skin_eruptions, n...
Name: symptom_list, dtype: object


In [7]:
df['symptom_list'] = df['symptom_list'].apply(lambda lst: [str(sym).strip() for sym in lst if isinstance(sym, str) and sym.strip() != ''])
print(df['symptom_list'])


0       [itching, skin_rash, nodal_skin_eruptions, dis...
1       [skin_rash, nodal_skin_eruptions, dischromic _...
2       [itching, nodal_skin_eruptions, dischromic _pa...
3               [itching, skin_rash, dischromic _patches]
4              [itching, skin_rash, nodal_skin_eruptions]
                              ...                        
4915    [vomiting, headache, nausea, spinning_movement...
4916    [skin_rash, pus_filled_pimples, blackheads, sc...
4917    [burning_micturition, bladder_discomfort, foul...
4918    [skin_rash, joint_pain, skin_peeling, silver_l...
4919    [skin_rash, high_fever, blister, red_sore_arou...
Name: symptom_list, Length: 4920, dtype: object


# initialize multilabel Binarizer, because we have multiple symptoms for one disease

In [8]:
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['symptom_list'])
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [None]:
# To know the shape of label matrix

In [9]:
print(f"Features shape: {X.shape}")

Features shape: (4920, 131)


# Encode target values (Diseases)

In [10]:
le = LabelEncoder()
y = le.fit_transform(df['Disease'])

# we have 41 diseases, print diseses classes

In [11]:
print(f"Classes: {list(le.classes_)}")
print(len(le.classes_))

Classes: ['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne', 'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Bronchial Asthma', 'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis', 'Common Cold', 'Dengue', 'Diabetes ', 'Dimorphic hemmorhoids(piles)', 'Drug Reaction', 'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Hypertension ', 'Hyperthyroidism', 'Hypoglycemia', 'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine', 'Osteoarthristis', 'Paralysis (brain hemorrhage)', 'Peptic ulcer diseae', 'Pneumonia', 'Psoriasis', 'Tuberculosis', 'Typhoid', 'Urinary tract infection', 'Varicose veins', 'hepatitis A']
41


# split 80% dataset for training, 20% for testing

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
clf = RandomForestClassifier(n_estimators=100,max_depth=10,min_samples_split=5, random_state=42)
clf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [14]:
y_pred = clf.predict(X_test)


print(y_test==y_pred)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  T

In [None]:
import joblib

# Save the model and label encoders for later use
joblib.dump(clf, 'random_forest_disease_model.pkl')
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(mlb, 'symptom_binarizer.pkl')

['symptom_binarizer.pkl']

In [None]:
# for web app, load pre-trained model and encoders

clf = joblib.load('random_forest_disease_model.pkl')
le = joblib.load('label_encoder.pkl')
mlb = joblib.load('symptom_binarizer.pkl')

In [None]:
# prediction
input = ["headache","chest_pain","dizziness","lack_of_concentration"] #symptoms
new_X = mlb.transform([input]) # one-hot encode the symptoms
print(new_X) # printing for  just debugging
pred_label_num = clf.predict(new_X)[0] # Prediction phase, Get the predicted label number

# Decode label back to disease name
pred_disease = le.inverse_transform([pred_label_num])[0]
print("Predicted disease:", pred_disease)

# Get the probabilities of each class
_proba = clf.predict_proba(new_X)
print(_proba)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Predicted disease: Hypertension 
[[0.01756239 0.02275366 0.02589969 0.01917915 0.02446764 0.01614077
  0.02339388 0.02068144 0.01273732 0.02334356 0.00845739 0.01213304
  0.01127265 0.01909884 0.02520999 0.02197918 0.026668   0.02479616
  0.0323629  0.0142765  0.02163062 0.02181508 0.01284129 0.23350462
  0.01321768 0.01777256 0.00568587 0.01911115 0.02401829 0.0256751
  0.02226199 0.01510631 0.02340606 0.01630616 0.02228459 0.01638051
  0.01571212 0.01708158 0.02425217 0.01401132 0.01551079]]


In [None]:
# useless code, just for debugging
print(clf.predict(new_X)[0])
le.inverse_transform([23])

23


array(['Hypertension '], dtype=object)

# Extra visualization


In [19]:
from sklearn.tree import export_graphviz
import graphviz
# warning
# it worked well in linux , but failed in my windows

tree = clf.estimators_[0]


dot_data = export_graphviz(
    tree,
    out_file=None,  
    feature_names=X_train.columns if hasattr(X_train, 'columns') else None,
    class_names=[str(cls) for cls in clf.classes_],
    filled=True,
    rounded=True,
    special_characters=True
)


graph = graphviz.Source(dot_data)
graph.render("tree_0")  
graph.view()            


ExecutableNotFound: failed to execute WindowsPath('dot'), make sure the Graphviz executables are on your systems' PATH

In [None]:
import pandas as pd

df = pd.read_csv('dataset/dataset.csv')
symptom_list = []
symptom_cols = [col for col in df.columns if col.startswith('Symptom')]

for col in symptom_cols:
    for sym in df[col]:
        sym_clean = str(sym).strip()  
        if sym_clean and sym_clean not in symptom_list:
            symptom_list.append(sym_clean)

print(symptom_list)
print(len(symptom_list))


['itching', 'skin_rash', 'continuous_sneezing', 'shivering', 'stomach_pain', 'acidity', 'vomiting', 'indigestion', 'muscle_wasting', 'patches_in_throat', 'fatigue', 'weight_loss', 'sunken_eyes', 'cough', 'headache', 'chest_pain', 'back_pain', 'weakness_in_limbs', 'chills', 'joint_pain', 'yellowish_skin', 'constipation', 'pain_during_bowel_movements', 'breathlessness', 'cramps', 'weight_gain', 'mood_swings', 'neck_pain', 'muscle_weakness', 'stiff_neck', 'pus_filled_pimples', 'burning_micturition', 'bladder_discomfort', 'high_fever', 'nodal_skin_eruptions', 'ulcers_on_tongue', 'loss_of_appetite', 'restlessness', 'dehydration', 'dizziness', 'weakness_of_one_body_side', 'lethargy', 'nausea', 'abdominal_pain', 'pain_in_anal_region', 'sweating', 'bruising', 'cold_hands_and_feets', 'anxiety', 'knee_pain', 'swelling_joints', 'blackheads', 'foul_smell_of urine', 'skin_peeling', 'blister', 'dischromic _patches', 'watering_from_eyes', 'extra_marital_contacts', 'diarrhoea', 'loss_of_balance', 'blu