# Install requirements

In [20]:
%pip install scikit-learn pandas joblib graphviz

Collecting graphviz
  Obtaining dependency information for graphviz from https://files.pythonhosted.org/packages/91/4c/e0ce1ef95d4000ebc1c11801f9b944fa5910ecc15b5e351865763d8657f8/graphviz-0.21-py3-none-any.whl.metadata
  Downloading graphviz-0.21-py3-none-any.whl.metadata (12 kB)
Downloading graphviz-0.21-py3-none-any.whl (47 kB)
   ---------------------------------------- 0.0/47.3 kB ? eta -:--:--
   -------- ------------------------------- 10.2/47.3 kB ? eta -:--:--
   ----------------- ---------------------- 20.5/47.3 kB 330.3 kB/s eta 0:00:01
   ---------------------------------- ----- 41.0/47.3 kB 393.8 kB/s eta 0:00:01
   ---------------------------------------- 47.3/47.3 kB 336.9 kB/s eta 0:00:00
Installing collected packages: graphviz
Successfully installed graphviz-0.21
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


# Load machine learning Library

In [21]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Load Dataset

In [22]:
df = pd.read_csv('dataset/dataset.csv')

# check columns names

In [23]:
print(df.columns)


Index(['Disease', 'Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4',
       'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9',
       'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14',
       'Symptom_15', 'Symptom_16', 'Symptom_17'],
      dtype='object')


# Extract the symptoms columns

In [24]:
symptom_cols = [col for col in df.columns if col.startswith('Symptom')]
print(symptom_cols)

['Symptom_1', 'Symptom_2', 'Symptom_3', 'Symptom_4', 'Symptom_5', 'Symptom_6', 'Symptom_7', 'Symptom_8', 'Symptom_9', 'Symptom_10', 'Symptom_11', 'Symptom_12', 'Symptom_13', 'Symptom_14', 'Symptom_15', 'Symptom_16', 'Symptom_17']


# copy all symptoms names to sypmtom_list

In [25]:
df['symptom_list'] = df[symptom_cols].values.tolist()
print(df['symptom_list'].head())

0    [itching,  skin_rash,  nodal_skin_eruptions,  ...
1    [ skin_rash,  nodal_skin_eruptions,  dischromi...
2    [itching,  nodal_skin_eruptions,  dischromic _...
3    [itching,  skin_rash,  dischromic _patches, na...
4    [itching,  skin_rash,  nodal_skin_eruptions, n...
Name: symptom_list, dtype: object


In [26]:
df['symptom_list'] = df['symptom_list'].apply(lambda lst: [str(sym).strip() for sym in lst if isinstance(sym, str) and sym.strip() != ''])
print(df['symptom_list'])


0       [itching, skin_rash, nodal_skin_eruptions, dis...
1       [skin_rash, nodal_skin_eruptions, dischromic _...
2       [itching, nodal_skin_eruptions, dischromic _pa...
3               [itching, skin_rash, dischromic _patches]
4              [itching, skin_rash, nodal_skin_eruptions]
                              ...                        
4915    [vomiting, headache, nausea, spinning_movement...
4916    [skin_rash, pus_filled_pimples, blackheads, sc...
4917    [burning_micturition, bladder_discomfort, foul...
4918    [skin_rash, joint_pain, skin_peeling, silver_l...
4919    [skin_rash, high_fever, blister, red_sore_arou...
Name: symptom_list, Length: 4920, dtype: object


In [27]:
mlb = MultiLabelBinarizer()
X = mlb.fit_transform(df['symptom_list'])
print(X)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [28]:
print(f"Features shape: {X.shape}")

Features shape: (4920, 131)


In [29]:
le = LabelEncoder()
y = le.fit_transform(df['Disease'])

In [30]:
print(f"Classes: {list(le.classes_)}")
print(len(le.classes_))

Classes: ['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne', 'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Bronchial Asthma', 'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis', 'Common Cold', 'Dengue', 'Diabetes ', 'Dimorphic hemmorhoids(piles)', 'Drug Reaction', 'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Hypertension ', 'Hyperthyroidism', 'Hypoglycemia', 'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine', 'Osteoarthristis', 'Paralysis (brain hemorrhage)', 'Peptic ulcer diseae', 'Pneumonia', 'Psoriasis', 'Tuberculosis', 'Typhoid', 'Urinary tract infection', 'Varicose veins', 'hepatitis A']
41


In [31]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [32]:
clf = RandomForestClassifier(n_estimators=100,max_depth=10,min_samples_split=5, random_state=42)
clf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,10
,min_samples_split,5
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [33]:
y_pred = clf.predict(X_test)


print(y_test==y_pred)

[ True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  T

In [34]:
import joblib

# Save the model and label encoders
joblib.dump(clf, 'random_forest_disease_model.pkl')
joblib.dump(le, 'label_encoder.pkl')
joblib.dump(mlb, 'symptom_binarizer.pkl')

['symptom_binarizer.pkl']

In [35]:
# for web app

clf = joblib.load('random_forest_disease_model.pkl')
le = joblib.load('label_encoder.pkl')
mlb = joblib.load('symptom_binarizer.pkl')

In [36]:
# prediction
input = ["headache","chest_pain","dizziness","lack_of_concentration"] #symptoms
new_X = mlb.transform([input])
print(new_X)
pred_label_num = clf.predict(new_X)[0]

# Decode label back to disease name
pred_disease = le.inverse_transform([pred_label_num])[0]
print("Predicted disease:", pred_disease)
_proba = clf.predict_proba(new_X)
print(_proba)

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
  0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
Predicted disease: Hypertension 
[[0.01756239 0.02275366 0.02589969 0.01917915 0.02446764 0.01614077
  0.02339388 0.02068144 0.01273732 0.02334356 0.00845739 0.01213304
  0.01127265 0.01909884 0.02520999 0.02197918 0.026668   0.02479616
  0.0323629  0.0142765  0.02163062 0.02181508 0.01284129 0.23350462
  0.01321768 0.01777256 0.00568587 0.01911115 0.02401829 0.0256751
  0.02226199 0.01510631 0.02340606 0.01630616 0.02228459 0.01638051
  0.01571212 0.01708158 0.02425217 0.01401132 0.01551079]]


In [37]:
print(clf.predict(new_X)[0])
le.inverse_transform([23])

23


array(['Hypertension '], dtype=object)

# Extra visualization


In [None]:
from sklearn.tree import export_graphviz
import graphviz
# warning
# it worked well in linux , but failed in my windows

tree = clf.estimators_[0]


dot_data = export_graphviz(
    tree,
    out_file=None,  
    feature_names=X_train.columns if hasattr(X_train, 'columns') else None,
    class_names=[str(cls) for cls in clf.classes_],
    filled=True,
    rounded=True,
    special_characters=True
)


graph = graphviz.Source(dot_data)
graph.render("tree_0")  
graph.view()            


In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
from sklearn.model_selection import cross_val_score
scores = cross_val_score(clf, X, y, cv=5)  # 5-fold CV
print("Cross-validated accuracy:", scores.mean())