In [4]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Read the symptom table and replace every missing cell with the
# 'No Symptom' placeholder in one chained, non-mutating step.
# The fill applies to the whole frame, so a missing 'Disease' value
# (if any exist) would also become 'No Symptom'.
data = pd.read_csv('symptoms_data.csv').fillna('No Symptom')

# Preview the first rows as plain text.
print(data.head())


      Disease        Symptom_1        Symptom_2             Symptom_3  \
0  Tick fever            Fever  Nasal Discharge              Lameness   
1  Tick fever            Fever         Lameness   Swollen Lymph nodes   
2  Tick fever            Fever  Nasal Discharge              Lethargy   
3  Tick fever            Fever  Nasal Discharge              Lameness   
4  Tick fever  Nasal Discharge      Weight Loss  Breathing Difficulty   

             Symptom_4                         Symptom_5  \
0             Lethargy  Increased drinking and urination   
1             Vomiting            Neurological Disorders   
2  Swollen Lymph nodes                        No Symptom   
3             Vomiting            Neurological Disorders   
4   Heart Complication                          Vomiting   

                Symptom_6   Symptom_7  
0  Neurological Disorders  No Symptom  
1              No Symptom  No Symptom  
2              No Symptom  No Symptom  
3              No Symptom  No Symptom  


In [5]:
# Every column after the first one (the 'Disease' label) is one-hot encoded.
# drop_first=True removes the first category of each encoded column, so the
# dummy columns are named '<column>_<value>' for the remaining categories.
symptom_columns = data.columns[1:]
data_encoded = pd.get_dummies(
    data,
    columns=symptom_columns,
    drop_first=True,
)

# Preview the encoded frame as plain text.
print(data_encoded.head())

      Disease  Symptom_1_Bad breath  Symptom_1_Bleeding of gum  \
0  Tick fever                 False                      False   
1  Tick fever                 False                      False   
2  Tick fever                 False                      False   
3  Tick fever                 False                      False   
4  Tick fever                 False                      False   

   Symptom_1_Bloated Stomach  Symptom_1_Blood in urine  \
0                      False                     False   
1                      False                     False   
2                      False                     False   
3                      False                     False   
4                      False                     False   

   Symptom_1_Breathing Difficulty  Symptom_1_Burping  Symptom_1_Cataracts  \
0                           False              False                False   
1                           False              False                False   
2                      

In [6]:
# Target vector: the disease label of each row.
y = data_encoded['Disease']

# Feature matrix: every encoded column except the label.
X = data_encoded.drop(columns='Disease')

In [7]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes
# the shuffle (and therefore the split) repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Random forest with 100 trees; random_state pins the estimator's
# randomness so repeated runs build the same forest.
rf_model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)

In [9]:
# Train the forest on the training split. fit() returns the estimator,
# which the notebook displays as this cell's output.
rf_model.fit(X=X_train, y=y_train)

In [10]:
# Predict a disease label for every row of the held-out split.
y_pred = rf_model.predict(X=X_test)

In [11]:
# Overall fraction of held-out rows whose predicted label matches the truth.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy * 100:.2f}%')

# Per-class precision / recall / F1 and support on the held-out split.
# NOTE(review): a near-perfect score can indicate identical rows on both
# sides of the split -- worth checking the CSV for duplicates; TODO confirm.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

Accuracy: 99.92%
                          precision    recall  f1-score   support

               Allergies       1.00      1.00      1.00       412
                 Cancers       1.00      0.99      1.00       392
 Chronic kidney Disease        1.00      1.00      1.00       400
                Diabetes       1.00      1.00      1.00       406
               Distemper       1.00      1.00      1.00       447
Gastrointestinal Disease       0.99      1.00      1.00       382
              Gingitivis       1.00      1.00      1.00       407
              Hepatitis        1.00      0.99      1.00       385
              Parvovirus       1.00      1.00      1.00       398
             Skin Rashes       1.00      1.00      1.00       410
                Tetanus        1.00      1.00      1.00       396
              Tick fever       0.99      1.00      1.00       365

                accuracy                           1.00      4800
               macro avg       1.00      1.00      1.00  

In [12]:



import pickle

# Persist the fitted classifier so it can be reloaded without retraining.
# Pickle files can execute arbitrary code when loaded; only load this
# artifact from a trusted location.
with open('rf_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

# Persist the feature column names, in training order, so inference input
# can be reindexed to exactly the layout the model was fitted on.
# Taken from X (the feature matrix) rather than data_encoded: data_encoded
# still contains the 'Disease' target column, which is not a model feature
# and would make the saved list disagree with the fitted model.
training_columns = list(X.columns)

with open('training_columns.pkl', 'wb') as f:
    pickle.dump(training_columns, f)

