In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Load your data
train_df = pd.read_csv("Training.csv")

# Separate features and target
X = train_df.drop(columns=['prognosis'])
y = train_df['prognosis']

# Check and fix any infinite or NaN values
X.replace([np.inf, -np.inf], np.nan, inplace=True)
X.fillna(0, inplace=True)

# Encode the target labels
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Double-check: any remaining NaNs?
assert not np.any(np.isnan(X)), "Still contains NaNs"
assert not np.any(np.isinf(X.to_numpy())), "Still contains Infs"

# Train-test split
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Train RandomForest
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predict and evaluate
y_pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred, target_names=le.classes_))


Accuracy: 1.0
                                         precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00        18
                                   AIDS       1.00      1.00      1.00        30
                                   Acne       1.00      1.00      1.00        24
                    Alcoholic hepatitis       1.00      1.00      1.00        25
                                Allergy       1.00      1.00      1.00        24
                              Arthritis       1.00      1.00      1.00        23
                       Bronchial Asthma       1.00      1.00      1.00        33
                   Cervical spondylosis       1.00      1.00      1.00        23
                            Chicken pox       1.00      1.00      1.00        21
                    Chronic cholestasis       1.00      1.00      1.00        15
                            Common Cold       1.00      1.00      1.00        23
             

In [16]:
# Step 1: Load and prepare test data
test_df = pd.read_csv("Testing.csv")

# Step 2: Drop 'prognosis' column if it exists
if 'prognosis' in test_df.columns:
    test_df = test_df.drop(columns=['prognosis'])

# Step 3: Ensure all missing columns from training data are added
# 'X' is the feature set from training (before split)
missing_cols = set(X.columns) - set(test_df.columns)
for col in missing_cols:
    test_df[col] = 0  # Add missing symptom columns with 0s

# Step 4: Reorder test_df columns to match training data
test_df = test_df[X.columns]

# Step 5: Clean test data
test_df.replace([np.inf, -np.inf], np.nan, inplace=True)
test_df.fillna(0, inplace=True)

# Step 6: Predict
test_preds = model.predict(test_df)
test_diseases = le.inverse_transform(test_preds)

# Step 7: Display predictions
print("Predicted Diseases:")
for i, disease in enumerate(test_diseases, 1):
    print(f"{i}. {disease}")



Predicted Diseases:
1. Fungal infection
2. Allergy
3. GERD
4. Chronic cholestasis
5. Drug Reaction
6. Peptic ulcer diseae
7. AIDS
8. Diabetes 
9. Gastroenteritis
10. Bronchial Asthma
11. Hypertension 
12. Migraine
13. Cervical spondylosis
14. Paralysis (brain hemorrhage)
15. Jaundice
16. Malaria
17. Chicken pox
18. Dengue
19. Typhoid
20. hepatitis A
21. Hepatitis B
22. Hepatitis C
23. Hepatitis D
24. Hepatitis E
25. Alcoholic hepatitis
26. Tuberculosis
27. Common Cold
28. Pneumonia
29. Dimorphic hemmorhoids(piles)
30. Heart attack
31. Varicose veins
32. Hypothyroidism
33. Hyperthyroidism
34. Hypoglycemia
35. Osteoarthristis
36. Arthritis
37. (vertigo) Paroymsal  Positional Vertigo
38. Acne
39. Urinary tract infection
40. Psoriasis
41. Impetigo
42. Impetigo


In [17]:
def predict_disease_from_symptoms(input_symptoms, model, label_encoder, all_symptoms):
    """
    input_symptoms: List of symptoms (strings) provided by the user.
    model: Trained classifier.
    label_encoder: Fitted LabelEncoder for disease names.
    all_symptoms: List of all symptom columns used in training.
    """
    import numpy as np
    input_data = [1 if symptom in input_symptoms else 0 for symptom in all_symptoms]
    input_array = np.array(input_data).reshape(1, -1)
    prediction = model.predict(input_array)
    predicted_disease = label_encoder.inverse_transform(prediction)[0]
    return predicted_disease
