In [76]:
!pip install flask flask-cors joblib scikit-learn pandas numpy




In [77]:
from google.colab import files
uploaded = files.upload()


Saving Disease_symptom_and_patient_profile_dataset.csv to Disease_symptom_and_patient_profile_dataset.csv


In [78]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [79]:
df = pd.read_csv("Disease_symptom_and_patient_profile_dataset.csv")

TARGET_COLUMN = "Disease"

print("Dataset shape:", df.shape)
df.head()


Dataset shape: (349, 10)


Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive


In [80]:
df[TARGET_COLUMN].value_counts()


Unnamed: 0_level_0,count
Disease,Unnamed: 1_level_1
Asthma,23
Stroke,16
Osteoporosis,14
Diabetes,10
Hypertension,10
...,...
Schizophrenia,1
Gout,1
Testicular Cancer,1
Tonsillitis,1


In [81]:
MIN_SAMPLES = 6

valid_diseases = df[TARGET_COLUMN].value_counts()
valid_diseases = valid_diseases[valid_diseases >= MIN_SAMPLES].index

df_clean = df[df[TARGET_COLUMN].isin(valid_diseases)]

print("Remaining diseases:", df_clean[TARGET_COLUMN].nunique())
df_clean[TARGET_COLUMN].value_counts()


Remaining diseases: 21


Unnamed: 0_level_0,count
Disease,Unnamed: 1_level_1
Asthma,23
Stroke,16
Osteoporosis,14
Diabetes,10
Migraine,10
Hypertension,10
Bronchitis,8
Pneumonia,8
Influenza,8
Hyperthyroidism,7


In [82]:
X = df_clean.drop(columns=[TARGET_COLUMN])
y = df_clean[TARGET_COLUMN]


In [83]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

print("Disease classes:")
print(label_encoder.classes_)


Disease classes:
['Allergic Rhinitis' 'Anxiety Disorders' 'Asthma' 'Bronchitis'
 'Common Cold' 'Depression' 'Diabetes' 'Eczema' 'Gastroenteritis'
 'Hypertension' 'Hyperthyroidism' 'Hypothyroidism' 'Influenza'
 'Kidney Cancer' 'Liver Cancer' 'Migraine' 'Osteoarthritis' 'Osteoporosis'
 'Pneumonia' 'Rheumatoid Arthritis' 'Stroke']


In [84]:
categorical_cols = X.select_dtypes(include=["object"]).columns
numerical_cols = X.select_dtypes(exclude=["object"]).columns

print("Categorical columns:", len(categorical_cols))
print("Numerical columns:", len(numerical_cols))


Categorical columns: 8
Numerical columns: 1


In [85]:
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
        ("num", "passthrough", numerical_cols)
    ]
)

model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("classifier", RandomForestClassifier(
        n_estimators=300,
        random_state=42,
        class_weight="balanced"
    ))
])


In [86]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42
)


In [87]:
model.fit(X_train, y_train)
print("Model trained successfully âœ…")


Model trained successfully âœ…


In [88]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))


Accuracy: 0.21621621621621623

Classification Report:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         0
           1       0.00      0.00      0.00         2
           2       0.75      1.00      0.86         3
           3       0.00      0.00      0.00         2
           4       0.00      0.00      0.00         0
           6       0.00      0.00      0.00         1
           7       0.00      0.00      0.00         1
           8       0.00      0.00      0.00         1
           9       0.33      0.50      0.40         2
          10       0.50      1.00      0.67         2
          11       0.00      0.00      0.00         5
          12       0.00      0.00      0.00         0
          13       0.00      0.00      0.00         2
          14       0.00      0.00      0.00         2
          15       0.00      0.00      0.00         2
          16       0.00      0.00      0.00         2
          17       0.50   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [89]:
def predict_top_3(custom_values):
    sample = {}

    for col in X.columns:
        if col in custom_values:
            sample[col] = custom_values[col]
        else:
            sample[col] = np.random.choice(X[col].unique())

    sample_df = pd.DataFrame([sample])

    probs = model.predict_proba(sample_df)[0]
    top_idx = np.argsort(probs)[::-1][:3]

    return list(zip(
        label_encoder.inverse_transform(top_idx),
        probs[top_idx]
    ))


In [90]:
test_case_1 = {
    "cough": "high",
    "fever": "high",
    "breathlessness": "high",
    "fatigue": "high"
}

test_case_2 = {
    "headache": "high",
    "nausea": "high",
    "vomiting": "mid",
    "fever": "low"
}

test_case_3 = {
    "blood_pressure": "high",
    "chest_pain": "mid",
    "fatigue": "mid"
}

print("Case 1:", predict_top_3(test_case_1))
print("Case 2:", predict_top_3(test_case_2))
print("Case 3:", predict_top_3(test_case_3))


Case 1: [('Stroke', np.float64(0.22)), ('Diabetes', np.float64(0.17)), ('Hypertension', np.float64(0.11076703825929837))]
Case 2: [('Asthma', np.float64(0.38333333333333336)), ('Anxiety Disorders', np.float64(0.11666666666666667)), ('Migraine', np.float64(0.08666666666666667))]
Case 3: [('Bronchitis', np.float64(0.3466666666666667)), ('Common Cold', np.float64(0.14666666666666667)), ('Asthma', np.float64(0.09))]


In [91]:
random_input = {
    col: np.random.choice(X[col].unique())
    for col in X.columns
}

print("Random prediction:", predict_top_3(random_input))


Random prediction: [('Bronchitis', np.float64(0.24666666666666667)), ('Asthma', np.float64(0.23666666666666666)), ('Pneumonia', np.float64(0.16333333333333333))]


In [92]:
import joblib

joblib.dump(model, "disease_prediction_model.pkl")
joblib.dump(label_encoder, "disease_label_encoder.pkl")

print("Model saved successfully ðŸ’¾")


Model saved successfully ðŸ’¾


In [93]:
import joblib

model = joblib.load("disease_prediction_model.pkl")
label_encoder = joblib.load("disease_label_encoder.pkl")

print("âœ… Model loaded")
print("âœ… Encoder loaded")
print("Disease classes:", label_encoder.classes_)


âœ… Model loaded
âœ… Encoder loaded
Disease classes: ['Allergic Rhinitis' 'Anxiety Disorders' 'Asthma' 'Bronchitis'
 'Common Cold' 'Depression' 'Diabetes' 'Eczema' 'Gastroenteritis'
 'Hypertension' 'Hyperthyroidism' 'Hypothyroidism' 'Influenza'
 'Kidney Cancer' 'Liver Cancer' 'Migraine' 'Osteoarthritis' 'Osteoporosis'
 'Pneumonia' 'Rheumatoid Arthritis' 'Stroke']


In [94]:
print("Number of input columns:", len(model.feature_names_in_))
print(model.feature_names_in_[:10])  # show first 10


Number of input columns: 9
['Fever' 'Cough' 'Fatigue' 'Difficulty Breathing' 'Age' 'Gender'
 'Blood Pressure' 'Cholesterol Level' 'Outcome Variable']
