In [1]:
# Cell 1 — Imports & settings
import os
import pandas as pd
import numpy as np
import joblib
import warnings
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier

# Single seed reused for the train/test split, CV shuffling, model construction and the randomized search.
RANDOM_STATE = 42
# Folder (relative to the working directory) that receives every saved .pkl artifact.
ARTIFACTS_DIR = "artifacts_models"
# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(ARTIFACTS_DIR, exist_ok=True)


In [2]:
# Cell 2 — Medicine system -> cleaned CSV file (edit the paths if the files live in another folder)
datasets = dict(
    allopathy="allopathy_clean.csv",
    ayurveda="ayurveda_clean.csv",
    homeopathy="homeopathy_clean.csv",
)

# Echo the configuration so the run log records which files were used.
print("Will train models for these datasets:")
for name, path in datasets.items():
    print(f" - {name}: {path}")


Will train models for these datasets:
 - allopathy: allopathy_clean.csv
 - ayurveda: ayurveda_clean.csv
 - homeopathy: homeopathy_clean.csv


In [3]:
# Cell 3 — Define a function that trains for a single dataset and saves artifacts
def train_and_save_for_dataset(name, csv_path):
    """Train, select, evaluate and persist a medicine classifier for one dataset.

    Steps:
      1. Read ``csv_path`` and build a per-medicine lookup table of display columns.
      2. Use Disease_Name / Disease_Severity / Age_Group (cast to str) as features
         and the label-encoded Medicine_Name as target.
      3. Hold out 20% of the rows (stratified) as a test set.
      4. Score each candidate model with 5-fold stratified CV on the training split
         and keep the one with the highest mean accuracy.
      5. Hand the winner to ``tune_model``; if that returns None, fit an untuned
         pipeline on the training split instead.
      6. Print test accuracy and a classification report, then dump the pipeline,
         its fitted preprocessor, the label encoder and the lookup table into
         ``ARTIFACTS_DIR``.

    Parameters
    ----------
    name : str
        Dataset label; used in log lines and in the artifact file names.
    csv_path : str
        Path of the CSV to load with ``pd.read_csv``.

    Returns
    -------
    dict
        Keys: "name", "best_model", "test_accuracy", "model_file", "le_file",
        "preproc_file", "lookup_file".

    Raises
    ------
    RuntimeError
        If every candidate model fails during cross-validation.
    """
    print(f"\n=== Processing: {name} ===")
    df = pd.read_csv(csv_path)
    print("Loaded shape:", df.shape)

    # Keep a display lookup table (unique medicine -> display info)
    lookup_cols = ["Medicine_Name","Dosage_Form","Recommended_Dosage","Treatment_Duration","Precautions","Medicine_Approval_Status"]
    lookup = df[lookup_cols].drop_duplicates(subset=["Medicine_Name"]).reset_index(drop=True)

    # FEATURES and TARGET (final chosen features)
    X = df[["Disease_Name","Disease_Severity","Age_Group"]].astype(str)  # all as strings
    y = df["Medicine_Name"].astype(str)

    # Encode target (Medicine_Name) with LabelEncoder and save it
    le = LabelEncoder()
    y_enc = le.fit_transform(y)

    # Preprocessor:
    # - OneHot for Disease_Name and Age_Group (nominal)
    # - OrdinalEncoder for Disease_Severity with explicit order
    severity_order = [["Mild","Moderate","Severe"]]
    preprocessor = ColumnTransformer(
        transformers = [
            ("ohe", OneHotEncoder(handle_unknown="ignore", sparse_output=False), ["Disease_Name","Age_Group"]),
            ("ord", OrdinalEncoder(categories=severity_order), ["Disease_Severity"])
        ],
        remainder="drop"
    )

    # Candidate models (you can add/tune more)
    models = {
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=RANDOM_STATE),
        "RandomForest": RandomForestClassifier(n_estimators=200, random_state=RANDOM_STATE, n_jobs=-1),
        "KNN": KNeighborsClassifier(n_neighbors=7, n_jobs=-1),
        "NaiveBayes": GaussianNB()
    }

    # Split into train/test (stratify by encoded target)
    X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.20, random_state=RANDOM_STATE, stratify=y_enc)
    print("Train/Test sizes:", X_train.shape, X_test.shape)

    # Cross-validate candidates on the training set; preprocessing lives inside the
    # pipeline so it is re-fitted on each fold.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    cv_scores = {}
    for mname, m in models.items():
        pipe = Pipeline([("preproc", preprocessor), ("model", m)])
        try:
            # error_score="raise": with the default (NaN) a fold failure would not reach
            # the except below and would leave a NaN mean that corrupts the max() selection.
            scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="accuracy", n_jobs=-1, error_score="raise")
        except Exception as e:
            print(f"{mname:12s} ERROR during CV: {e}")
            continue
        cv_scores[mname] = scores
        print(f"{mname:12s} CV mean: {scores.mean():.4f} ± {scores.std():.4f}")

    if not cv_scores:
        raise RuntimeError(f"All candidate models failed cross-validation for dataset '{name}'; no model can be selected.")

    # Select best model by mean CV accuracy (first candidate wins ties)
    best_name = max(cv_scores, key=lambda mname: cv_scores[mname].mean())
    print("Selected best model (by CV mean):", best_name)

    # Optional: fine-tune before final fit; tune_model returns None when it has no
    # search space for the selected model.
    final_pipe = tune_model(best_name, preprocessor, X_train, y_train)
    if final_pipe is None:
        final_pipe = Pipeline([("preproc", preprocessor), ("model", models[best_name])])
        final_pipe.fit(X_train, y_train)

    # Evaluate on test set
    y_pred = final_pipe.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"Test accuracy for {name}: {acc:.4f}")
    print("Classification report (test):")
    print(classification_report(y_test, y_pred, zero_division=0))

    # Save artifacts: pipeline (includes preprocessor & model), label encoder, lookup
    model_file = os.path.join(ARTIFACTS_DIR, f"model_{name}.pkl")
    le_file = os.path.join(ARTIFACTS_DIR, f"label_encoder_{name}.pkl")
    preproc_file = os.path.join(ARTIFACTS_DIR, f"preprocessor_{name}.pkl")
    lookup_file = os.path.join(ARTIFACTS_DIR, f"lookup_{name}.pkl")

    # Save whole pipeline (preprocessor + model) for easy inference
    joblib.dump(final_pipe, model_file)
    # Save the preprocessor separately too (in case you want only preprocessing).
    # Take it from the fitted pipeline: the search and CV helpers fit clones, so the
    # local `preprocessor` object may never have been fitted.
    joblib.dump(final_pipe.named_steps["preproc"], preproc_file)
    # Save label encoder for decoding predicted class index -> medicine name
    joblib.dump(le, le_file)
    # Save lookup table for display information
    joblib.dump(lookup, lookup_file)

    print("Saved artifacts:")
    print(" - pipeline:", model_file)
    print(" - preprocessor:", preproc_file)
    print(" - label encoder:", le_file)
    print(" - lookup:", lookup_file)

    # Return some useful info
    return {
        "name": name,
        "best_model": best_name,
        "test_accuracy": acc,
        "model_file": model_file,
        "le_file": le_file,
        "preproc_file": preproc_file,
        "lookup_file": lookup_file
    }

In [4]:
from sklearn.model_selection import RandomizedSearchCV

def tune_model(best_name, preprocessor, X_train, y_train):
    """Run a randomized hyper-parameter search for the selected model.

    Only "XGBoost" and "RandomForest" have a search space. For any other
    ``best_name`` a notice is printed and None is returned so the caller can
    fall back to an untuned fit.

    Parameters
    ----------
    best_name : str
        Key of the model chosen by cross-validation.
    preprocessor : transformer
        Placed as the "preproc" step in front of the model.
    X_train, y_train
        Training features and encoded labels passed to ``search.fit``.

    Returns
    -------
    The search's ``best_estimator_`` (a pipeline with "preproc" and "model"
    steps), or None when no tuning is configured.
    """
    # Guard clause: nothing to tune for the remaining candidates.
    if best_name not in ("XGBoost", "RandomForest"):
        print("No tuning configured for this model:", best_name)
        return None

    if best_name == "XGBoost":
        base_model = XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=RANDOM_STATE)
        search_space = {
            "n_estimators": [100, 200, 300],
            "learning_rate": [0.01, 0.05, 0.1, 0.2],
            "max_depth": [3, 4, 5, 6],
            "subsample": [0.7, 0.8, 0.9, 1.0],
            "colsample_bytree": [0.7, 0.8, 1.0]
        }
    else:  # "RandomForest"
        base_model = RandomForestClassifier(random_state=RANDOM_STATE, n_jobs=-1)
        search_space = {
            "n_estimators": [100, 200, 300, 500],
            "max_depth": [5, 10, 15, None],
            "min_samples_split": [2, 5, 10],
            "min_samples_leaf": [1, 2, 4],
            "bootstrap": [True, False]
        }

    # Parameters must be addressed through the pipeline step name ("model").
    prefixed_space = {}
    for param, candidates in search_space.items():
        prefixed_space["model__" + param] = candidates

    tuning_pipe = Pipeline([("preproc", preprocessor), ("model", base_model)])
    search = RandomizedSearchCV(
        estimator=tuning_pipe,
        param_distributions=prefixed_space,
        n_iter=25,
        cv=5,
        scoring="accuracy",
        verbose=1,
        n_jobs=-1,
        random_state=RANDOM_STATE
    )
    search.fit(X_train, y_train)

    print("Best parameters:", search.best_params_)
    print("Best cross-val accuracy:", search.best_score_)
    return search.best_estimator_


In [5]:
# Cell 4 — Execute training for each dataset and collect results

# Fail fast: check every input file before any training starts, so a missing CSV
# is reported immediately instead of after earlier datasets have been trained and tuned.
missing_paths = [csv_path for csv_path in datasets.values() if not os.path.exists(csv_path)]
if missing_paths:
    raise FileNotFoundError(f"{', '.join(missing_paths)} not found in working directory.")

# Maps dataset name -> info dict returned by train_and_save_for_dataset
results = {}
for sys_name, csv_path in datasets.items():
    results[sys_name] = train_and_save_for_dataset(sys_name, csv_path)

# Print summary
print("\n=== Summary ===")
for name, info in results.items():
    print(f"{name}: model={info['best_model']}, test_acc={info['test_accuracy']:.4f}, model_file={info['model_file']}")



=== Processing: allopathy ===
Loaded shape: (1100, 9)
Train/Test sizes: (880, 3) (220, 3)
XGBoost      CV mean: 0.6966 ± 0.0099
RandomForest CV mean: 0.6898 ± 0.0085
KNN          CV mean: 0.6920 ± 0.0174
NaiveBayes   CV mean: 0.4011 ± 0.0464
Selected best model (by CV mean): XGBoost
Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best parameters: {'model__subsample': 0.9, 'model__n_estimators': 200, 'model__max_depth': 6, 'model__learning_rate': 0.01, 'model__colsample_bytree': 0.8}
Best cross-val accuracy: 0.7090909090909092
Test accuracy for allopathy: 0.7136
Classification report (test):
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         2
           1       0.71      1.00      0.83        17
           2       0.00      0.00      0.00         2
           3       0.00      0.00      0.00         3
           4       0.00      0.00      0.00         3
           5       0.00      0.00      0.00         4
        

In [6]:
# Cell 5 — Example: load one saved pipeline, label encoder and lookup table, then predict a few samples
from pprint import pprint

# Set to 'allopathy', 'ayurveda' or 'homeopathy' to pick which saved model to test
sys_to_test = "homeopathy"

# NOTE: joblib.load unpickles arbitrary objects — only load artifacts you produced yourself or trust.
pipe = joblib.load(os.path.join(ARTIFACTS_DIR, f"model_{sys_to_test}.pkl"))
le = joblib.load(os.path.join(ARTIFACTS_DIR, f"label_encoder_{sys_to_test}.pkl"))
lookup = joblib.load(os.path.join(ARTIFACTS_DIR, f"lookup_{sys_to_test}.pkl"))

samples = [
    {"Disease_Name":"Fever", "Disease_Severity":"Moderate", "Age_Group":"Adult"},
    {"Disease_Name":"Asthma", "Disease_Severity":"Severe", "Age_Group":"Child"},
    {"Disease_Name":"Diabetes", "Disease_Severity":"Mild", "Age_Group":"Elderly"},
]

# Categories the one-hot encoder learned at fit time. The encoder was built with
# handle_unknown="ignore", so a value outside these sets is encoded as all zeros
# without any error — surface that to the reader instead of failing silently.
ohe = pipe.named_steps["preproc"].named_transformers_["ohe"]
known_values = {col: set(cats) for col, cats in zip(["Disease_Name", "Age_Group"], ohe.categories_)}

# Cast to str exactly as the training features were, and predict all samples in one call
X_samples = pd.DataFrame(samples).astype(str)
pred_meds = le.inverse_transform(pipe.predict(X_samples).astype(int))  # decode

for s, pred_med in zip(samples, pred_meds):
    unseen = {col: s[col] for col, seen in known_values.items() if str(s[col]) not in seen}
    rec = lookup[lookup["Medicine_Name"] == pred_med]
    display_info = rec.to_dict(orient="records")[0] if not rec.empty else {}
    print("Input:", s)
    if unseen:
        print("WARNING: values not seen during training (ignored by the encoder):", unseen)
    print("Predicted Medicine:", pred_med)
    print("Display info (sample):")
    pprint(display_info)
    print("-" * 60)


Input: {'Disease_Name': 'Fever', 'Disease_Severity': 'Moderate', 'Age_Group': 'Adult'}
Predicted Medicine: Belladonna
Display info (sample):
{'Dosage_Form': 'Syrup',
 'Medicine_Approval_Status': 'HPUS Listed',
 'Medicine_Name': 'Belladonna',
 'Precautions': 'Do not exceed dosage',
 'Recommended_Dosage': '1 tsp twice daily',
 'Treatment_Duration': 'Continuous'}
------------------------------------------------------------
Input: {'Disease_Name': 'Asthma', 'Disease_Severity': 'Severe', 'Age_Group': 'Child'}
Predicted Medicine: Belladonna
Display info (sample):
{'Dosage_Form': 'Syrup',
 'Medicine_Approval_Status': 'HPUS Listed',
 'Medicine_Name': 'Belladonna',
 'Precautions': 'Do not exceed dosage',
 'Recommended_Dosage': '1 tsp twice daily',
 'Treatment_Duration': 'Continuous'}
------------------------------------------------------------
Input: {'Disease_Name': 'Diabetes', 'Disease_Severity': 'Mild', 'Age_Group': 'Elderly'}
Predicted Medicine: Belladonna
Display info (sample):
{'Dosage_Fo