In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import time

def load_and_preprocess_data(file_path, preprocessor=None):
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()  # Clean column names

    # Check if 'Disease' column exists
    if 'Disease' not in df.columns:
        raise KeyError("Column 'Disease' not found in dataset.")

    # Create a new feature: SymptomSeverity
    df['SymptomSeverity'] = df['Symptoms/Condition'].apply(lambda x: len(x.split()))

    # Separate features and target
    X = df.drop('Disease', axis=1)
    y = df['Disease']

    # Define numerical and categorical columns
    numerical_cols = ['Dosage (mg)', 'SymptomSeverity']
    categorical_cols = [col for col in X.columns if col not in numerical_cols]

    # Create preprocessing pipelines
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    if preprocessor is None:
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numerical_transformer, numerical_cols),
                ('cat', categorical_transformer, categorical_cols)
            ]
        )
        # Fit the preprocessor on the data
        X_preprocessed = preprocessor.fit_transform(X)
    else:
        # Apply the existing preprocessor
        X_preprocessed = preprocessor.transform(X)

    return X_preprocessed, y, preprocessor

def split_data(X, y, test_size=0.2, random_state=42):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
    return X_train, X_test, y_train, y_test

def train_random_forest(X_train, y_train):
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10],
        'min_samples_split': [2, 5],
        'min_samples_leaf': [1, 2],
        'criterion': ['gini'],
        'max_features': ['auto']
    }

    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, scoring='accuracy', verbose=3)
    grid_search.fit(X_train, y_train)

    print(f"Best Parameters: {grid_search.best_params_}")

    return grid_search.best_estimator_

# Function to evaluate the model
def evaluate_model(model, X, y, dataset_type="Validation"):
    y_pred = model.predict(X)
    accuracy = accuracy_score(y, y_pred)
    report = classification_report(y, y_pred)

    print(f"{dataset_type} Accuracy: {accuracy * 100:.2f}%")
    print(f"{dataset_type} Classification Report:")
    print(report)

    return accuracy, report

# Function to display feature importance
def display_feature_importance(model, feature_names):
    importances = model.feature_importances_
    feature_importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    print(feature_importance_df.sort_values(by='Importance', ascending=False))

def main():
    start_time = time.time()  # Start time for profiling

    # File paths of the datasets
    train_file_path = 'synthetic_detailed_ayurveda_dataset.csv'  # Detailed dataset for training
    test_file_path = 'synthetic_ayurveda_dataset.csv'  # Simpler dataset for validation/testing

    # Step 1: Load and preprocess the training dataset
    X_train, y_train, preprocessor = load_and_preprocess_data(train_file_path)

    # Step 2: Split training data into training and validation sets
    X_train, X_val, y_train, y_val = split_data(X_train, y_train)

    # Step 3: Train Random Forest model using the training dataset
    rf_classifier = train_random_forest(X_train, y_train)

    # Step 4: Load and preprocess the test dataset using the existing preprocessor
    X_test, y_test, _ = load_and_preprocess_data(test_file_path, preprocessor)

    # Step 5: Evaluate the model on the validation set
    evaluate_model(rf_classifier, X_val, y_val, dataset_type="Validation")

    # Step 6: Evaluate the model on the test set
    evaluate_model(rf_classifier, X_test, y_test, dataset_type="Test")

    # Step 7: Display feature importance from the trained model
    feature_names = [f"Feature {i}" for i in range(X_train.shape[1])]  # Replace with actual feature names if available
    display_feature_importance(rf_classifier, feature_names)

    end_time = time.time()  # End time for profiling
    print(f"Execution Time: {end_time - start_time:.2f} seconds")

if __name__ == "__main__":
    main()


Fitting 3 folds for each of 16 candidates, totalling 48 fits


  warn(


Best Parameters: {'criterion': 'gini', 'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Validation Accuracy: 13.00%
Validation Classification Report:
                     precision    recall  f1-score   support

          Arthritis       0.18      0.07      0.10       102
           Diabetes       0.11      0.06      0.08        99
Digestive Disorders       0.12      0.18      0.14       114
       Hypertension       0.12      0.17      0.14       109
       Inflammation       0.15      0.09      0.12       127
    Liver Disorders       0.14      0.12      0.12       130
      Mental Stress       0.13      0.23      0.17       106
 Respiratory Issues       0.12      0.17      0.14       106
      Skin Diseases       0.16      0.08      0.11       107

           accuracy                           0.13      1000
          macro avg       0.14      0.13      0.12      1000
       weighted avg       0.14      0.13      0.12      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
