In [93]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [94]:
def load_and_preprocess_data(file_path, target_column='Disease', numerical_cols=('Dosage (mg)',)):
    """
    Load a CSV dataset and convert it into a transformed feature matrix and a target.

    Column names are stripped of surrounding whitespace, the target column is
    separated from the features, the numerical columns are standardised with
    ``StandardScaler`` and every other feature column is one-hot encoded.

    The transformers are fitted from scratch on every call, using all rows of
    the file. The learned scaling statistics and one-hot categories therefore
    belong to that file only.

    Parameters
    ----------
    file_path : str or path-like
        Location of the CSV file, passed to ``pandas.read_csv``.
    target_column : str, default 'Disease'
        Name of the column holding the class labels.
    numerical_cols : str or iterable of str, default ('Dosage (mg)',)
        Feature columns to scale; all remaining feature columns are treated
        as categorical.

    Returns
    -------
    X_preprocessed
        Output of ``ColumnTransformer.fit_transform`` on the feature columns.
    y : pandas.Series
        The untouched target column.

    Raises
    ------
    KeyError
        If the target column or any numerical column is absent from the file.
    """
    df = pd.read_csv(file_path)
    df.columns = df.columns.str.strip()  # Clean column names

    # Fail early with a clear message when the target is missing
    if target_column not in df.columns:
        raise KeyError(f"Column '{target_column}' not found in dataset.")

    # Separate features and target
    X = df.drop(target_column, axis=1)
    y = df[target_column]

    # Accept a single column name as well as a collection of names
    if isinstance(numerical_cols, str):
        numerical_cols = [numerical_cols]
    else:
        numerical_cols = list(numerical_cols)

    # Validate here so the error names the column instead of surfacing
    # from inside the ColumnTransformer
    missing = [col for col in numerical_cols if col not in X.columns]
    if missing:
        raise KeyError(f"Numerical column(s) {missing} not found in dataset.")

    categorical_cols = [col for col in X.columns if col not in numerical_cols]

    # Create preprocessing pipelines
    numerical_transformer = Pipeline(steps=[
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ]
    )

    # Apply transformations
    X_preprocessed = preprocessor.fit_transform(X)

    return X_preprocessed, y


In [95]:
from sklearn.model_selection import train_test_split

def split_data(X, y, test_size=0.2, random_state=42):
    """
    Partition features and target into training and test subsets.

    Thin wrapper around ``train_test_split`` that forwards ``test_size`` and
    ``random_state`` and returns the four pieces as a tuple in the order
    ``(X_train, X_test, y_train, y_test)``.
    """
    return tuple(
        train_test_split(
            X,
            y,
            test_size=test_size,
            random_state=random_state,
        )
    )


In [96]:
def train_random_forest(X_train, y_train):
    """
    Tune a Random Forest classifier with an exhaustive grid search.

    The search covers 24 hyper-parameter combinations (number of trees, tree
    depth, and the two minimum-sample constraints), scores each with 5-fold
    cross-validated accuracy, and runs on all available cores (``n_jobs=-1``).
    The winning parameter set is printed.

    Returns
    -------
    The ``best_estimator_`` of the fitted ``GridSearchCV``.
    """
    base_forest = RandomForestClassifier(random_state=42)

    search_space = dict(
        n_estimators=[100, 200],
        max_depth=[None, 10, 20],
        min_samples_split=[2, 5],
        min_samples_leaf=[1, 2],
    )

    tuner = GridSearchCV(
        estimator=base_forest,
        param_grid=search_space,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)

    print(f"Best Parameters: {tuner.best_params_}")

    return tuner.best_estimator_

In [97]:
# Function to evaluate the model
def evaluate_model(model, X_val, y_val, dataset_type="Validation"):
    """
    Score a fitted classifier on labelled data and print the outcome.

    Parameters
    ----------
    model
        Object exposing ``predict``.
    X_val, y_val
        Features and true labels to evaluate against.
    dataset_type : str, default "Validation"
        Label used as a prefix in the printed lines.

    Returns
    -------
    tuple
        ``(accuracy, report)`` where ``accuracy`` is the value returned by
        ``accuracy_score`` and ``report`` is the text produced by
        ``classification_report``.
    """
    predictions = model.predict(X_val)

    accuracy = accuracy_score(y_val, predictions)
    report = classification_report(y_val, predictions)

    # One print call, one line per item: same output as three separate prints
    print(
        f"{dataset_type} Accuracy: {accuracy * 100:.2f}%",
        f"{dataset_type} Classification Report:",
        report,
        sep="\n",
    )

    return accuracy, report

In [98]:
# Function to display feature importance
def display_feature_importance(model, feature_names):
    """
    Print the model's feature importances, largest first.

    Parameters
    ----------
    model
        Object exposing a ``feature_importances_`` attribute.
    feature_names
        Names to pair with the importances; used as the 'Feature' column
        alongside the 'Importance' column.
    """
    ranking = pd.DataFrame(
        {
            'Feature': feature_names,
            'Importance': model.feature_importances_,
        }
    ).sort_values(by='Importance', ascending=False)
    print(ranking)

In [99]:
def random_forest_with_regularization(
    data_path, target_column, test_size=0.2, cv_folds=5,
    n_estimators=100, max_depth=10, min_samples_split=10,
    min_samples_leaf=4, random_state=42
):
    """
    Train a depth- and leaf-constrained Random Forest and report its accuracy.

    The dataset is loaded through ``load_and_preprocess_data``, split into
    train/test parts, cross-validated on the training part, fitted on the
    full training part and finally scored on the held-out test part.

    Parameters
    ----------
    data_path : str or path-like
        CSV file handed to ``load_and_preprocess_data``.
    target_column : str
        Expected name of the label column. The loader is called without it
        and uses 'Disease'; any other value triggers a ``UserWarning`` so
        the mismatch is not silent.
    test_size : float, default 0.2
        Fraction of rows held out for the final test.
    cv_folds : int, default 5
        ``cv`` argument for ``cross_val_score``.
    n_estimators, max_depth, min_samples_split, min_samples_leaf
        Forwarded to ``RandomForestClassifier``.
    random_state : int, default 42
        Seed used for both the split and the classifier.

    Returns
    -------
    dict
        ``'cv_accuracy'`` – mean cross-validation score times 100,
        ``'test_accuracy'`` – test-set accuracy times 100,
        ``'classification_report'`` – ``classification_report`` output as a
        dict (``output_dict=True``).
    """
    import warnings  # local import: keeps this cell independent of the import cell

    # The loader call below does not receive target_column, so surface a
    # request for any other column instead of quietly training on 'Disease'.
    if target_column != 'Disease':
        warnings.warn(
            f"target_column={target_column!r} is ignored; "
            "the 'Disease' column is used as the target.",
            UserWarning,
            stacklevel=2,
        )

    # Load and preprocess the dataset
    X, y = load_and_preprocess_data(data_path)

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Initialize the Random Forest Classifier with regularization parameters
    rf_classifier = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=random_state
    )

    # Cross-validate on the training part only; the test part stays unseen
    cv_scores = cross_val_score(
        rf_classifier, X_train, y_train, cv=cv_folds
    )
    cv_accuracy = cv_scores.mean() * 100

    # Train the model
    rf_classifier.fit(X_train, y_train)

    # Evaluate the model on the test set
    y_pred = rf_classifier.predict(X_test)
    test_accuracy = accuracy_score(y_test, y_pred) * 100
    classification_rep = classification_report(y_test, y_pred, output_dict=True)

    return {
        'cv_accuracy': cv_accuracy,
        'test_accuracy': test_accuracy,
        'classification_report': classification_rep
    }


In [100]:
# # Main execution function to handle everything
# def main():
#     # File paths of the datasets
#     train_file_path = 'synthetic_detailed_ayurveda_dataset.csv'  # Detailed dataset for training
#     test_file_path = 'synthetic_ayurveda_dataset.csv'  # Simpler dataset for validation/testing
    
#     # Step 1: Load and preprocess both datasets
#     df_train = load_and_preprocess_data(train_file_path)
#     df_test = load_and_preprocess_data(test_file_path)
    
#     # Step 2: Split data (training from detailed, testing from simpler dataset)
#     X_train, X_test, y_train, y_test = split_data(df_train, df_test, target_column='Disease')
    
#     # Step 3: Train Random Forest model using the training dataset
#     rf_classifier = train_random_forest(X_train, y_train)
    
#     # Step 4: Evaluate model on the validation/test dataset
#     evaluate_model(rf_classifier, X_test, y_test, dataset_type="Test")
    
#     # Step 5: Display feature importance from the trained model
#     display_feature_importance(rf_classifier, X_train.columns)

In [101]:
def main():
    """
    Train a grid-searched Random Forest on the detailed dataset and print
    its accuracy on a 20% hold-out split of that same file.
    """
    train_file_path = 'detailed_dataset.csv'

    # Load and preprocess data
    features, target = load_and_preprocess_data(train_file_path)

    # Split data. The evaluation set comes from this file only: a second
    # file loaded through load_and_preprocess_data would be transformed by
    # its own freshly fitted preprocessor, and its result was never used.
    X_train, X_test, y_train, y_test = split_data(
        features, target, test_size=0.2, random_state=42
    )

    # Train Random Forest model
    rf_classifier = train_random_forest(X_train, y_train)

    # Evaluate the model
    evaluate_model(rf_classifier, X_test, y_test)


In [102]:
# Run the regularised Random Forest experiment on the detailed dataset
results = random_forest_with_regularization(
    data_path='synthetic_detailed_ayurveda_dataset.csv',
    target_column='Disease',
    test_size=0.2, cv_folds=5,
    n_estimators=100, max_depth=10,
    min_samples_split=10, min_samples_leaf=4,
    random_state=42
)

# Display results: one item per output line
print(
    f"Cross-Validation Accuracy: {results['cv_accuracy']:.2f}%",
    f"Test Accuracy: {results['test_accuracy']:.2f}%",
    "Test Classification Report:",
    results['classification_report'],
    sep="\n",
)

Cross-Validation Accuracy: 11.70%
Test Accuracy: 10.80%
Test Classification Report:
{'Arthritis': {'precision': 0.125, 'recall': 0.06862745098039216, 'f1-score': 0.08860759493670886, 'support': 102}, 'Diabetes': {'precision': 0.0851063829787234, 'recall': 0.04040404040404041, 'f1-score': 0.054794520547945216, 'support': 99}, 'Digestive Disorders': {'precision': 0.09042553191489362, 'recall': 0.14912280701754385, 'f1-score': 0.11258278145695365, 'support': 114}, 'Hypertension': {'precision': 0.10404624277456648, 'recall': 0.1651376146788991, 'f1-score': 0.1276595744680851, 'support': 109}, 'Inflammation': {'precision': 0.14864864864864866, 'recall': 0.08661417322834646, 'f1-score': 0.10945273631840798, 'support': 127}, 'Liver Disorders': {'precision': 0.1111111111111111, 'recall': 0.08461538461538462, 'f1-score': 0.09606986899563319, 'support': 130}, 'Mental Stress': {'precision': 0.11235955056179775, 'recall': 0.18867924528301888, 'f1-score': 0.14084507042253522, 'support': 106}, 'Resp