In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# --- Configuration & Setup ---
FILE_NAME = 'synthetic_medical_symptoms_dataset.csv'
TARGET_COLUMN = 'diagnosis'
RANDOM_SEED = 42 # For Reliability NFR

# --- 1. Data Preprocessing and Feature Engineering Module ---
def preprocess_data(df):
    """Encodes categorical features and scales numerical features."""
    # 1. Handle Categorical Target Variable
    le = LabelEncoder()
    df[TARGET_COLUMN] = le.fit_transform(df[TARGET_COLUMN])

    # Save the mapping for interpretation
    diagnosis_map = dict(zip(df[TARGET_COLUMN], le.classes_))

    # 2. Handle Categorical Feature (Gender)
    df = pd.get_dummies(df, columns=['gender'], drop_first=True)

    # 3. Separate features (X) and target (y)
    X = df.drop(TARGET_COLUMN, axis=1)
    y = df[TARGET_COLUMN]

    # 4. Identify numerical columns for scaling
    numerical_cols = X.select_dtypes(include=np.number).columns

    # 5. Apply Scaling (MinMaxScaler)
    scaler = MinMaxScaler()
    X[numerical_cols] = scaler.fit_transform(X[numerical_cols])

    return X, y, scaler, le, diagnosis_map

# --- 2. Model Training and Evaluation Module ---
def train_and_evaluate_model(X, y):
    """Splits data, trains the model, and prints evaluation metrics."""
    # Split data into training and testing sets (80/20 split)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=RANDOM_SEED, stratify=y
    )

    # Initialize the Random Forest Classifier
    # A robust model often used for classification tasks
    model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_SEED)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, output_dict=True)

    print(f"✅ Model Training Complete (Random Forest Classifier)")
    print("-" * 50)
    print(f"Accuracy on Test Set: {accuracy:.4f}")
    print("-" * 50)
    print("Classification Report:")
    # Convert report to DataFrame for cleaner display
    report_df = pd.DataFrame(report).transpose()
    print(report_df.to_markdown(numalign="left", stralign="left", floatfmt=".3f"))

    # Check Performance NFR
    if accuracy >= 0.85:
        print("\n✨ Non-Functional Requirement (Performance > 85% Accuracy) MET!")
    else:
        print("\n⚠️ Non-Functional Requirement (Performance > 85% Accuracy) FAILED!")

    return model, X_train.columns

# --- 3. Prediction Interface Module (Demonstration) ---
def predict_new_sample(model, scaler, le, diagnosis_map, feature_columns, raw_input_data):
    """
    Simulates prediction on a single new patient record.
    raw_input_data should be a dictionary matching all columns EXCEPT 'diagnosis'.
    """
    print("\n--- 3. Prediction Interface Test ---")

    # Create a DataFrame from the raw input
    new_data = pd.DataFrame([raw_input_data])

    # Apply Gender Encoding (Manual for single sample, matching training columns)
    new_data['gender_Male'] = 1 if new_data['gender'].iloc[0] == 'Male' else 0
    new_data = new_data.drop('gender', axis=1)

    # Ensure all required columns are present (important for dummy variables)
    missing_cols = set(feature_columns) - set(new_data.columns)
    for c in missing_cols:
        new_data[c] = 0 # Assume 0 if not explicitly in input (e.g., 'gender_Female' is implicit)

    # Reorder columns to match the order used during training
    new_data = new_data[feature_columns]

    # Apply Scaling
    numerical_cols = new_data.select_dtypes(include=np.number).columns
    new_data[numerical_cols] = scaler.transform(new_data[numerical_cols])

    # Make the prediction
    prediction_encoded = model.predict(new_data)[0]
    prediction_proba = model.predict_proba(new_data)[0]

    predicted_diagnosis = diagnosis_map[prediction_encoded]
    confidence = prediction_proba[prediction_encoded]

    print("Input Patient Data (Partial):")
    print(f"Age: {raw_input_data['age']}, Gender: {raw_input_data['gender']}, Fever: {raw_input_data['fever']}, Temperature: {raw_input_data['temperature_c']}°C")
    print(f"\nPredicted Diagnosis: {predicted_diagnosis}")
    print(f"Confidence Score: {confidence:.2f}")

# --- Main Execution ---
try:
    # Load the dataset
    df = pd.read_csv(FILE_NAME)

    # Preprocessing
    X_processed, y_encoded, scaler_fitted, le_fitted, diagnosis_map = preprocess_data(df.copy())

    # Training and Evaluation
    rf_model, fitted_feature_columns = train_and_evaluate_model(X_processed, y_encoded)

    # Example test from a patient in the dataset (Row 1 for Dengue)
    sample_data = df.iloc[0].drop(TARGET_COLUMN).to_dict()
    predict_new_sample(rf_model, scaler_fitted, le_fitted, diagnosis_map, fitted_feature_columns, sample_data)

except FileNotFoundError:
    print(f"Error: The file {FILE_NAME} was not found.")
except Exception as e:
    print(f"An error occurred during ML pipeline execution: {e}")

✅ Model Training Complete (Random Forest Classifier)
--------------------------------------------------
Accuracy on Test Set: 0.2400
--------------------------------------------------
Classification Report:
|              | precision   | recall   | f1-score   | support   |
|:-------------|:------------|:---------|:-----------|:----------|
| 0            | 0.222       | 0.310    | 0.259      | 142.000   |
| 1            | 0.231       | 0.242    | 0.237      | 128.000   |
| 2            | 0.264       | 0.400    | 0.318      | 150.000   |
| 3            | 0.278       | 0.054    | 0.090      | 93.000    |
| 4            | 0.174       | 0.046    | 0.073      | 87.000    |
| accuracy     | 0.240       | 0.240    | 0.240      | 0.240     |
| macro avg    | 0.234       | 0.210    | 0.195      | 600.000   |
| weighted avg | 0.236       | 0.240    | 0.216      | 600.000   |

⚠️ Non-Functional Requirement (Performance > 85% Accuracy) FAILED!

--- 3. Prediction Interface Test ---
An error occurred