# In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import io

# Step 1: Import data from local device
from google.colab import files
uploaded = files.upload()  # Use file upload prompt in Colab or replace this with local file path if not using Colab

# Load the dataset. Fail with a clear message when nothing was uploaded
# instead of hitting a NameError on `df` further down.
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")
# When several files are uploaded the last one is used (as before), but it is
# now the only one that gets parsed.
file_name = list(uploaded)[-1]
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Step 2: Check the data structure
print(df.head())  # View the first few rows of the dataset

# Step 3: Clean and preprocess the symptoms data
# Missing symptoms become the literal text 'no symptoms'; commas separating
# symptoms are turned into spaces so every row is a single string.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: str(x).replace(',', ' '))

# Step 4: Encode the target variable (medical conditions)
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

# Step 5: Split the raw text BEFORE vectorizing, so the TF-IDF vocabulary and
# IDF weights are learned from training rows only (no test-set leakage).
symptoms_train, symptoms_test, y_train, y_test = train_test_split(
    df['symptoms'], y, test_size=0.2, random_state=42
)

# Step 6: Vectorize the symptoms using TF-IDF fitted on the training split
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_train = tfidf_vectorizer.fit_transform(symptoms_train)
X_test = tfidf_vectorizer.transform(symptoms_test)
# Full-dataset matrix, kept under its original name for code that reads X.
X = tfidf_vectorizer.transform(df['symptoms'])

# Step 7: Train the Decision Tree Model
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)

# Step 8: Evaluate the Model
y_pred = dt.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Model Accuracy: {accuracy * 100:.2f}%")

# Only classes that occur in y_test or y_pred get a row in the report and the
# confusion matrix. Passing every le_condition.classes_ entry fails (or
# mislabels the axes) as soon as a rare condition is absent from the test
# split, so restrict labels and names to the ones actually present.
eval_labels = sorted(set(y_test) | set(y_pred))
eval_names = le_condition.classes_[eval_labels]

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=eval_labels, target_names=eval_names))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=eval_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=eval_names, yticklabels=eval_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Calculate Precision, Recall, F1 Score: per class, macro and weighted averages
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="macro")
weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted")

# Display Macro and Weighted Average Metrics
print("\nMacro Average Metrics:")
print(f"Precision: {macro_precision:.2f}, Recall: {macro_recall:.2f}, F1 Score: {macro_f1:.2f}")

print("\nWeighted Average Metrics:")
print(f"Precision: {weighted_precision:.2f}, Recall: {weighted_recall:.2f}, F1 Score: {weighted_f1:.2f}")

# Step 9: Make Predictions for New Symptoms
def predict_condition(new_symptoms):
    """Predict the medical condition for the given symptoms with the decision tree.

    Args:
        new_symptoms: An iterable of symptom strings, or a single string.

    Returns:
        A ``(predicted_condition, predicted_confidence)`` tuple: the decoded
        condition label and the probability the tree assigns to that label.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(new_symptoms, str):
        new_symptoms = [new_symptoms]
    symptoms_str = ' '.join(new_symptoms)

    # Vectorize the input symptoms with the already-fitted TF-IDF vectorizer
    symptom_vector = tfidf_vectorizer.transform([symptoms_str])

    # Predict the encoded condition and decode it back to its name
    predicted_condition_idx = dt.predict(symptom_vector)[0]
    predicted_condition = le_condition.inverse_transform([predicted_condition_idx])[0]

    # predict_proba columns are ordered like dt.classes_, which only holds the
    # labels seen during fit; the encoded label is therefore not necessarily
    # its own column index, so look the position up explicitly.
    predicted_prob = dt.predict_proba(symptom_vector)[0]
    class_position = list(dt.classes_).index(predicted_condition_idx)
    predicted_confidence = predicted_prob[class_position]

    return predicted_condition, predicted_confidence

# Demo: run predict_condition on a hand-written symptom list and report the result.
new_symptoms_input = ['night sweats', 'swollen lymph nodes']
predicted_condition, confidence = predict_condition(new_symptoms_input)

disease_line = f"\nPredicted Disease for input symptoms {new_symptoms_input}: {predicted_condition}"
print(disease_line)
confidence_line = f"Prediction Confidence: {confidence:.2f}"
print(confidence_line)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import io

# Step 1: Import data from local device
from google.colab import files
uploaded = files.upload()  # Use file upload prompt in Colab or replace this with local file path if not using Colab

# Load the dataset. Fail with a clear message when nothing was uploaded
# instead of hitting a NameError on `df` further down.
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")
# When several files are uploaded the last one is used (as before), but it is
# now the only one that gets parsed.
file_name = list(uploaded)[-1]
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Step 2: Check the data structure
print(df.head())  # View the first few rows of the dataset

# Step 3: Clean and preprocess the symptoms data
# Missing symptoms become the literal text 'no symptoms'; commas separating
# symptoms are turned into spaces so every row is a single string.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: str(x).replace(',', ' '))

# Step 4: Encode the target variable (medical conditions)
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

# Step 5: Split the raw text BEFORE vectorizing, so the TF-IDF vocabulary and
# IDF weights are learned from training rows only (no test-set leakage).
symptoms_train, symptoms_test, y_train, y_test = train_test_split(
    df['symptoms'], y, test_size=0.2, random_state=42
)

# Step 6: Vectorize the symptoms using TF-IDF fitted on the training split
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_train = tfidf_vectorizer.fit_transform(symptoms_train)
X_test = tfidf_vectorizer.transform(symptoms_test)
# Full-dataset matrix, kept under its original name for code that reads X.
X = tfidf_vectorizer.transform(df['symptoms'])

# Step 7: Train the Random Forest Model
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Step 8: Evaluate the Model
y_pred = rf.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Random Forest Model Accuracy: {accuracy * 100:.2f}%")

# Only classes that occur in y_test or y_pred get a row in the report and the
# confusion matrix. Passing every le_condition.classes_ entry fails (or
# mislabels the axes) as soon as a rare condition is absent from the test
# split, so restrict labels and names to the ones actually present.
eval_labels = sorted(set(y_test) | set(y_pred))
eval_names = le_condition.classes_[eval_labels]

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=eval_labels, target_names=eval_names))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=eval_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=eval_names, yticklabels=eval_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Calculate Precision, Recall, F1 Score: per class, macro and weighted averages
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="macro")
weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted")

# Display Macro and Weighted Average Metrics
print("\nMacro Average Metrics:")
print(f"Precision: {macro_precision:.2f}, Recall: {macro_recall:.2f}, F1 Score: {macro_f1:.2f}")

print("\nWeighted Average Metrics:")
print(f"Precision: {weighted_precision:.2f}, Recall: {weighted_recall:.2f}, F1 Score: {weighted_f1:.2f}")

# Step 9: Make Predictions for New Symptoms
def predict_condition(new_symptoms):
    """Predict the medical condition for the given symptoms with the random forest.

    Args:
        new_symptoms: An iterable of symptom strings, or a single string.

    Returns:
        A ``(predicted_condition, predicted_confidence)`` tuple: the decoded
        condition label and the probability the forest assigns to that label.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(new_symptoms, str):
        new_symptoms = [new_symptoms]
    symptoms_str = ' '.join(new_symptoms)

    # Vectorize the input symptoms with the already-fitted TF-IDF vectorizer
    symptom_vector = tfidf_vectorizer.transform([symptoms_str])

    # Predict the encoded condition and decode it back to its name
    predicted_condition_idx = rf.predict(symptom_vector)[0]
    predicted_condition = le_condition.inverse_transform([predicted_condition_idx])[0]

    # predict_proba columns are ordered like rf.classes_, which only holds the
    # labels seen during fit; the encoded label is therefore not necessarily
    # its own column index, so look the position up explicitly.
    predicted_prob = rf.predict_proba(symptom_vector)[0]
    class_position = list(rf.classes_).index(predicted_condition_idx)
    predicted_confidence = predicted_prob[class_position]

    return predicted_condition, predicted_confidence

# Demo: run predict_condition on a hand-written symptom list and report the result.
new_symptoms_input = ['healthy', 'no symptoms']
predicted_condition, confidence = predict_condition(new_symptoms_input)

disease_line = f"\nPredicted Disease for input symptoms {new_symptoms_input}: {predicted_condition}"
print(disease_line)
confidence_line = f"Prediction Confidence: {confidence:.2f}"
print(confidence_line)



import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import io

# Step 1: Import data from local device
from google.colab import files
uploaded = files.upload()  # Use file upload prompt in Colab or replace this with local file path if not using Colab

# Load the dataset. Fail with a clear message when nothing was uploaded
# instead of hitting a NameError on `df` further down.
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")
# When several files are uploaded the last one is used (as before), but it is
# now the only one that gets parsed.
file_name = list(uploaded)[-1]
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Step 2: Check the data structure
print(df.head())  # View the first few rows of the dataset

# Step 3: Clean and preprocess the symptoms data
# Missing symptoms become the literal text 'no symptoms'; commas separating
# symptoms are turned into spaces so every row is a single string.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: str(x).replace(',', ' '))

# Step 4: Encode the target variable (medical conditions)
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

# Step 5: Split the raw text BEFORE vectorizing, so the TF-IDF vocabulary and
# IDF weights are learned from training rows only (no test-set leakage).
symptoms_train, symptoms_test, y_train, y_test = train_test_split(
    df['symptoms'], y, test_size=0.2, random_state=42
)

# Step 6: Vectorize the symptoms using TF-IDF fitted on the training split
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_train = tfidf_vectorizer.fit_transform(symptoms_train)
X_test = tfidf_vectorizer.transform(symptoms_test)
# Full-dataset matrix, kept under its original name for code that reads X.
X = tfidf_vectorizer.transform(df['symptoms'])

# Step 7: Train the SVM Model
# probability=True is what makes predict_proba available on the fitted model.
svm = SVC(kernel='linear', random_state=42, probability=True)  # Using linear kernel for simplicity
svm.fit(X_train, y_train)

# Step 8: Evaluate the Model
y_pred = svm.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"SVM Model Accuracy: {accuracy * 100:.2f}%")

# Only classes that occur in y_test or y_pred get a row in the report and the
# confusion matrix. Passing every le_condition.classes_ entry fails (or
# mislabels the axes) as soon as a rare condition is absent from the test
# split, so restrict labels and names to the ones actually present.
eval_labels = sorted(set(y_test) | set(y_pred))
eval_names = le_condition.classes_[eval_labels]

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=eval_labels, target_names=eval_names))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=eval_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=eval_names, yticklabels=eval_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Calculate Precision, Recall, F1 Score: per class, macro and weighted averages
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="macro")
weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted")

# Display Macro and Weighted Average Metrics
print("\nMacro Average Metrics:")
print(f"Precision: {macro_precision:.2f}, Recall: {macro_recall:.2f}, F1 Score: {macro_f1:.2f}")

print("\nWeighted Average Metrics:")
print(f"Precision: {weighted_precision:.2f}, Recall: {weighted_recall:.2f}, F1 Score: {weighted_f1:.2f}")

# Step 9: Make Predictions for New Symptoms
def predict_condition(new_symptoms):
    """Predict the medical condition for the given symptoms with the SVM.

    Args:
        new_symptoms: An iterable of symptom strings, or a single string.

    Returns:
        A ``(predicted_condition, predicted_confidence)`` tuple: the decoded
        condition label and the probability the SVM assigns to that label.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(new_symptoms, str):
        new_symptoms = [new_symptoms]
    symptoms_str = ' '.join(new_symptoms)

    # Vectorize the input symptoms with the already-fitted TF-IDF vectorizer
    symptom_vector = tfidf_vectorizer.transform([symptoms_str])

    # Predict the encoded condition and decode it back to its name
    predicted_condition_idx = svm.predict(symptom_vector)[0]
    predicted_condition = le_condition.inverse_transform([predicted_condition_idx])[0]

    # predict_proba columns are ordered like svm.classes_, which only holds the
    # labels seen during fit; the encoded label is therefore not necessarily
    # its own column index, so look the position up explicitly.
    predicted_prob = svm.predict_proba(symptom_vector)[0]
    class_position = list(svm.classes_).index(predicted_condition_idx)
    predicted_confidence = predicted_prob[class_position]

    return predicted_condition, predicted_confidence

# Demo: run predict_condition on a hand-written symptom list and report the result.
new_symptoms_input = ['chronic cough', 'wheezing', 'shortness of breath']
predicted_condition, confidence = predict_condition(new_symptoms_input)

disease_line = f"\nPredicted Disease for input symptoms {new_symptoms_input}: {predicted_condition}"
print(disease_line)
confidence_line = f"Prediction Confidence: {confidence:.2f}"
print(confidence_line)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import io

# Step 1: Import data from local device
from google.colab import files
uploaded = files.upload()  # Use file upload prompt in Colab or replace this with local file path if not using Colab

# Load the dataset. Fail with a clear message when nothing was uploaded
# instead of hitting a NameError on `df` further down.
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")
# When several files are uploaded the last one is used (as before), but it is
# now the only one that gets parsed.
file_name = list(uploaded)[-1]
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Step 2: Check the data structure
print(df.head())  # View the first few rows of the dataset

# Step 3: Clean and preprocess the symptoms data
# Missing symptoms become the literal text 'no symptoms'; commas separating
# symptoms are turned into spaces so every row is a single string.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: str(x).replace(',', ' '))

# Step 4: Encode the target variable (medical conditions)
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

# Step 5: Split the raw text BEFORE vectorizing, so the TF-IDF vocabulary and
# IDF weights are learned from training rows only (no test-set leakage).
symptoms_train, symptoms_test, y_train, y_test = train_test_split(
    df['symptoms'], y, test_size=0.2, random_state=42
)

# Step 6: Vectorize the symptoms using TF-IDF fitted on the training split
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_train = tfidf_vectorizer.fit_transform(symptoms_train)
X_test = tfidf_vectorizer.transform(symptoms_test)
# Full-dataset matrix, kept under its original name for code that reads X.
X = tfidf_vectorizer.transform(df['symptoms'])

# Step 7: Train the Gradient Boosting Model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Step 8: Evaluate the Model
y_pred = gb_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Gradient Boosting Model Accuracy: {accuracy * 100:.2f}%")

# Only classes that occur in y_test or y_pred get a row in the report and the
# confusion matrix. Passing every le_condition.classes_ entry fails (or
# mislabels the axes) as soon as a rare condition is absent from the test
# split, so restrict labels and names to the ones actually present.
eval_labels = sorted(set(y_test) | set(y_pred))
eval_names = le_condition.classes_[eval_labels]

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=eval_labels, target_names=eval_names))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=eval_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=eval_names, yticklabels=eval_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Calculate Precision, Recall, F1 Score: per class, macro and weighted averages
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="macro")
weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted")

# Display Macro and Weighted Average Metrics
print("\nMacro Average Metrics:")
print(f"Precision: {macro_precision:.2f}, Recall: {macro_recall:.2f}, F1 Score: {macro_f1:.2f}")

print("\nWeighted Average Metrics:")
print(f"Precision: {weighted_precision:.2f}, Recall: {weighted_recall:.2f}, F1 Score: {weighted_f1:.2f}")

# Step 9: Make Predictions for New Symptoms
def predict_condition(new_symptoms):
    """Predict the medical condition for the given symptoms with gradient boosting.

    Args:
        new_symptoms: An iterable of symptom strings, or a single string.

    Returns:
        A ``(predicted_condition, predicted_confidence)`` tuple: the decoded
        condition label and the probability the model assigns to that label.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(new_symptoms, str):
        new_symptoms = [new_symptoms]
    symptoms_str = ' '.join(new_symptoms)

    # Vectorize the input symptoms with the already-fitted TF-IDF vectorizer
    symptom_vector = tfidf_vectorizer.transform([symptoms_str])

    # Predict the encoded condition and decode it back to its name
    predicted_condition_idx = gb_model.predict(symptom_vector)[0]
    predicted_condition = le_condition.inverse_transform([predicted_condition_idx])[0]

    # predict_proba columns are ordered like gb_model.classes_, which only
    # holds the labels seen during fit; the encoded label is therefore not
    # necessarily its own column index, so look the position up explicitly.
    predicted_prob = gb_model.predict_proba(symptom_vector)[0]
    class_position = list(gb_model.classes_).index(predicted_condition_idx)
    predicted_confidence = predicted_prob[class_position]

    return predicted_condition, predicted_confidence

# Demo: run predict_condition on a hand-written symptom list and report the result.
new_symptoms_input = ['bloating', 'nausea', 'vomiting']
predicted_condition, confidence = predict_condition(new_symptoms_input)

disease_line = f"\nPredicted Disease for input symptoms {new_symptoms_input}: {predicted_condition}"
print(disease_line)
confidence_line = f"Prediction Confidence: {confidence:.2f}"
print(confidence_line)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import io

# Step 1: Import data from local device
from google.colab import files
uploaded = files.upload()  # Use file upload prompt in Colab or replace this with local file path if not using Colab

# Load the dataset. Fail with a clear message when nothing was uploaded
# instead of hitting a NameError on `df` further down.
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")
# When several files are uploaded the last one is used (as before), but it is
# now the only one that gets parsed.
file_name = list(uploaded)[-1]
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Step 2: Check the data structure
print(df.head())  # View the first few rows of the dataset

# Step 3: Clean and preprocess the symptoms data
# Missing symptoms become the literal text 'no symptoms'; commas separating
# symptoms are turned into spaces so every row is a single string.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: str(x).replace(',', ' '))

# Step 4: Encode the target variable (medical conditions)
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

# Step 5: Split the raw text BEFORE vectorizing, so the TF-IDF vocabulary and
# IDF weights are learned from training rows only (no test-set leakage).
symptoms_train, symptoms_test, y_train, y_test = train_test_split(
    df['symptoms'], y, test_size=0.2, random_state=42
)

# Step 6: Vectorize the symptoms using TF-IDF fitted on the training split
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_train = tfidf_vectorizer.fit_transform(symptoms_train)
X_test = tfidf_vectorizer.transform(symptoms_test)
# Full-dataset matrix, kept under its original name for code that reads X.
X = tfidf_vectorizer.transform(df['symptoms'])

# Step 7: Train the KNN Model
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, y_train)

# Step 8: Evaluate the Model
y_pred = knn.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"KNN Model Accuracy: {accuracy * 100:.2f}%")

# Only classes that occur in y_test or y_pred get a row in the report and the
# confusion matrix. Passing every le_condition.classes_ entry fails (or
# mislabels the axes) as soon as a rare condition is absent from the test
# split, so restrict labels and names to the ones actually present.
eval_labels = sorted(set(y_test) | set(y_pred))
eval_names = le_condition.classes_[eval_labels]

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=eval_labels, target_names=eval_names))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=eval_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=eval_names, yticklabels=eval_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Calculate Precision, Recall, F1 Score: per class, macro and weighted averages
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="macro")
weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted")

# Display Macro and Weighted Average Metrics
print("\nMacro Average Metrics:")
print(f"Precision: {macro_precision:.2f}, Recall: {macro_recall:.2f}, F1 Score: {macro_f1:.2f}")

print("\nWeighted Average Metrics:")
print(f"Precision: {weighted_precision:.2f}, Recall: {weighted_recall:.2f}, F1 Score: {weighted_f1:.2f}")

# Step 9: Make Predictions for New Symptoms
def predict_condition(new_symptoms):
    """Predict the medical condition for the given symptoms with the KNN model.

    Args:
        new_symptoms: An iterable of symptom strings, or a single string.

    Returns:
        A ``(predicted_condition, predicted_confidence)`` tuple: the decoded
        condition label and a distance-based score in (0, 1]. The score is a
        heuristic, not a class probability.
    """
    # A bare string would otherwise be joined character by character.
    if isinstance(new_symptoms, str):
        new_symptoms = [new_symptoms]
    symptoms_str = ' '.join(new_symptoms)

    # Vectorize the input symptoms with the already-fitted TF-IDF vectorizer
    symptom_vector = tfidf_vectorizer.transform([symptoms_str])

    # Predict the encoded condition and decode it back to its name
    predicted_condition_idx = knn.predict(symptom_vector)[0]
    predicted_condition = le_condition.inverse_transform([predicted_condition_idx])[0]

    # Confidence = 1 / (1 + mean distance to the nearest neighbours).
    # n_neighbors is left unset so kneighbors() uses the neighbour count the
    # model was configured with, keeping the score in step with predict()
    # even if `knn` is later rebuilt with a different n_neighbors.
    distances, _ = knn.kneighbors(symptom_vector)
    predicted_confidence = 1 / (1 + distances.mean())

    return predicted_condition, predicted_confidence

# Demo: run predict_condition on a hand-written symptom list and report the result.
new_symptoms_input = ['fever', 'chills', 'muscle pain']
predicted_condition, confidence = predict_condition(new_symptoms_input)

disease_line = f"\nPredicted Disease for input symptoms {new_symptoms_input}: {predicted_condition}"
print(disease_line)
confidence_line = f"Prediction Confidence: {confidence:.2f}"
print(confidence_line)

"""## **Majority Voting (Hard)**"""

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd  # Import pandas
import io # Import io
from google.colab import files # Import files for Colab
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support
)
from sklearn.tree import DecisionTreeClassifier # Import models again for clarity in this block
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder # Import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer # Import TfidfVectorizer


# --- Data Loading and Preprocessing (Duplicate from previous blocks) ---
# This ensures X and y are defined for this block
uploaded = files.upload()  # Use file upload prompt in Colab or replace this with local file path if not using Colab

# Load the dataset. Fail with a clear message when nothing was uploaded
# instead of hitting a NameError on `df` further down.
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")
# When several files are uploaded the last one is used (as before), but it is
# now the only one that gets parsed.
file_name = list(uploaded)[-1]
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Missing symptoms become the literal text 'no symptoms'; commas separating
# symptoms are turned into spaces so every row is a single string.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: str(x).replace(',', ' '))

# Encode the target variable (medical conditions)
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

# Split the raw text BEFORE vectorizing, so the TF-IDF vocabulary and IDF
# weights are learned from training rows only (no test-set leakage).
symptoms_train, symptoms_test, y_train, y_test = train_test_split(
    df['symptoms'], y, test_size=0.2, random_state=42
)

# Vectorize the symptoms using TF-IDF fitted on the training split
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_train = tfidf_vectorizer.fit_transform(symptoms_train)
X_test = tfidf_vectorizer.transform(symptoms_test)
# Full-dataset matrix, kept under its original name for code that reads X.
X = tfidf_vectorizer.transform(df['symptoms'])


# Initialize individual models
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svm = SVC(probability=True, kernel='rbf', random_state=42)  # probability=True enables predict_proba; hard voting itself only uses predict
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)

# --- Create Voting Ensemble (Hard Voting) ---
estimators = [('dt', dt), ('rf', rf), ('svm', svm), ('gb', gb_model), ('knn', knn)]

# VotingClassifier.fit() trains its own clones of the estimators, so these
# explicit fits do not influence the ensemble. They are kept so that the
# standalone dt/rf/svm/gb_model/knn objects are fitted for any later use.
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
svm.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
knn.fit(X_train, y_train)


voting_hard = VotingClassifier(estimators=estimators, voting='hard')
voting_hard.fit(X_train, y_train)
y_pred_hard = voting_hard.predict(X_test)

# Only classes that occur in y_test or y_pred_hard get a row in the report and
# the confusion matrix. Passing every le_condition.classes_ entry fails (or
# mislabels the axes) as soon as a rare condition is absent from the test
# split, so restrict labels and names to the ones actually present.
eval_labels = sorted(set(y_test) | set(y_pred_hard))
eval_names = le_condition.classes_[eval_labels]

# --- Evaluation: Hard Voting ---
print("\n--- Hard Voting Ensemble Results ---")
print(f"Accuracy: {accuracy_score(y_test, y_pred_hard) * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_hard, labels=eval_labels, target_names=eval_names))

# Confusion Matrix
plt.figure(figsize=(10, 7))
sns.heatmap(confusion_matrix(y_test, y_pred_hard, labels=eval_labels), annot=True, fmt="d", cmap="Blues",
            xticklabels=eval_names, yticklabels=eval_names)
plt.title("Confusion Matrix (Hard Voting)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Additional Metrics (Hard)
macro_p, macro_r, macro_f1, _ = precision_recall_fscore_support(y_test, y_pred_hard, average='macro')
weighted_p, weighted_r, weighted_f1, _ = precision_recall_fscore_support(y_test, y_pred_hard, average='weighted')
print(f"\nMacro - Precision: {macro_p:.2f}, Recall: {macro_r:.2f}, F1: {macro_f1:.2f}")
print(f"Weighted - Precision: {weighted_p:.2f}, Recall: {weighted_r:.2f}, F1: {weighted_f1:.2f}")

"""## **Soft Voting**"""

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd  # Import pandas
import io # Import io
from google.colab import files # Import files for Colab
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


# --- Data Loading and Preprocessing ---
# This ensures X and y are defined for this block
uploaded = files.upload()  # Use file upload prompt in Colab or replace this with local file path if not using Colab

# Load the dataset. Fail with a clear message when nothing was uploaded
# instead of hitting a NameError on `df` further down.
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")
# When several files are uploaded the last one is used (as before), but it is
# now the only one that gets parsed.
file_name = list(uploaded)[-1]
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Missing symptoms become the literal text 'no symptoms'; commas separating
# symptoms are turned into spaces so every row is a single string.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: str(x).replace(',', ' '))

# Encode the target variable (medical conditions)
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

# Split the raw text BEFORE vectorizing, so the TF-IDF vocabulary and IDF
# weights are learned from training rows only (no test-set leakage).
symptoms_train, symptoms_test, y_train, y_test = train_test_split(
    df['symptoms'], y, test_size=0.2, random_state=42
)

# Vectorize the symptoms using TF-IDF fitted on the training split
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_train = tfidf_vectorizer.fit_transform(symptoms_train)
X_test = tfidf_vectorizer.transform(symptoms_test)
# Full-dataset matrix, kept under its original name for code that reads X.
X = tfidf_vectorizer.transform(df['symptoms'])


# Initialize individual models
# Soft voting averages predict_proba outputs, so every model must provide it
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svm = SVC(probability=True, kernel='rbf', random_state=42)  # probability=True is essential for soft voting
gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
knn = KNeighborsClassifier(n_neighbors=5) # KNN supports predict_proba by default

# Create list of estimators
estimators = [
    ('dt', dt),
    ('rf', rf),
    ('svm', svm),
    ('gb', gb_model),
    ('knn', knn)
]

# VotingClassifier.fit() trains its own clones of the estimators, so these
# explicit fits do not influence the ensemble. They are kept so that the
# standalone dt/rf/svm/gb_model/knn objects are fitted for any later use.
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
svm.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
knn.fit(X_train, y_train)


# --- Create Voting Ensemble (Soft Voting) ---
# Use voting='soft' and ensure all base estimators support predict_proba
voting_soft = VotingClassifier(estimators=estimators, voting='soft')
voting_soft.fit(X_train, y_train)
y_pred_soft = voting_soft.predict(X_test)

# Only classes that occur in y_test or y_pred_soft get a row in the report and
# the confusion matrix. Passing every le_condition.classes_ entry fails (or
# mislabels the axes) as soon as a rare condition is absent from the test
# split, so restrict labels and names to the ones actually present.
eval_labels = sorted(set(y_test) | set(y_pred_soft))
eval_names = le_condition.classes_[eval_labels]

# --- Evaluation: Soft Voting ---
print("\n--- Soft Voting Ensemble Results ---")
accuracy_soft = accuracy_score(y_test, y_pred_soft)
print(f"Accuracy: {accuracy_soft * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred_soft, labels=eval_labels, target_names=eval_names))

# Confusion Matrix
conf_matrix_soft = confusion_matrix(y_test, y_pred_soft, labels=eval_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix_soft, annot=True, fmt="d", cmap="Blues",
            xticklabels=eval_names, yticklabels=eval_names)
plt.title("Confusion Matrix (Soft Voting)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Additional Metrics (Soft)
macro_p_soft, macro_r_soft, macro_f1_soft, _ = precision_recall_fscore_support(y_test, y_pred_soft, average='macro')
weighted_p_soft, weighted_r_soft, weighted_f1_soft, _ = precision_recall_fscore_support(y_test, y_pred_soft, average='weighted')
print(f"\nMacro - Precision: {macro_p_soft:.2f}, Recall: {macro_r_soft:.2f}, F1: {macro_f1_soft:.2f}")
print(f"Weighted - Precision: {weighted_p_soft:.2f}, Recall: {weighted_r_soft:.2f}, F1: {weighted_f1_soft:.2f}")

"""## **Gradient Boosting Block**"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, precision_recall_fscore_support
from sklearn.feature_extraction.text import TfidfVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
import io

# Step 1: Import data from local device
from google.colab import files
uploaded = files.upload()  # Use file upload prompt in Colab or replace this with local file path if not using Colab

# Load the dataset. Fail with a clear message when nothing was uploaded
# instead of hitting a NameError on `df` further down.
if not uploaded:
    raise ValueError("No file was uploaded; cannot load the dataset.")
# When several files are uploaded the last one is used (as before), but it is
# now the only one that gets parsed.
file_name = list(uploaded)[-1]
df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Step 2: Check the data structure
print(df.head())  # View the first few rows of the dataset

# Step 3: Clean and preprocess the symptoms data
# Missing symptoms become the literal text 'no symptoms'; commas separating
# symptoms are turned into spaces so every row is a single string.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: str(x).replace(',', ' '))

# Step 4: Encode the target variable (medical conditions)
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

# Step 5: Split the raw text BEFORE vectorizing, so the TF-IDF vocabulary and
# IDF weights are learned from training rows only (no test-set leakage).
symptoms_train, symptoms_test, y_train, y_test = train_test_split(
    df['symptoms'], y, test_size=0.2, random_state=42
)

# Step 6: Vectorize the symptoms using TF-IDF fitted on the training split
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X_train = tfidf_vectorizer.fit_transform(symptoms_train)
X_test = tfidf_vectorizer.transform(symptoms_test)
# Full-dataset matrix, kept under its original name for code that reads X.
X = tfidf_vectorizer.transform(df['symptoms'])

# Step 7: Train the Gradient Boosting Model
gb_model = GradientBoostingClassifier(random_state=42)
gb_model.fit(X_train, y_train)

# Step 8: Evaluate the Model
y_pred = gb_model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Gradient Boosting Model Accuracy: {accuracy * 100:.2f}%")

# Only classes that occur in y_test or y_pred get a row in the report and the
# confusion matrix. Passing every le_condition.classes_ entry fails (or
# mislabels the axes) as soon as a rare condition is absent from the test
# split, so restrict labels and names to the ones actually present.
eval_labels = sorted(set(y_test) | set(y_pred))
eval_names = le_condition.classes_[eval_labels]

# Classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=eval_labels, target_names=eval_names))

# Confusion Matrix
conf_matrix = confusion_matrix(y_test, y_pred, labels=eval_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=eval_names, yticklabels=eval_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Calculate Precision, Recall, F1 Score: per class, macro and weighted averages
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average=None)
macro_precision, macro_recall, macro_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="macro")
weighted_precision, weighted_recall, weighted_f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted")

# Display Macro and Weighted Average Metrics
print("\nMacro Average Metrics:")
print(f"Precision: {macro_precision:.2f}, Recall: {macro_recall:.2f}, F1 Score: {macro_f1:.2f}")

print("\nWeighted Average Metrics:")
print(f"Precision: {weighted_precision:.2f}, Recall: {weighted_recall:.2f}, F1 Score: {weighted_f1:.2f}")

# Step 9: Make Predictions for New Symptoms
def predict_condition(new_symptoms):
    """Predict the most likely medical condition for a set of symptoms.

    Uses the module-level ``tfidf_vectorizer``, ``gb_model`` and
    ``le_condition`` objects fitted earlier in the script.

    Args:
        new_symptoms: Iterable of symptom strings, or a single string holding
            one symptom or a comma-separated list of symptoms.

    Returns:
        A ``(condition, confidence)`` tuple: the decoded condition label and
        the probability the model assigns to that label.
    """
    # A bare string would otherwise be joined character by character
    # ("n a u s e a"); split it on commas so it becomes a list of symptoms.
    if isinstance(new_symptoms, str):
        new_symptoms = new_symptoms.split(',')

    # Join the input symptoms into a single string
    symptoms_str = ' '.join(new_symptoms)

    # Vectorize the input symptoms
    symptom_vector = tfidf_vectorizer.transform([symptoms_str])

    # Predict the condition
    predicted_condition_idx = gb_model.predict(symptom_vector)[0]
    predicted_condition = le_condition.inverse_transform([predicted_condition_idx])[0]

    # predict_proba columns follow gb_model.classes_. That array equals
    # 0..n-1 only when every encoded class was present in the training split,
    # so locate the column of the predicted label instead of using the label
    # value itself as the column index.
    predicted_prob = gb_model.predict_proba(symptom_vector)[0]
    class_column = gb_model.classes_.tolist().index(predicted_condition_idx)
    predicted_confidence = predicted_prob[class_column]

    return predicted_condition, predicted_confidence

# Demo: run the predictor on a hand-written symptom list and show the result.
new_symptoms_input = ['bloating', 'nausea', 'vomiting']
predicted_condition, confidence = predict_condition(new_symptoms_input)

# One print call with a newline separator emits the same two lines.
print(
    f"\nPredicted Disease for input symptoms {new_symptoms_input}: {predicted_condition}",
    f"Prediction Confidence: {confidence:.2f}",
    sep="\n",
)

"""# **Without noise**"""

import pandas as pd
import io
from google.colab import files
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
import seaborn as sns


# --- Data Loading and Preprocessing ---
# This cell calls confusion_matrix further down, but the cell's own import
# line omits it; import it here so the cell does not rely on an earlier cell.
from sklearn.metrics import confusion_matrix

uploaded = files.upload()

# Each uploaded file is parsed in turn; if several are uploaded, only the last
# one read remains in df.
for file_name in uploaded.keys():
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Fill missing symptoms, then turn comma-separated symptoms into space-separated text.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: ' '.join(str(x).split(',')))

# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so test-set text contributes to the vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X = tfidf_vectorizer.fit_transform(df['symptoms'])

# Rows with a missing condition are grouped under an explicit "Unknown" class.
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Hyperparameter Tuning for Random Forest ---
# Define the model
rf = RandomForestClassifier(random_state=42)

# Define the parameter grid to search (3 * 4 * 3 * 3 = 108 combinations)
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Set up GridSearchCV
# cv=5 means 5-fold cross-validation
# scoring='accuracy' is the evaluation metric
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Print the best parameters and the best score
print("Best parameters found: ", grid_search.best_params_)
print("Best cross-validation accuracy: {:.2f}".format(grid_search.best_score_))

# Get the best model from the grid search
best_rf_model = grid_search.best_estimator_

# --- Evaluate the Best Model on the Test Set ---
y_pred = best_rf_model.predict(X_test)

print("\nEvaluation of Best Random Forest Model on Test Set:")
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# The split is not stratified, so a condition can be absent from both y_test
# and y_pred. classification_report raises a ValueError when target_names is
# longer than the set of labels it finds, and the heatmap tick labels would no
# longer line up with the matrix. Report only the classes that actually occur;
# when every class occurs this is identical to passing le_condition.classes_.
present_labels = sorted(set(y_test) | set(y_pred))
present_names = le_condition.inverse_transform(present_labels)

print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=present_labels, target_names=present_names))

# You can also plot the confusion matrix for the best model
conf_matrix = confusion_matrix(y_test, y_pred, labels=present_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=present_names, yticklabels=present_names)
plt.title("Confusion Matrix (Tuned Random Forest)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# You would repeat this tuning process for each individual model
# and then potentially use the best-tuned models in the VotingClassifier.

"""**Hyperparameter Tuning for Random Forest (Repeat for other models)**

**Weighted Soft Voting Ensemble**
"""

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import io
from google.colab import files
from sklearn.ensemble import VotingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    precision_recall_fscore_support
)
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# --- Data Loading and Preprocessing ---
# (Ensure this block is run to define X, y, X_train, X_test, y_train, y_test, le_condition)
uploaded = files.upload()

# Each uploaded file is parsed in turn; if several are uploaded, only the last
# one read remains in df.
for file_name in uploaded.keys():
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Fill missing symptoms, then turn comma-separated symptoms into space-separated text.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: ' '.join(str(x).split(',')))

# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so test-set text contributes to the vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X = tfidf_vectorizer.fit_transform(df['symptoms'])

# Rows with a missing condition are grouped under an explicit "Unknown" class.
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# --- Initialize and Train Individual Models ---
# It's recommended to use the *tuned* best models here if you ran the tuning step
dt = DecisionTreeClassifier(random_state=42) # Replace with best_dt_model if tuned
rf = RandomForestClassifier(random_state=42) # Replace with best_rf_model if tuned
# probability=True is needed so SVC exposes predict_proba for soft voting.
svm = SVC(probability=True, kernel='rbf', random_state=42) # Replace with best_svm_model if tuned
gb_model = GradientBoostingClassifier(random_state=42) # Replace with best_gb_model if tuned
knn = KNeighborsClassifier(n_neighbors=5) # Replace with best_knn_model if tuned

# Fit the individual models
# NOTE(review): VotingClassifier.fit trains its own clones of these estimators,
# so these fits only matter for code that uses dt/rf/svm/gb_model/knn directly.
dt.fit(X_train, y_train)
rf.fit(X_train, y_train)
svm.fit(X_train, y_train)
gb_model.fit(X_train, y_train)
knn.fit(X_train, y_train)

# --- Create and Train Weighted Soft Voting Ensemble ---
estimators = [
    ('dt', dt),
    ('rf', rf),
    ('svm', svm),
    ('gb', gb_model),
    ('knn', knn)
]

# Assign weights based on perceived performance (you'd typically use cross-validation scores)
# Example weights - adjust based on your model's tuning results!
# The order matches the estimators list: dt, rf, svm, gb, knn.
weights = [0.1, 0.3, 0.2, 0.3, 0.1] # Sum of weights does not need to be 1

voting_soft_weighted = VotingClassifier(estimators=estimators, voting='soft', weights=weights)

print("\nStarting Weighted Soft Voting Ensemble training...")
voting_soft_weighted.fit(X_train, y_train)
y_pred_soft_weighted = voting_soft_weighted.predict(X_test)

# --- Evaluation: Weighted Soft Voting ---
print("\n--- Weighted Soft Voting Ensemble Results ---")
accuracy_soft_weighted = accuracy_score(y_test, y_pred_soft_weighted)
print(f"Accuracy: {accuracy_soft_weighted * 100:.2f}%")

# The split is not stratified, so a condition can be absent from both y_test
# and the predictions. classification_report raises a ValueError when
# target_names is longer than the set of labels it finds, and the heatmap tick
# labels would no longer line up with the matrix. Report only the classes that
# actually occur; when every class occurs this is identical to passing
# le_condition.classes_.
present_labels = sorted(set(y_test) | set(y_pred_soft_weighted))
present_names = le_condition.inverse_transform(present_labels)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_soft_weighted, labels=present_labels, target_names=present_names))

# Confusion Matrix
conf_matrix_soft_weighted = confusion_matrix(y_test, y_pred_soft_weighted, labels=present_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix_soft_weighted, annot=True, fmt="d", cmap="Blues",
            xticklabels=present_names, yticklabels=present_names)
plt.title("Confusion Matrix (Weighted Soft Voting)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

# Additional Metrics (Weighted Soft)
macro_p_sw, macro_r_sw, macro_f1_sw, _ = precision_recall_fscore_support(y_test, y_pred_soft_weighted, average='macro')
weighted_p_sw, weighted_r_sw, weighted_f1_sw, _ = precision_recall_fscore_support(y_test, y_pred_soft_weighted, average='weighted')
print(f"\nMacro - Precision: {macro_p_sw:.2f}, Recall: {macro_r_sw:.2f}, F1: {macro_f1_sw:.2f}")
print(f"Weighted - Precision: {weighted_p_sw:.2f}, Recall: {weighted_r_sw:.2f}, F1: {weighted_f1_sw:.2f}")

"""## **AdaBoost Classifier**"""

import pandas as pd
import io
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier # AdaBoost uses Decision Trees as base estimators
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


# --- Data Loading and Preprocessing ---
# (Ensure this block is run to define X, y, X_train, X_test, y_train, y_test, le_condition)
uploaded = files.upload()

# Each uploaded file is parsed in turn; if several are uploaded, only the last
# one read remains in df.
for file_name in uploaded.keys():
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Fill missing symptoms, then turn comma-separated symptoms into space-separated text.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: ' '.join(str(x).split(',')))

# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so test-set text contributes to the vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X = tfidf_vectorizer.fit_transform(df['symptoms'])

# Rows with a missing condition are grouped under an explicit "Unknown" class.
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Initialize and Train AdaBoost Classifier ---
# Base estimator can be changed, but Decision Tree is common
# You might want to tune n_estimators and learning_rate
adaboost = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1), # Use a decision stump
                              n_estimators=100, # Number of boosting rounds
                              learning_rate=1.0,
                              random_state=42)

print("\nStarting AdaBoost training...")
adaboost.fit(X_train, y_train)
y_pred_ada = adaboost.predict(X_test)

# --- Evaluation: AdaBoost ---
print("\n--- AdaBoost Classifier Results ---")
accuracy_ada = accuracy_score(y_test, y_pred_ada)
print(f"Accuracy: {accuracy_ada * 100:.2f}%")

# The split is not stratified, so a condition can be absent from both y_test
# and the predictions. classification_report raises a ValueError when
# target_names is longer than the set of labels it finds, and the heatmap tick
# labels would no longer line up with the matrix. Report only the classes that
# actually occur; when every class occurs this is identical to passing
# le_condition.classes_.
present_labels = sorted(set(y_test) | set(y_pred_ada))
present_names = le_condition.inverse_transform(present_labels)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_ada, labels=present_labels, target_names=present_names))

# Confusion Matrix
conf_matrix_ada = confusion_matrix(y_test, y_pred_ada, labels=present_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix_ada, annot=True, fmt="d", cmap="Blues",
            xticklabels=present_names, yticklabels=present_names)
plt.title("Confusion Matrix (AdaBoost)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

"""## **XGBoost Classifier**"""

import pandas as pd
import io
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb # Import XGBoost
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


# --- Data Loading and Preprocessing ---
# (Ensure this block is run to define X, y, X_train, X_test, y_train, y_test, le_condition)
uploaded = files.upload()

# Each uploaded file is parsed in turn; if several are uploaded, only the last
# one read remains in df.
for file_name in uploaded.keys():
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Fill missing symptoms, then turn comma-separated symptoms into space-separated text.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: ' '.join(str(x).split(',')))

# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so test-set text contributes to the vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X = tfidf_vectorizer.fit_transform(df['symptoms'])

# Rows with a missing condition are grouped under an explicit "Unknown" class.
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Initialize and Train XGBoost Classifier ---
# XGBoost parameters can be heavily tuned. This is a basic setup.
# objective='multi:softprob' for multi-class classification with probability outputs
# eval_metric='mlogloss' is a common metric for multi-class
# use_label_encoder is intentionally not passed: the option was deprecated and
# later removed from XGBClassifier (newer releases only warn that it is
# unused), and y is already integer-encoded by LabelEncoder above.
xgb_model = xgb.XGBClassifier(objective='multi:softprob',
                            num_class=len(le_condition.classes_), # Specify number of classes
                            n_estimators=100,
                            learning_rate=0.1,
                            random_state=42,
                            eval_metric='mlogloss')

print("\nStarting XGBoost training...")
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# --- Evaluation: XGBoost ---
print("\n--- XGBoost Classifier Results ---")
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)
print(f"Accuracy: {accuracy_xgb * 100:.2f}%")

# The split is not stratified, so a condition can be absent from both y_test
# and the predictions. classification_report raises a ValueError when
# target_names is longer than the set of labels it finds, and the heatmap tick
# labels would no longer line up with the matrix. Report only the classes that
# actually occur; when every class occurs this is identical to passing
# le_condition.classes_.
present_labels = sorted(set(y_test) | set(y_pred_xgb))
present_names = le_condition.inverse_transform(present_labels)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_xgb, labels=present_labels, target_names=present_names))

# Confusion Matrix
conf_matrix_xgb = confusion_matrix(y_test, y_pred_xgb, labels=present_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix_xgb, annot=True, fmt="d", cmap="Blues",
            xticklabels=present_names, yticklabels=present_names)
plt.title("Confusion Matrix (XGBoost)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

"""# Stacking Classifier"""

import pandas as pd
import io
from google.colab import files
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import StackingClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression # A common meta-model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns


# --- Data Loading and Preprocessing ---
# (Ensure this block is run to define X, y, X_train, X_test, y_train, y_test, le_condition)
# OneVsRestClassifier wraps the meta-model below; imported here because the
# cell's import lines do not include it.
from sklearn.multiclass import OneVsRestClassifier

uploaded = files.upload()

# Each uploaded file is parsed in turn; if several are uploaded, only the last
# one read remains in df.
for file_name in uploaded.keys():
    df = pd.read_csv(io.BytesIO(uploaded[file_name]))

# Fill missing symptoms, then turn comma-separated symptoms into space-separated text.
df['symptoms'] = df['symptoms'].fillna('no symptoms')
df['symptoms'] = df['symptoms'].apply(lambda x: ' '.join(str(x).split(',')))

# NOTE(review): the vectorizer is fitted on every row before the train/test
# split below, so test-set text contributes to the vocabulary and IDF weights.
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=500)
X = tfidf_vectorizer.fit_transform(df['symptoms'])

# Rows with a missing condition are grouped under an explicit "Unknown" class.
le_condition = LabelEncoder()
y = le_condition.fit_transform(df['medical_condition'].fillna("Unknown"))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Initialize Base Models for Stacking ---
# It's highly recommended to use TUNED base models here
estimators = [
    ('dt', DecisionTreeClassifier(random_state=42)), # Replace with best_dt_model if tuned
    ('rf', RandomForestClassifier(random_state=42)), # Replace with best_rf_model if tuned
    # SVM can be slow with probability=True in stacking, consider other models
    # ('svm', SVC(probability=True, kernel='rbf', random_state=42)),
    ('gb', GradientBoostingClassifier(random_state=42)), # Replace with best_gb_model if tuned
    ('knn', KNeighborsClassifier(n_neighbors=5)) # Replace with best_knn_model if tuned
]

# Define the meta-model (final estimator)
# Logistic Regression is a simple and often effective choice.
# The multi_class argument is deprecated in scikit-learn, and so is relying on
# the liblinear solver's built-in one-vs-rest handling for more than two
# classes. Wrapping the binary liblinear model in OneVsRestClassifier keeps the
# same one-vs-rest scheme without using either deprecated path.
final_estimator = OneVsRestClassifier(
    LogisticRegression(solver='liblinear', random_state=42)
)

# --- Create and Train Stacking Classifier ---
# cv parameter splits the training data to train base models and the meta-model
stacking_model = StackingClassifier(estimators=estimators,
                                    final_estimator=final_estimator,
                                    cv=5) # Use 5-fold cross-validation

print("\nStarting Stacking Ensemble training...")
# StackingClassifier will train the base estimators and the final estimator
stacking_model.fit(X_train, y_train)
y_pred_stack = stacking_model.predict(X_test)

# --- Evaluation: Stacking ---
print("\n--- Stacking Classifier Results ---")
accuracy_stack = accuracy_score(y_test, y_pred_stack)
print(f"Accuracy: {accuracy_stack * 100:.2f}%")

# The split is not stratified, so a condition can be absent from both y_test
# and the predictions. classification_report raises a ValueError when
# target_names is longer than the set of labels it finds, and the heatmap tick
# labels would no longer line up with the matrix. Report only the classes that
# actually occur; when every class occurs this is identical to passing
# le_condition.classes_.
present_labels = sorted(set(y_test) | set(y_pred_stack))
present_names = le_condition.inverse_transform(present_labels)

print("\nClassification Report:")
print(classification_report(y_test, y_pred_stack, labels=present_labels, target_names=present_names))

# Confusion Matrix
conf_matrix_stack = confusion_matrix(y_test, y_pred_stack, labels=present_labels)
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix_stack, annot=True, fmt="d", cmap="Blues",
            xticklabels=present_names, yticklabels=present_names)
plt.title("Confusion Matrix (Stacking)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()
