**Import Necessary Libraries**

In [1]:
# Importing required libraries for data manipulation, model training, and evaluation
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization, Add, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
from tensorflow.keras.utils import to_categorical
from xgboost import XGBClassifier

# Confirm libraries are loaded
print("Libraries imported successfully.")


Libraries imported successfully.


**Load Dataset**

In [2]:
# Read the patient records from the CSV file into a DataFrame
data_path = 'Patient_Health_Data.csv'
patient_data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the leading rows so the column layout is visible before any cleaning
print("Dataset loaded. Here are the first few rows:", patient_data.head(), sep="\n")


Dataset loaded. Here are the first few rows:
  Patient_ID  Height_cm  Weight_kg Blood_Pressure  Temperature_C  Heart_Rate  \
0      P0001      174.0       59.0         120/80           36.6        63.0   
1      P0002        NaN       67.0         130/85           37.0        68.0   
2      P0003      176.0       58.0         140/90           37.0        65.0   
3      P0004      185.0       65.0         125/82           37.2        69.0   
4      P0005      167.0       41.0         135/88           36.3        72.0   

              Symptoms Existing_Conditions      Disease_Predictions  \
0           chest pain            Diabetes            Heart Disease   
1  shortness of breath        Hypertension  Coronary Artery Disease   
2              fatigue    High Cholesterol               Arrhythmia   
3            dizziness                 NaN                      NaN   
4         palpitations              Asthma             Hypertension   

  Laboratory_Test_Results  Cholesterol_mg_dL  B

**Display Data Structure**

In [3]:
# DataFrame.info() writes its summary straight to stdout and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
print("Dataset structure:")
patient_data.info()

Dataset structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 14 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Patient_ID                    2000 non-null   object 
 1   Height_cm                     1915 non-null   float64
 2   Weight_kg                     1914 non-null   float64
 3   Blood_Pressure                1640 non-null   object 
 4   Temperature_C                 1892 non-null   float64
 5   Heart_Rate                    1903 non-null   float64
 6   Symptoms                      1562 non-null   object 
 7   Existing_Conditions           1488 non-null   object 
 8   Disease_Predictions           1488 non-null   object 
 9   Laboratory_Test_Results       1505 non-null   object 
 10  Cholesterol_mg_dL             1899 non-null   float64
 11  Blood_Sugar_mg_dL             1880 non-null   float64
 12  Family_History_Heart_Disease  1891 non-null

In [6]:
# List the distinct categories (NaN included) found in the lab-results column.
# The variable name and the printed label now match the column that is actually
# read ('Laboratory_Test_Results'); they previously claimed to describe 'Symptoms'.
unique_lab_results = patient_data['Laboratory_Test_Results'].unique()
print("Unique values in the 'Laboratory_Test_Results' column:")
print(unique_lab_results)

Unique values in the 'Symptoms' column:
['High Cholesterol' nan 'Normal' 'High Blood Sugar' 'Low Iron']


In [None]:
# Overview of the raw data (this cell runs before any cleaning or imputation)
print("\nInitial Data Overview:")
print("-" * 50)
print("Dataset shape:", patient_data.shape)
print("\nFirst few rows of the dataset:")
display(patient_data.head())

# Count missing values per column in the raw data; only columns with gaps are printed
print("\nMissing Values Analysis:")
print("-" * 50)
missing_values = patient_data.isnull().sum()
print(missing_values[missing_values > 0])

# Visualize missing values: each highlighted cell is a NaN in that row/column
plt.figure(figsize=(12, 6))
sns.heatmap(patient_data.isnull(), yticklabels=False, cbar=False, cmap='viridis')
plt.title('Missing Values Heatmap')
plt.show()

# Distribution plots for numerical features
numerical_cols = patient_data.select_dtypes(include=['float64', 'int64']).columns
n_plot_cols = 4
# Keep the 3x4 grid for up to 12 features and add rows beyond that, so
# plt.subplot() never receives an index outside the grid.
n_plot_rows = max(3, -(-len(numerical_cols) // n_plot_cols))
plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_cols, 1):
    plt.subplot(n_plot_rows, n_plot_cols, i)
    sns.histplot(patient_data[col], kde=True)
    plt.title(f'Distribution of {col}')
    plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

**Splitting 'Blood_Pressure' Column**

In [None]:
# Split the 'Blood_Pressure' column ("systolic/diastolic" text) into 'Systolic_BP' and 'Diastolic_BP'
# This separation helps in individual analysis of each blood pressure component.
# n=1 caps the split at two parts, so a malformed value with extra '/' characters
# cannot produce a third column and break the two-column assignment.
patient_data[['Systolic_BP', 'Diastolic_BP']] = patient_data['Blood_Pressure'].str.split('/', n=1, expand=True)

# Verify the split by viewing the updated DataFrame
print("Blood pressure split into 'Systolic_BP' and 'Diastolic_BP':")
print(patient_data[['Systolic_BP', 'Diastolic_BP']].head())
# info() prints by itself and returns None, so it is not wrapped in print()
patient_data[['Systolic_BP', 'Diastolic_BP']].info()


**Convert Blood Pressure Columns to Numeric**

In [None]:
# Convert the new blood pressure columns to numeric; errors='coerce' turns any
# entry that cannot be parsed as a number into NaN instead of raising.
# This is necessary for further analysis since we can't process string values in these columns
patient_data['Systolic_BP'] = pd.to_numeric(patient_data['Systolic_BP'], errors='coerce')
patient_data['Diastolic_BP'] = pd.to_numeric(patient_data['Diastolic_BP'], errors='coerce')

# Check the data types and any potential NaNs introduced during conversion.
# info() prints by itself and returns None, so it is not wrapped in print().
print("Converted 'Systolic_BP' and 'Diastolic_BP' to numeric types:")
patient_data[['Systolic_BP', 'Diastolic_BP']].info()


**Drop the Original 'Blood_Pressure' Column**

In [None]:
# The raw 'Blood_Pressure' text column is no longer needed; keep every other column
patient_data = patient_data.drop('Blood_Pressure', axis=1)

# Show the leading rows to confirm the column is gone
print("'Blood_Pressure' column dropped:", patient_data.head(), sep="\n")


**Fill Missing Values for Numerical Columns**

In [None]:
# Fill missing values in numerical columns with the MEDIAN of each column
# (the code has always used the median; the earlier comment said "mean").
# The result is assigned back instead of mutating in place, which keeps the
# cell free of inplace side effects.
numeric_medians = patient_data.median(numeric_only=True)
patient_data = patient_data.fillna(numeric_medians)

# Check for remaining missing values in the DataFrame (categorical columns are
# still unfilled at this point)
print("Filled missing values in numerical columns. Checking for nulls:")
print(patient_data.isnull().sum())


**Fill Missing Values for Categorical Columns**

In [None]:
# Fill missing values in categorical columns with the mode (most frequent value) of each column.
# The filled Series is assigned back to the frame: calling
# `patient_data[col].fillna(..., inplace=True)` operates on an intermediate object,
# which is deprecated chained assignment and does not update the frame under
# pandas Copy-on-Write.
# NOTE(review): the object columns include the target 'Disease_Predictions', so rows
# with an unknown label are assigned the most frequent disease. Consider dropping
# those rows instead of imputing the label.
categorical_cols = patient_data.select_dtypes(include=['object']).columns
for col in categorical_cols:
    patient_data[col] = patient_data[col].fillna(patient_data[col].mode()[0])

# Confirm no missing values remain in categorical columns
print("Filled missing values in categorical columns. Checking for nulls:")
print(patient_data.isnull().sum())


**Separate Features and Target Variable**

In [None]:
# The disease label is the prediction target; everything except the label and
# the patient identifier becomes a model input.
y = patient_data['Disease_Predictions']
X = patient_data.drop(['Disease_Predictions', 'Patient_ID'], axis=1)

# Report both shapes so a mismatch in row counts is easy to spot
print("Separated features (X) and target (y):")
print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")


**Encode Target Variable**

In [None]:
# Map each disease name to an integer class id; the fitted encoder is kept so
# ids can be translated back to names later.
label_encoder = LabelEncoder()
label_encoder.fit(y)
y_encoded = label_encoder.transform(y)

# Show the class names in the order of their integer ids
print("Encoded target variable:")
print(f"Classes: {label_encoder.classes_}")


**One-Hot Encode and Standardize Features**

In [None]:
# Expand categorical columns into indicator columns (first level of each dropped),
# then standardize every resulting column.
# NOTE(review): the scaler is fitted on the complete feature matrix, i.e. before the
# train/validation/test split that happens in a later cell.
X = pd.get_dummies(X, drop_first=True)
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Print a sample of the scaled array and its shape as a sanity check
print("Features encoded and standardized. Sample data:")
print(X_scaled[:5])
print(X_scaled.shape)
# Render the first rows with column names attached for readability
pd.DataFrame(data=X_scaled, columns=X.columns).head()


**Apply SMOTE to Balance Classes**

In [None]:
# Use SMOTE to balance the classes by oversampling the minority classes
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_scaled, y_encoded)

# Confirm the balanced classes
print("Applied SMOTE to balance classes. New class distribution:")
print(np.bincount(y_resampled))


**Split into Training, Validation, and Test Sets**

In [None]:
# Split the ORIGINAL (not oversampled) data 80/10/10 into train/validation/test.
# Splitting after SMOTE would place synthetic samples, interpolated from rows that
# end up in the training set, into the validation and test sets and inflate the
# reported scores. The first split is stratified so the training part keeps the
# class proportions of the full dataset.
X_train_raw, X_temp, y_train_raw, y_temp = train_test_split(
    X_scaled, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Balance the classes in the training portion only; validation and test sets keep
# their natural class distribution and contain real records exclusively.
X_train, y_train = SMOTE(random_state=42).fit_resample(X_train_raw, y_train_raw)
print("Training class distribution after SMOTE:", np.bincount(y_train))

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(y_train.shape)
print(y_val.shape)
print(y_test.shape)

**Model 1: K-Nearest Neighbors (KNN)**

In [None]:
# Baseline KNN classifier (k=5), trained on the training split
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Validation-set performance
knn_val_pred = knn.predict(X_val)
print("KNN Validation Accuracy:", accuracy_score(y_val, knn_val_pred))
print("KNN Validation Classification Report:\n", classification_report(y_val, knn_val_pred))

# Test-set performance
knn_test_pred = knn.predict(X_test)
print("KNN Test Accuracy:", accuracy_score(y_test, knn_test_pred))
print("KNN Test Classification Report:\n", classification_report(y_test, knn_test_pred))

# Confusion matrix for the test set (rows: true class, columns: predicted class)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(confusion_matrix(y_test, knn_test_pred), annot=True, fmt="d", cmap="Blues", cbar=False, ax=ax)
ax.set_title("KNN Model - Confusion Matrix (Test Set)")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [None]:
# Define the parameter grid for GridSearchCV.
# 'minkowski' is intentionally not listed: with KNeighborsClassifier's default p=2
# it is the same distance as 'euclidean', so including it only repeats a third of
# the fits without exploring anything new.
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# Unfitted KNN estimator that GridSearchCV clones for every candidate.
# Note: this rebinds `knn`, replacing the fitted baseline model from the previous cell.
knn = KNeighborsClassifier()

# 5-fold cross-validated search over the grid, scored by accuracy, using all CPU cores
grid_search = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Fit GridSearchCV to the training data
grid_search.fit(X_train, y_train)

# Best hyper-parameters and the corresponding estimator refitted on all of X_train
best_params = grid_search.best_params_
best_knn = grid_search.best_estimator_

# Print the best parameters
print("Best parameters found by GridSearchCV:", best_params)

# Predict on the validation set using the best KNN model
knn_val_pred = best_knn.predict(X_val)

# Evaluate the best KNN model on the validation set
print("Best KNN Validation Accuracy:", accuracy_score(y_val, knn_val_pred))
print("Best KNN Validation Classification Report:\n", classification_report(y_val, knn_val_pred))

# Predict on the test set using the best KNN model
knn_test_pred = best_knn.predict(X_test)

# Evaluate the best KNN model on the test set
print("Best KNN Test Accuracy:", accuracy_score(y_test, knn_test_pred))
print("Best KNN Test Classification Report:\n", classification_report(y_test, knn_test_pred))

# Plot confusion matrix for the test set (rows: true class, columns: predicted class)
plt.figure(figsize=(6,5))
sns.heatmap(confusion_matrix(y_test, knn_test_pred), annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title("Best KNN Model - Confusion Matrix (Test Set)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

In [None]:
import joblib

# File names for the persisted preprocessing objects and the tuned KNN model
scaler_filename = 'scaler.pkl'
label_encoder_filename = 'label_encoder.pkl'
smote_filename = 'smote.pkl'
knn_model_filename = 'knn_model.pkl'

# Write each object to its file, in the order scaler -> label encoder -> SMOTE -> KNN
for artifact, filename in (
    (scaler, scaler_filename),
    (label_encoder, label_encoder_filename),
    (smote, smote_filename),
    (best_knn, knn_model_filename),
):
    joblib.dump(artifact, filename)

print("Preprocessing steps and KNN model saved successfully.")

Accuracy is low, so let's move on to RandomForest.

**Model 2: RandomForest**

In [None]:
# Baseline RandomForest with default hyper-parameters and a fixed seed,
# trained on the training split
rf = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Validation-set performance
rf_val_pred = rf.predict(X_val)
print("RandomForest Validation Accuracy:", accuracy_score(y_val, rf_val_pred))
print("RandomForest Validation Classification Report:\n", classification_report(y_val, rf_val_pred))

# Test-set performance
rf_test_pred = rf.predict(X_test)
print("RandomForest Test Accuracy:", accuracy_score(y_test, rf_test_pred))
print("RandomForest Test Classification Report:\n", classification_report(y_test, rf_test_pred))

# Confusion matrix for the test set (rows: true class, columns: predicted class)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(confusion_matrix(y_test, rf_test_pred), annot=True, fmt="d", cmap="Greens", cbar=False, ax=ax)
ax.set_title("RandomForest Model - Confusion Matrix (Test Set)")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [None]:
def plot_feature_importance(model, feature_names, title):
    """
    Draw a bar chart of a fitted model's feature importances, largest first.

    Parameters
    ----------
    model : fitted estimator exposing a ``feature_importances_`` array.
    feature_names : sequence indexable by integer position, giving the name of
        each feature in the same order as ``feature_importances_``.
    title : str inserted into the chart title.
    """
    scores = model.feature_importances_
    # Positions of the features sorted from most to least important
    order = np.argsort(scores)[::-1]
    bar_positions = range(len(scores))
    ranked_labels = [feature_names[idx] for idx in order]

    plt.figure(figsize=(12, 6))
    plt.title(f"Feature Importance ({title})")
    plt.bar(bar_positions, scores[order])
    plt.xticks(bar_positions, ranked_labels, rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Feature importances of the baseline Random Forest, labelled with the encoded column names
plot_feature_importance(rf, X.columns, "Random Forest")

In [None]:
# Hyper-parameter candidates explored for the RandomForest
param_grid_rf = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Unfitted, seeded estimator handed to the search (this rebinds `rf`)
rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated grid search scored by accuracy on all CPU cores,
# fitted on the training split
grid_search_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
grid_search_rf.fit(X_train, y_train)

# Winning configuration and the estimator refitted with it
best_params_rf = grid_search_rf.best_params_
best_rf = grid_search_rf.best_estimator_
print("Best parameters found by GridSearchCV for RandomForest:", best_params_rf)

# Validation-set performance of the tuned model
rf_val_pred = best_rf.predict(X_val)
print("Best RandomForest Validation Accuracy:", accuracy_score(y_val, rf_val_pred))
print("Best RandomForest Validation Classification Report:\n", classification_report(y_val, rf_val_pred))

# Test-set performance of the tuned model
rf_test_pred = best_rf.predict(X_test)
print("Best RandomForest Test Accuracy:", accuracy_score(y_test, rf_test_pred))
print("Best RandomForest Test Classification Report:\n", classification_report(y_test, rf_test_pred))

# Confusion matrix for the test set (rows: true class, columns: predicted class)
fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(confusion_matrix(y_test, rf_test_pred), annot=True, fmt="d", cmap="Greens", cbar=False, ax=ax)
ax.set_title("Best RandomForest Model - Confusion Matrix (Test Set)")
ax.set_xlabel("Predicted")
ax.set_ylabel("True")
plt.show()

In [None]:
# Stack XGBoost on top of the tuned RandomForest by appending the RF's predicted
# class as one extra input column.
#
# For the TRAINING rows the RF prediction must be out-of-fold: best_rf was fitted on
# X_train, so best_rf.predict(X_train) is largely a copy of y_train and would teach
# XGBoost to trust a feature that is far less reliable on unseen data.
# cross_val_predict fits clones of best_rf on 4/5 of the training data and predicts
# the held-out 1/5, so every training row gets a prediction from a model that never
# saw it. best_rf itself is left untouched.
rf_oof_train_pred = cross_val_predict(best_rf, X_train, y_train, cv=5)
X_train_with_rf = np.hstack((X_train, rf_oof_train_pred.reshape(-1, 1)))
# Validation and test rows were never seen by best_rf, so plain predictions are fine
X_val_with_rf = np.hstack((X_val, best_rf.predict(X_val).reshape(-1, 1)))
X_test_with_rf = np.hstack((X_test, best_rf.predict(X_test).reshape(-1, 1)))

# Initialize the XGBoost model (multi-class log-loss as the evaluation metric)
xgb_with_rf = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss')

# Fit the model to the training data with the additional RandomForest feature
xgb_with_rf.fit(X_train_with_rf, y_train)

# Predict on the validation set
xgb_with_rf_val_pred = xgb_with_rf.predict(X_val_with_rf)

# Evaluate the XGBoost model with the additional RandomForest feature on the validation set
print("XGBoost with RandomForest Feature Validation Accuracy:", accuracy_score(y_val, xgb_with_rf_val_pred))
print("XGBoost with RandomForest Feature Validation Classification Report:\n", classification_report(y_val, xgb_with_rf_val_pred))

# Predict on the test set
xgb_with_rf_test_pred = xgb_with_rf.predict(X_test_with_rf)

# Evaluate the XGBoost model with the additional RandomForest feature on the test set
print("XGBoost with RandomForest Feature Test Accuracy:", accuracy_score(y_test, xgb_with_rf_test_pred))
print("XGBoost with RandomForest Feature Test Classification Report:\n", classification_report(y_test, xgb_with_rf_test_pred))

# Plot confusion matrix for the test set (rows: true class, columns: predicted class)
plt.figure(figsize=(6,5))
sns.heatmap(confusion_matrix(y_test, xgb_with_rf_test_pred), annot=True, fmt="d", cmap="Blues", cbar=False)
plt.title("XGBoost with RandomForest Feature - Confusion Matrix (Test Set)")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()

Accuracy is still low, so let's move on to a CNN.

**Reshape for CNN Input**

In [None]:
# Reshape the data for CNN input, adding a third dimension for channels (required by Conv1D)
X_train_reshaped = np.expand_dims(X_train, axis=2)
X_val_reshaped = np.expand_dims(X_val, axis=2)
X_test_reshaped = np.expand_dims(X_test, axis=2)

# Confirm reshaping
print("Data reshaped for CNN input:")
print("X_train reshaped shape:", X_train_reshaped.shape)
print("X_val reshaped shape:", X_val_reshaped.shape)
print("X_test reshaped shape:", X_test_reshaped.shape)


**Convert Target Variable to Categorical**

In [None]:
# Convert target variable to categorical format for multi-class classification
y_train_categorical = to_categorical(y_train)
y_val_categorical = to_categorical(y_val)
y_test_categorical = to_categorical(y_test)

# Confirm conversion
print("Converted target to categorical format:")
print("y_train shape:", y_train_categorical.shape)
print("y_val shape:", y_val_categorical.shape)
print("y_test shape:", y_test_categorical.shape)


**Define the Deep Residual CNN Model**

In [None]:
# Function to build a deep residual CNN model with Conv1D layers
def build_residual_cnn(input_shape, num_classes):
    """
    Build (but do not compile) a 1-D convolutional classifier with one residual block.

    Architecture, in order:
      1. Conv1D(64)  -> BatchNorm -> MaxPool(2) -> Dropout(0.3)
      2. Conv1D(128) -> BatchNorm, added to a Conv1D(128) projection of the block
         input, -> MaxPool(2) -> Dropout(0.4)
      3. Conv1D(256) -> BatchNorm -> MaxPool(2) -> Dropout(0.5)
      4. Flatten -> Dense(512, relu) -> Dropout(0.5) -> Dense(num_classes, softmax)

    The three ReLU convolutions and the 512-unit dense layer carry an L2 weight
    penalty of 0.001; the projection convolution and the output layer do not.

    Parameters
    ----------
    input_shape : shape of one sample, passed unchanged to ``Input(shape=...)``.
    num_classes : number of units in the softmax output layer.

    Returns
    -------
    An uncompiled Keras ``Model`` mapping the input to class probabilities.
    """
    input_layer = Input(shape=input_shape)
    
    # First Conv1D Block
    x = Conv1D(64, kernel_size=3, activation='relu', padding='same', kernel_regularizer=l2(0.001))(input_layer)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.3)(x)

    # Second Conv1D Block with residual connection
    # The shortcut branch is a linear Conv1D that lifts the 64 incoming channels to 128
    # ('same' padding keeps the length), so its shape matches the main branch for Add().
    residual = Conv1D(128, kernel_size=3, padding='same')(x)  # Adjust dimensions with padding
    x = Conv1D(128, kernel_size=3, activation='relu', padding='same', kernel_regularizer=l2(0.001))(x)
    x = BatchNormalization()(x)
    x = Add()([x, residual])  # Add residual connection
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.4)(x)

    # Third Conv1D Block
    x = Conv1D(256, kernel_size=3, activation='relu', padding='same', kernel_regularizer=l2(0.001))(x)
    x = BatchNormalization()(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Dropout(0.5)(x)

    # Flatten and Dense Layers
    x = Flatten()(x)
    x = Dense(512, activation='relu', kernel_regularizer=l2(0.001))(x)
    x = Dropout(0.5)(x)
    # Softmax head: one probability per class
    output_layer = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=input_layer, outputs=output_layer)
    return model

# Build the model: one step per feature with a single channel, and one output unit
# per column of the one-hot encoded training target
model = build_residual_cnn(input_shape=(X_train_reshaped.shape[1], 1), num_classes=y_train_categorical.shape[1])

# Display model summary
model.summary()
print("Deep residual CNN model defined.")


**Compile the Model**

In [None]:
# Adam at a 1e-3 learning rate, categorical cross-entropy for the one-hot
# multi-class target, accuracy tracked as the reported metric
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
print("Model compiled.")


**Set Up Callbacks**

In [None]:
# All three callbacks watch the validation loss.
# Stop after 10 epochs without improvement and roll back to the best weights
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
# Keep only the best model seen so far on disk
checkpoint = ModelCheckpoint('best_residual_model.keras', monitor='val_loss', save_best_only=True)
# Halve the learning rate after 5 stagnant epochs, never going below 1e-6
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1)

callbacks = [early_stopping, checkpoint, reduce_lr]

print("Callbacks set up.")


**Train the Model**

In [None]:
# Train the model on the training split and validate on the VALIDATION split.
# The callbacks (early stopping, checkpointing, LR reduction) all act on val_loss,
# so whatever is passed as validation_data steers model selection. Passing the test
# set here, as before, makes the later test-set metrics optimistically biased.
history = model.fit(
    X_train_reshaped,
    y_train_categorical,
    epochs=100,
    batch_size=64,
    validation_data=(X_val_reshaped, y_val_categorical),
    callbacks=callbacks,
    verbose=1,
)
print("Model training completed.")


In [None]:
def plot_training_history(history):
    """
    Show training vs. validation accuracy and loss per epoch, side by side.

    Parameters
    ----------
    history : object whose ``history`` attribute is a mapping with the keys
        'accuracy', 'val_accuracy', 'loss' and 'val_loss' (as returned by
        ``model.fit``).
    """
    # (training key, validation key, training label, validation label, title, y-label)
    panels = (
        ('accuracy', 'val_accuracy', 'Training Accuracy', 'Validation Accuracy', 'Model Accuracy', 'Accuracy'),
        ('loss', 'val_loss', 'Training Loss', 'Validation Loss', 'Model Loss', 'Loss'),
    )

    plt.figure(figsize=(12, 4))
    for position, (train_key, val_key, train_label, val_label, panel_title, y_label) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(history.history[train_key], label=train_label)
        plt.plot(history.history[val_key], label=val_label)
        plt.title(panel_title)
        plt.xlabel('Epoch')
        plt.ylabel(y_label)
        plt.legend()

    plt.tight_layout()
    plt.show()

# Visualize the curves recorded while fitting the CNN
plot_training_history(history)

**Load Best Weights and Evaluate**

In [None]:
# Restore the weights stored by the checkpoint callback
model.load_weights('best_residual_model.keras')
print("Loaded best model weights.")

# Class probabilities for the test set, reduced to the most likely class per row;
# the one-hot test target is reduced the same way
cnn_pred_prob = model.predict(X_test_reshaped)
cnn_pred = cnn_pred_prob.argmax(axis=1)
y_test_classes = y_test_categorical.argmax(axis=1)

# Accuracy and per-class metrics
print("CNN Accuracy:", accuracy_score(y_test_classes, cnn_pred))
print("CNN Classification Report:\n", classification_report(y_test_classes, cnn_pred))

# Confusion matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test_classes, cnn_pred)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Reds", cbar=False, ax=ax)
ax.set_title("CNN Model - Confusion Matrix")
ax.set_xlabel("Predicted Label")
ax.set_ylabel("True Label")
plt.show()

**Summary & Model Comparison**

In [None]:
def plot_model_comparison(models_results):
    """
    Draw a bar chart comparing validation accuracy of KNN, Random Forest,
    XGBoost+RF and the CNN.

    The accuracies are computed from notebook-level variables (``y_val``,
    ``knn_val_pred``, ``rf_val_pred``, ``xgb_with_rf_val_pred``,
    ``y_val_categorical``, ``X_val_reshaped`` and ``model``); the
    ``models_results`` argument is accepted but not read.
    """
    plt.figure(figsize=(12, 6))

    # (display name, validation accuracy) in plotting order
    scored = [
        ('KNN', accuracy_score(y_val, knn_val_pred)),
        ('Random Forest', accuracy_score(y_val, rf_val_pred)),
        ('XGBoost+RF', accuracy_score(y_val, xgb_with_rf_val_pred)),
        ('CNN', accuracy_score(np.argmax(y_val_categorical, axis=1),
                               np.argmax(model.predict(X_val_reshaped), axis=1))),
    ]
    names = [label for label, _ in scored]
    accuracies = [value for _, value in scored]
    bar_positions = range(len(accuracies))

    plt.bar(bar_positions, accuracies)
    plt.xticks(bar_positions, names, rotation=45)
    plt.title('Model Validation Accuracy Comparison')
    plt.ylabel('Accuracy')
    plt.ylim([0, 1])

    # Print each accuracy just above its bar
    for position, value in enumerate(accuracies):
        plt.text(position, value + 0.01, f'{value:.3f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.show()

# Placeholder for per-model evaluation results
models_results = []

In [None]:
def evaluate_model(model, y_true, y_pred, model_name):
    """
    Print accuracy and a classification report, plot the confusion matrix, and
    return a summary record for one model.

    Parameters
    ----------
    model : the model object; stored in the returned record, not called here.
    y_true, y_pred : true and predicted targets. When ``model_name`` is exactly
        "CNN" both are reduced with ``argmax`` over axis 1 first (one-hot labels
        and class probabilities); otherwise they are used as given.
    model_name : str used in headings and stored in the record.

    Returns
    -------
    dict with keys 'name', 'model', 'accuracy' and 'confusion_matrix'.
    """
    needs_argmax = model_name == "CNN"
    labels_true = np.argmax(y_true, axis=1) if needs_argmax else y_true
    labels_pred = np.argmax(y_pred, axis=1) if needs_argmax else y_pred

    accuracy = accuracy_score(labels_true, labels_pred)
    report = classification_report(labels_true, labels_pred)
    conf_matrix = confusion_matrix(labels_true, labels_pred)

    print(f"\n{model_name} Evaluation Results:")
    print("-" * 50)
    print(f"Accuracy: {accuracy:.4f}")
    print("\nClassification Report:")
    print(report)

    # Confusion matrix heatmap with a colour bar
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=True)
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

    return dict(
        name=model_name,
        model=model,
        accuracy=accuracy,
        confusion_matrix=conf_matrix,
    )

# Evaluate models on the test set and store results.
# Each record pairs the predictions with the estimator that produced them:
# knn_test_pred comes from the grid-searched `best_knn` (the name `knn` was rebound
# to an unfitted default estimator in the grid-search cell), mirroring how
# `best_rf` is used for the Random Forest entry.
models_results = [
    evaluate_model(best_knn, y_test, knn_test_pred, "KNN"),
    evaluate_model(best_rf, y_test, rf_test_pred, "Random Forest"),
    evaluate_model(xgb_with_rf, y_test, xgb_with_rf_test_pred, "XGBoost+RF"),
    evaluate_model(model, y_test_categorical, cnn_pred_prob, "CNN")
]

In [None]:
# Textual summary of the accuracy stored for every evaluated model
print("\nModel Comparison Summary:")
print("=" * 50)
for result in models_results:
    print(f"{result['name']}:\n  - Accuracy: {result['accuracy']:.4f}")

# Record with the highest stored accuracy
best_model = max(models_results, key=lambda record: record['accuracy'])
print(f"\nBest performing model: {best_model['name']} with accuracy {best_model['accuracy']:.4f}")

# Pairwise correlations between the float-typed feature columns
numerical_cols = X.select_dtypes(include=['float64']).columns
correlation_matrix = patient_data[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.tight_layout()
plt.show()

# Learning-curve helper for scikit-learn compatible estimators
def plot_sklearn_learning_curve(estimator, title, X, y, ylim=None, cv=5,
                              n_jobs=None, train_sizes=np.linspace(.1, 1.0, 5)):
    """
    Plot mean training and cross-validation accuracy against training-set size.

    Parameters
    ----------
    estimator : estimator passed to ``learning_curve``.
    title : str used as the figure title.
    X, y : features and labels passed to ``learning_curve``.
    ylim : optional (low, high) pair for the y-axis; left automatic when None.
    cv : cross-validation setting forwarded to ``learning_curve``.
    n_jobs : parallelism setting forwarded to ``learning_curve``.
    train_sizes : training-set sizes forwarded to ``learning_curve``.

    Each curve is drawn with a shaded band of +/- one standard deviation across folds.
    """
    plt.figure(figsize=(10, 6))
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")

    sizes, fold_train_scores, fold_cv_scores = learning_curve(
        estimator, X, y,
        cv=cv,
        n_jobs=n_jobs,
        train_sizes=train_sizes,
        scoring='accuracy',
    )

    # (per-size mean, per-size std, colour, legend label) for each curve
    curves = [
        (fold_scores.mean(axis=1), fold_scores.std(axis=1), colour, legend_label)
        for fold_scores, colour, legend_label in (
            (fold_train_scores, "r", "Training score"),
            (fold_cv_scores, "g", "Cross-validation score"),
        )
    ]

    plt.grid()
    # Shaded +/- one standard deviation bands first, then the mean lines on top
    for mean, std, colour, _ in curves:
        plt.fill_between(sizes, mean - std, mean + std, alpha=0.1, color=colour)
    for mean, _, colour, legend_label in curves:
        plt.plot(sizes, mean, 'o-', color=colour, label=legend_label)
    plt.legend(loc="best")
    plt.show()

def plot_cnn_learning_curve(history):
    """
    Plot per-epoch training and validation accuracy of the CNN on one set of axes.

    Parameters
    ----------
    history : object whose ``history`` attribute is a mapping containing the
        'accuracy' and 'val_accuracy' series (as returned by ``model.fit``).
    """
    series = (
        ('accuracy', 'Training Accuracy'),
        ('val_accuracy', 'Validation Accuracy'),
    )

    plt.figure(figsize=(10, 6))
    for key, legend_label in series:
        plt.plot(history.history[key], label=legend_label)
    plt.title('CNN Model Learning Curve')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.grid(True)
    plt.show()

# Plot learning curves based on model type
for result in models_results:
    if result['name'] == 'CNN':
        # The CNN curve comes from its recorded training history
        plot_cnn_learning_curve(history)
        continue

    # The XGBoost+RF model was trained on the feature matrix extended with the
    # RandomForest-prediction column, so its learning curve must use that same
    # matrix; every other sklearn model uses the plain training features.
    curve_features = X_train_with_rf if result['name'] == 'XGBoost+RF' else X_train
    plot_sklearn_learning_curve(
        result['model'],
        f"Learning Curve ({result['name']})",
        curve_features, y_train
    )

# Final conclusions
print("\nFinal Analysis and Conclusions:")
print("=" * 50)
print(f"1. Best performing model: {best_model['name']}")
print(f"2. Overall accuracy achieved: {best_model['accuracy']:.4f}")

**Saving the Model**

In [None]:
import joblib

# Save the XGBoost+RF model to a .pkl file
joblib.dump(xgb_with_rf, 'xgboost_rf_model.pkl')

# The XGBoost model expects the tuned RandomForest's predicted class as its last
# input column, so the RandomForest is persisted alongside it; without it the
# saved XGBoost model cannot be given valid inputs in a fresh session.
joblib.dump(best_rf, 'best_rf_model.pkl')

print("Model saved successfully.")

**User Interface for the Application**

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from ipywidgets import widgets, Layout, HBox, VBox
from IPython.display import display, HTML, clear_output
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.models import load_model
import joblib

# Load your dataset and set up encoders and scaler
# This cell repeats the cleaning steps of the training section on a fresh copy of the
# CSV so the UI has fill values, slider ranges and dropdown options available.
# NOTE(review): the label encoder and scaler are re-fitted here rather than loaded from
# 'label_encoder.pkl' / 'scaler.pkl' written earlier in the notebook - confirm that
# this is intended.
data_path = 'Patient_Health_Data.csv'
patient_data = pd.read_csv(data_path)

# Drop the 'Patient_ID' column if it exists
patient_data = patient_data.drop(columns=['Patient_ID'], errors='ignore')

# Splitting 'Blood_Pressure' into 'Systolic_BP' and 'Diastolic_BP'
patient_data[['Systolic_BP', 'Diastolic_BP']] = patient_data['Blood_Pressure'].str.split('/', expand=True)

# Convert new columns to numeric, handle non-numeric entries by setting them to NaN
patient_data['Systolic_BP'] = pd.to_numeric(patient_data['Systolic_BP'], errors='coerce')
patient_data['Diastolic_BP'] = pd.to_numeric(patient_data['Diastolic_BP'], errors='coerce')

# Drop the original 'Blood_Pressure' column as it's no longer needed
patient_data = patient_data.drop(columns=['Blood_Pressure'])

# Prepare label encoders and scaler based on training
# Missing labels are replaced by the most frequent disease before the encoder is fitted
label_encoder = LabelEncoder()
patient_data['Disease_Predictions'] = patient_data['Disease_Predictions'].fillna(patient_data['Disease_Predictions'].mode()[0])
y_encoded = label_encoder.fit_transform(patient_data['Disease_Predictions'])

# Fill missing values for numerical columns (per-column median)
patient_data.fillna(patient_data.median(numeric_only=True), inplace=True)

# Fill missing values for categorical columns (per-column mode)
for col in patient_data.select_dtypes(include=['object']).columns:
    patient_data[col] = patient_data[col].fillna(patient_data[col].mode()[0])

# Define features and scaler based on training data
# NOTE(review): this scaler is fitted on the numeric columns only, whereas the scaler in
# the training section was fitted on the full one-hot encoded feature matrix
# (get_dummies with drop_first=True). Verify that inputs prepared with this scaler
# match what the saved model was trained on.
numeric_features = patient_data.select_dtypes(include=[np.number]).columns.drop(['Disease_Predictions'], errors='ignore').tolist()
scaler = StandardScaler()
X_scaled = scaler.fit_transform(patient_data[numeric_features])

# Load pre-trained XGBoost+RF model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code; only
# load model files from a trusted source.
xgb_with_rf = joblib.load('xgboost_rf_model.pkl')

# Page header: title followed by a short instruction line
title_html = HTML("<h1 style='color: #00796B; font-size: 30px; text-align: center;'>Patient Health Prediction System</h1>")
intro_html = HTML("<p style='color: #616161; font-size: 16px; text-align: center;'>Please provide the patient's health details below and click <strong style='color: #388E3C;'>Predict Disease</strong> for a health assessment.</p>")
display(title_html)
display(intro_html)

# One input widget per feature, keyed by feature name
feature_widgets = {}

# Sliders with hand-picked ranges: (slider class, value/range/label settings, handle colour)
fixed_slider_specs = {
    'Systolic_BP': (widgets.IntSlider,
                    dict(value=120, min=40, max=200, step=1, description="Systolic BP"),
                    '#29B6F6'),
    'Diastolic_BP': (widgets.IntSlider,
                     dict(value=80, min=40, max=120, step=1, description="Diastolic BP"),
                     '#EF5350'),
    'Temperature_C': (widgets.FloatSlider,
                      dict(value=37.0, min=30.0, max=44.0, step=0.1, description="Temperature"),
                      '#66BB6A'),
}

# Numeric features: use the fixed spec when one exists, otherwise an integer slider
# spanning the observed min..max of the column and starting at the midpoint
for feature in numeric_features:
    tooltip = f"Set the value for {feature.replace('_', ' ')}"
    if feature in fixed_slider_specs:
        slider_cls, slider_settings, handle_color = fixed_slider_specs[feature]
    else:
        min_val = patient_data[feature].min()
        max_val = patient_data[feature].max()
        slider_cls = widgets.IntSlider
        slider_settings = dict(value=int((min_val + max_val) / 2),
                               min=int(min_val),
                               max=int(max_val),
                               step=1,
                               description=feature.replace('_', ' ').title())
        handle_color = '#FFA726'
    feature_widgets[feature] = slider_cls(style={'description_width': 'initial', 'handle_color': handle_color},
                                          layout=Layout(width='100%'),
                                          tooltip=tooltip,
                                          **slider_settings)

# Categorical features (every object column except the target): a dropdown whose
# first entry is a "Select..." placeholder mapped to None
categorical_features = [name for name in patient_data.select_dtypes(include=['object']).columns
                        if name != 'Disease_Predictions']
for feature in categorical_features:
    unique_values = patient_data[feature].dropna().unique()
    options = [('Select...', None)]
    options.extend((str(val), str(val)) for val in sorted(unique_values))
    feature_widgets[feature] = widgets.Dropdown(options=options, description=feature.replace('_', ' ').title(),
                                                style={'description_width': 'initial'}, layout=Layout(width='100%'))

# Two bordered columns: numeric sliders on the left, categorical dropdowns on the right
numeric_widgets = VBox([feature_widgets[name] for name in numeric_features],
                       layout=Layout(width='48%', border='2px solid #B2DFDB', padding='20px', margin='10px'))
categorical_widgets = VBox([feature_widgets[name] for name in categorical_features],
                           layout=Layout(width='48%', border='2px solid #FFCDD2', padding='20px', margin='10px'))

# Show both columns side by side
display(HBox([numeric_widgets, categorical_widgets]))

# Prediction button and the output area that will hold the result
predict_button = widgets.Button(description="Predict Disease", button_style='success', layout=Layout(width='200px', padding='10px', height='100%'), tooltip="Click to predict disease")
output = widgets.Output(layout=Layout(border='2px solid #00796B', padding='20px', margin='15px 0', background_color='#E0F2F1'))

# Define the prediction function with output formatting
def on_predict_button_clicked(b):
    """Handle a click on the predict button and render the predicted disease.

    Reads the current value of every widget in ``feature_widgets``, imputes
    unset values from ``patient_data`` (median for numeric columns, mode for
    categorical ones), one-hot encodes the categorical columns against the
    same encoding of ``patient_data``, scales the numeric columns with the
    already-fitted ``scaler``, and shows the decoded prediction inside the
    ``output`` widget.

    Args:
        b: The button instance passed by ``on_click``; not used.

    Returns:
        None. Any exception is caught and rendered inside ``output`` together
        with whatever debug information exists at the point of failure.
    """
    # Function-scope import keeps the handler independent of the notebook's import cell.
    from html import escape

    # Pre-bind so the error handler can report state even when the failure
    # happens before these are assigned (e.g. the model file cannot be loaded).
    xgb_rf_model = None
    input_encoded = None

    with output:
        clear_output()

        try:
            # NOTE(review): joblib.load unpickles the file; only load model files from a trusted source.
            xgb_rf_model = joblib.load('xgboost_rf_model.pkl')

            # Gather the current widget values into a single-row frame
            selected_features = {feature: widget.value for feature, widget in feature_widgets.items()}
            input_data = pd.DataFrame([selected_features])

            # Impute unset inputs. The result is assigned back to the column:
            # fillna(inplace=True) on a column selection is chained assignment,
            # which warns in pandas 2.x and has no effect under Copy-on-Write.
            for col in numeric_features:
                if col in input_data.columns and input_data[col].isnull().any():
                    input_data[col] = input_data[col].fillna(patient_data[col].median())

            categorical_cols = patient_data.select_dtypes(include=['object']).columns.drop('Disease_Predictions')
            for col in categorical_cols:
                if col in input_data.columns and input_data[col].isnull().any():
                    input_data[col] = input_data[col].fillna(patient_data[col].mode()[0])

            # One-hot encode the input and the reference data the same way (no dropped first level)
            reference_data = patient_data.drop(columns=['Disease_Predictions'])
            input_encoded = pd.get_dummies(input_data, columns=categorical_cols)
            reference_encoded = pd.get_dummies(reference_data, columns=categorical_cols)

            # Align to the reference layout in one step: columns absent from the
            # input are added as 0, columns unknown to the reference are dropped.
            input_encoded = input_encoded.reindex(columns=reference_encoded.columns, fill_value=0)

            # Scale numeric features with the already-fitted scaler
            input_encoded[numeric_features] = scaler.transform(input_encoded[numeric_features])

            # Match the model's expected input. Prefer the feature names recorded
            # at fit time when the model exposes them and all are present; this
            # selects the right columns by name instead of by position.
            expected_names = getattr(xgb_rf_model, 'feature_names_in_', None)
            if expected_names is not None and set(expected_names) <= set(input_encoded.columns):
                input_encoded = input_encoded[list(expected_names)]
            elif input_encoded.shape[1] > xgb_rf_model.n_features_in_:
                # Fallback: trim trailing columns down to the expected count.
                # NOTE(review): positional trimming assumes the surplus columns are
                # the last ones — confirm against the training feature order.
                input_encoded = input_encoded.iloc[:, :xgb_rf_model.n_features_in_]

            # Predict and decode the class index back to its label
            predicted_class = xgb_rf_model.predict(input_encoded)[0]
            disease = label_encoder.inverse_transform([predicted_class])

            # Display prediction (label is escaped so it cannot break the markup)
            display(HTML("<h3 style='color: #00796B;'>Prediction Result</h3>"))
            display(HTML(f"<p style='font-size: 16px; color: #424242;'>Based on the input parameters, the predicted disease is: <strong style='color: #D32F2F;'>{escape(str(disease[0]))}</strong></p>"))

        except Exception as e:
            # Escape the message: exception text often contains '<...>' which would otherwise be parsed as tags
            display(HTML(f"<p style='color: #D32F2F;'>Error making prediction: {escape(str(e))}</p>"))
            print("Debug info:")
            # Only report what was actually created before the failure
            if input_encoded is not None:
                print(f"Input shape: {input_encoded.shape}")
            if xgb_rf_model is not None:
                print(f"Expected features: {getattr(xgb_rf_model, 'n_features_in_', 'unknown')}")
            if input_encoded is not None:
                print(f"Available columns: {input_encoded.columns.tolist()}")

# Register the click handler on the predict button
predict_button.on_click(on_predict_button_clicked)

# Show the button centred in its own row, with the output area underneath
display(
    HBox(
        children=[predict_button],
        layout=Layout(justify_content='center'),
    )
)
display(output)
