In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the raw data. Feature imputation is deliberately NOT done here: it is
# part of the preprocessing pipeline below, so the imputation statistics are
# learned from training rows only (no leakage into the test set or CV folds).
data = pd.read_csv("heart_disease.csv")

# Separate numerical and categorical columns based on the data types you shared
numerical_cols = ['Age', 'Blood Pressure', 'Cholesterol Level', 'BMI', 'Sleep Hours',
                  'Triglyceride Level', 'Fasting Blood Sugar', 'CRP Level', 'Homocysteine Level']

categorical_cols = ['Gender', 'Exercise Habits', 'Smoking', 'Family Heart Disease',
                    'Diabetes', 'High Blood Pressure', 'Low HDL Cholesterol',
                    'High LDL Cholesterol', 'Alcohol Consumption', 'Stress Level',
                    'Sugar Consumption']

target_col = 'Heart Disease Status'

# Rows without a label cannot be used for supervised training or evaluation.
# Filling them with the most frequent class would fabricate labels and push
# the data further towards the majority class, so drop them instead.
n_missing_target = int(data[target_col].isna().sum())
if n_missing_target:
    print(f"Dropping {n_missing_target} rows with a missing '{target_col}' label")
    data = data.dropna(subset=[target_col]).reset_index(drop=True)

# Report how much the in-pipeline imputers will have to fill in
print("Missing values before imputation (handled inside the pipeline):")
print(data.isna().sum())

# Prepare features and target variable
X = data.drop(target_col, axis=1)
y = data[target_col]

# Encode the target variable (string labels -> integer codes)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
print(f"Encoded classes: {label_encoder.classes_}")

# Preprocessing: impute then scale numeric columns; impute then one-hot encode
# categorical columns. Every step is fitted only on the data passed to
# Pipeline.fit, i.e. the training split or the training part of a CV fold.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# Split the data into training and testing sets. Stratify on the label so both
# splits keep the same class ratio; the classes are clearly imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Create the model pipeline: shared preprocessing followed by a two-hidden-layer MLP
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', MLPClassifier(hidden_layer_sizes=(100, 50),
                                 activation='relu',
                                 alpha=0.0001,
                                 learning_rate_init=0.001,
                                 learning_rate='adaptive',
                                 max_iter=1000,
                                 early_stopping=True,
                                 random_state=42))
])

# Train the model
print("Training the MLP model...")
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Decode predictions back to original labels for reporting
y_test_original = label_encoder.inverse_transform(y_test)
y_pred_original = label_encoder.inverse_transform(y_pred)

# Evaluate the model
print("\nModel Performance:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("\nClassification Report:")
# zero_division=0 keeps the reported numbers (0.00) but states the choice
# explicitly, instead of emitting an UndefinedMetricWarning whenever a class
# is never predicted.
print(classification_report(y_test_original, y_pred_original, zero_division=0))

# Accuracy alone is misleading on imbalanced labels: a model that always
# predicts the majority class still scores highly. The confusion matrix makes
# that failure mode visible.
print("\nConfusion Matrix (rows = true label, columns = predicted label):")
print(pd.DataFrame(
    confusion_matrix(y_test_original, y_pred_original, labels=label_encoder.classes_),
    index=label_encoder.classes_,
    columns=label_encoder.classes_
))

# Perform a simpler hyperparameter tuning to find a better model
param_grid = {
    'classifier__hidden_layer_sizes': [(50,), (100,), (100, 50)],
    'classifier__activation': ['relu', 'tanh'],
    'classifier__alpha': [0.0001, 0.001]
}

print("\nPerforming hyperparameter tuning...")
# Select on balanced accuracy (mean per-class recall) rather than plain
# accuracy: with imbalanced classes, plain accuracy rewards a model that always
# predicts the majority class, so the search could not tell such a model apart
# from a useful one.
grid_search = GridSearchCV(model, param_grid, cv=3, scoring='balanced_accuracy')
grid_search.fit(X_train, y_train)

print("\nBest parameters found by grid search:")
print(grid_search.best_params_)
print(f"Best cross-validated balanced accuracy: {grid_search.best_score_:.4f}")

# Get the best model (refit on the full training split by GridSearchCV)
best_model = grid_search.best_estimator_

# Make predictions with the best model
best_y_pred = best_model.predict(X_test)
best_y_pred_original = label_encoder.inverse_transform(best_y_pred)

# Evaluate the best model
print("\nBest Model Performance:")
print(f"Accuracy: {accuracy_score(y_test, best_y_pred):.4f}")
print("\nClassification Report:")
# zero_division=0: report 0.00 without a warning when a class is never predicted
print(classification_report(y_test_original, best_y_pred_original, zero_division=0))

Missing values after imputation:
Age                     0
Gender                  0
Blood Pressure          0
Cholesterol Level       0
Exercise Habits         0
Smoking                 0
Family Heart Disease    0
Diabetes                0
BMI                     0
High Blood Pressure     0
Low HDL Cholesterol     0
High LDL Cholesterol    0
Alcohol Consumption     0
Stress Level            0
Sleep Hours             0
Sugar Consumption       0
Triglyceride Level      0
Fasting Blood Sugar     0
CRP Level               0
Homocysteine Level      0
Heart Disease Status    0
dtype: int64
Encoded classes: ['No' 'Yes']
Training the MLP model...

Model Performance:
Accuracy: 0.8065

Classification Report:
              precision    recall  f1-score   support

          No       0.81      1.00      0.89      1613
         Yes       0.00      0.00      0.00       387

    accuracy                           0.81      2000
   macro avg       0.40      0.50      0.45      2000
weighted avg       

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
import pandas as pd
from sklearn.impute import SimpleImputer

# Load data
data = pd.read_csv("heart_disease.csv")

# Define columns
numerical_cols = ['Age', 'Blood Pressure', 'Cholesterol Level', 'BMI', 'Sleep Hours',
                  'Triglyceride Level', 'Fasting Blood Sugar', 'CRP Level', 'Homocysteine Level']

categorical_cols = ['Gender', 'Exercise Habits', 'Smoking', 'Family Heart Disease',
                    'Diabetes', 'High Blood Pressure', 'Low HDL Cholesterol',
                    'High LDL Cholesterol', 'Alcohol Consumption', 'Stress Level',
                    'Sugar Consumption']

target_col = 'Heart Disease Status'

# Drop rows whose target is missing instead of imputing it: filling labels with
# the most frequent class would invent ground truth. Doing this first also
# keeps unlabeled rows from influencing the feature imputation statistics.
data = data.dropna(subset=[target_col]).copy()

# Impute numerical columns with the per-column median
num_imputer = SimpleImputer(strategy='median')
data[numerical_cols] = pd.DataFrame(
    num_imputer.fit_transform(data[numerical_cols]),
    columns=numerical_cols,
    index=data.index
)

# Impute categorical columns with the per-column most frequent value
cat_imputer = SimpleImputer(strategy='most_frequent')
data[categorical_cols] = pd.DataFrame(
    cat_imputer.fit_transform(data[categorical_cols]),
    columns=categorical_cols,
    index=data.index
)

# NOTE: these statistics are computed over the whole file. If the saved CSV is
# later split into train/test sets, re-fit the imputation on the training
# split only to avoid leaking test information.

# Save the cleaned data to a new CSV file
data.to_csv("processed_heart_disease.csv", index=False)

# Verify no missing values remain
print("Missing values after imputation:")
print(data.isna().sum())
print("Processed data saved as 'processed_heart_disease.csv'.")