In [9]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Column names according to dataset documentation
column_names = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
                'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

# Load data. Column names are supplied explicitly via `names`. Missing values
# are encoded as '?' in this file; declaring that through `na_values` makes
# pandas turn them into NaN while parsing, so every affected column is read as
# numeric directly (no post-hoc replace + per-column to_numeric pass, and no
# hard-coded list of which columns happen to contain the placeholder).
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/heart-disease/processed.cleveland.data'
df = pd.read_csv(url, names=column_names, na_values='?')

# Drop rows with missing values
df.dropna(inplace=True)

# The target is 0 (no disease) or 1-4 (presence of disease); collapse every
# value greater than 0 to 1 so the problem becomes binary classification.
df['target'] = (df['target'] > 0).astype(int)

# Feature groups used by the preprocessor: the first group is standardised,
# the second is handed to the classifier unchanged.
numerical_features = ['age', 'sex', 'trestbps', 'chol', 'fbs', 'thalach', 'exang', 'oldpeak']
categorical_features = ['cp', 'restecg', 'slope', 'thal', 'ca']

# Column-wise preprocessing: StandardScaler on the numerical group,
# passthrough on the categorical group.
# NOTE(review): because the categorical codes are passed through as-is, the
# logistic regression sees them as plain numeric magnitudes; one-hot encoding
# may be more appropriate -- confirm before changing, as it alters the model.
# NOTE(review): the two groups together appear to cover every non-target
# column, so `remainder='passthrough'` should have nothing left to forward.
preprocessor = ColumnTransformer(
    [
        ('num', StandardScaler(), numerical_features),
        ('cat', 'passthrough', categorical_features),
    ],
    remainder='passthrough',
)

# Chain preprocessing and the classifier so that scaling is re-fitted on the
# training portion of every cross-validation fold.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=500)),
])

# 5-fold stratified cross-validation, shuffled with a fixed seed so the fold
# assignment is reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameter grid; the `classifier__` prefix routes each entry to the
# pipeline step named 'classifier'.
param_grid = {
    # Regularization parameter
    'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100],
    # Solver for optimization
    'classifier__solver': ['liblinear', 'lbfgs'],
}

# Exhaustive search over the grid, scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=cv,
    scoring='accuracy',
    n_jobs=-1,
)

# Separate the feature matrix from the label column.
y = df['target']
X = df.drop(columns='target')

# Hold out 20% of the rows as a test set (fixed seed for reproducibility).
# NOTE(review): this split is not stratified, unlike the CV folds used by the
# grid search; passing `stratify=y` would keep class proportions aligned, but
# it changes which rows land in the test set and therefore the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Run the grid search on the training data only, then keep the best
# pipeline it found.
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_

# Score the selected model once on the untouched test set.
y_pred = best_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

print("Best parameters found: ", grid_search.best_params_)
print(f"Accuracy on test set: {test_accuracy:.4f}")
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Best parameters found:  {'classifier__C': 1, 'classifier__solver': 'lbfgs'}
Accuracy on test set: 0.8833
Confusion Matrix:
[[32  4]
 [ 3 21]]
Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.89      0.90        36
           1       0.84      0.88      0.86        24

    accuracy                           0.88        60
   macro avg       0.88      0.88      0.88        60
weighted avg       0.88      0.88      0.88        60

