**1) Data Collection and Preparation:**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [None]:
# Load the dataset
df1 = 'Heart_Attack_Data.csv'
data = pd.read_csv(df1)

In [None]:
# Display the first few rows of the dataset
print(data.head())

   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  


In [None]:
# Check for missing values
print(data.isnull().sum())

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64


In [None]:
# Fill missing values or drop rows/columns with missing values
# For simplicity, let's drop rows with missing values
data.dropna(inplace=True)


In [None]:
# Encode categorical variables
categorical_features = ['cp', 'restecg', 'slope', 'thal']
one_hot_encoder = OneHotEncoder()

In [None]:
# Create a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'trestbps', 'chol', 'thalach', 'oldpeak']),
        ('cat', one_hot_encoder, categorical_features)
    ]
)


In [None]:
# Split the dataset into features and target variable
X = data.drop('target', axis=1)
y = data['target']

In [None]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Apply preprocessing
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

print(X_train.shape, X_test.shape)

(241, 19) (61, 19)


**2) Model Training and Evaluation:**

In [None]:
# Initialize the models
log_reg = LogisticRegression()
rf_clf = RandomForestClassifier()

In [None]:
# Train the models
log_reg.fit(X_train, y_train)
rf_clf.fit(X_train, y_train)

In [None]:
# Make predictions
log_reg_pred = log_reg.predict(X_test)
rf_clf_pred = rf_clf.predict(X_test)


In [None]:
# Evaluate the models
def evaluate_model(y_test, predictions):
    accuracy = accuracy_score(y_test, predictions)
    precision = precision_score(y_test, predictions)
    recall = recall_score(y_test, predictions)
    f1 = f1_score(y_test, predictions)
    roc_auc = roc_auc_score(y_test, predictions)
    return accuracy, precision, recall, f1, roc_auc

log_reg_metrics = evaluate_model(y_test, log_reg_pred)
rf_clf_metrics = evaluate_model(y_test, rf_clf_pred)

print(f"Logistic Regression - Accuracy: {log_reg_metrics[0]:.2f}, Precision: {log_reg_metrics[1]:.2f}, Recall: {log_reg_metrics[2]:.2f}, F1 Score: {log_reg_metrics[3]:.2f}, ROC AUC: {log_reg_metrics[4]:.2f}")
print(f"Random Forest - Accuracy: {rf_clf_metrics[0]:.2f}, Precision: {rf_clf_metrics[1]:.2f}, Recall: {rf_clf_metrics[2]:.2f}, F1 Score: {rf_clf_metrics[3]:.2f}, ROC AUC: {rf_clf_metrics[4]:.2f}")

Logistic Regression - Accuracy: 0.84, Precision: 0.87, Recall: 0.81, F1 Score: 0.84, ROC AUC: 0.84
Random Forest - Accuracy: 0.90, Precision: 0.93, Recall: 0.88, F1 Score: 0.90, ROC AUC: 0.90


**3) Model Tuning:**

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Define parameter grid for Random Forest
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

In [None]:
# Grid search for Random Forest
grid_search_rf = GridSearchCV(estimator=rf_clf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train, y_train)


Fitting 5 folds for each of 108 candidates, totalling 540 fits


In [None]:
# Best parameters and model
best_params_rf = grid_search_rf.best_params_
best_rf_clf = grid_search_rf.best_estimator_

In [None]:
# Evaluate the best model
best_rf_clf_pred = best_rf_clf.predict(X_test)
best_rf_clf_metrics = evaluate_model(y_test, best_rf_clf_pred)

print(f"Best Random Forest - Accuracy: {best_rf_clf_metrics[0]:.2f}, Precision: {best_rf_clf_metrics[1]:.2f}, Recall: {best_rf_clf_metrics[2]:.2f}, F1 Score: {best_rf_clf_metrics[3]:.2f}, ROC AUC: {best_rf_clf_metrics[4]:.2f}")
print(f"Best Parameters: {best_params_rf}")

Best Random Forest - Accuracy: 0.82, Precision: 0.86, Recall: 0.78, F1 Score: 0.82, ROC AUC: 0.82
Best Parameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 50}


**4) User Input for Prediction:**

In [None]:
# Function to get user input and predict heart attack risk
def predict_heart_attack(model, preprocessor):
    print("Enter the following details to predict heart attack risk:")
    age = float(input("Age: "))
    sex = int(input("Sex (1 = male, 0 = female): "))
    cp = int(input("Chest pain type (0-3): "))
    trestbps = float(input("Resting blood pressure (in mm Hg): "))
    chol = float(input("Serum cholesterol (in mg/dl): "))
    fbs = int(input("Fasting blood sugar (1 = > 120 mg/dl, 0 = otherwise): "))
    restecg = int(input("Resting electrocardiographic results (0-2): "))
    thalach = float(input("Maximum heart rate achieved: "))
    exang = int(input("Exercise-induced angina (1 = yes, 0 = no): "))
    oldpeak = float(input("ST depression induced by exercise relative to rest: "))
    slope = int(input("Slope of the peak exercise ST segment (0 = upsloping, 1 = flat, 2 = downsloping): "))
    ca = int(input("Number of major vessels (0-3) colored by fluoroscopy: "))
    thal = int(input("Thalassemia (3 = normal, 6 = fixed defect, 7 = reversible defect): "))

    # Create a dataframe with the input data
    input_data = pd.DataFrame({
        'age': [age], 'sex': [sex], 'cp': [cp], 'trestbps': [trestbps], 'chol': [chol],
        'fbs': [fbs], 'restecg': [restecg], 'thalach': [thalach], 'exang': [exang],
        'oldpeak': [oldpeak], 'slope': [slope], 'ca': [ca], 'thal': [thal]
    })

    # Preprocess the input data
    input_data_processed = preprocessor.transform(input_data)

    # Predict the risk
    risk_prediction = model.predict(input_data_processed)
    risk_probability = model.predict_proba(input_data_processed)[:, 1]

    print(f"Predicted Heart Attack Risk: {'Yes' if risk_prediction[0] == 1 else 'No'}")
    print(f"Risk Probability: {risk_probability[0]:.2f}")

# Example usage with the best Random Forest model
predict_heart_attack(best_rf_clf, preprocessor)


Enter the following details to predict heart attack risk:
Age: 33
Sex (1 = male, 0 = female): 0
Chest pain type (0-3): 1
Resting blood pressure (in mm Hg): 123
Serum cholesterol (in mg/dl): 221
Fasting blood sugar (1 = > 120 mg/dl, 0 = otherwise): 0
Resting electrocardiographic results (0-2): 1
Maximum heart rate achieved: 165
Exercise-induced angina (1 = yes, 0 = no): 0
ST depression induced by exercise relative to rest: 1.2
Slope of the peak exercise ST segment (0 = upsloping, 1 = flat, 2 = downsloping): 0
Number of major vessels (0-3) colored by fluoroscopy: 1
Thalassemia (3 = normal, 6 = fixed defect, 7 = reversible defect): 1
Predicted Heart Attack Risk: Yes
Risk Probability: 0.64
