In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import StandardScaler

# Location of the heart-disease CSV. Kept in one named constant so the path
# only has to be edited in a single place when the notebook is run elsewhere.
DATA_PATH = r'C:\Users\Admin\Desktop\XAI HEART ATTACK PREDICTION\heart.csv'

# Load the dataset into a pandas DataFrame
data = pd.read_csv(DATA_PATH)

# Separate the features from the target column 'HeartDisease'
X = data.drop(columns=['HeartDisease'])  # Features
y = data['HeartDisease']  # Target variable

# Convert categorical variables into numerical values using one-hot encoding
X = pd.get_dummies(X, columns=['Sex', 'ChestPainType', 'FastingBS', 'RestingECG', 'ExerciseAngina', 'ST_Slope'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features by removing the mean and scaling to unit variance.
# The scaler is fitted on the training split only and then applied to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Base Random Forest estimator handed to the grid search
clf = RandomForestClassifier(random_state=42)

# Hyperparameter grid: 3 * 3 * 3 * 3 = 81 combinations, each evaluated with 5-fold CV
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search to find the best hyperparameters
grid_search = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid_search.fit(X_train_scaled, y_train)

# GridSearchCV (refit=True by default) has already retrained the best
# configuration on the full training set, so reuse that fitted estimator
# instead of constructing and fitting an identical model a second time.
best_params = grid_search.best_params_
clf = grid_search.best_estimator_

# Make predictions on the test data
predictions = clf.predict(X_test_scaled)

# Generate a classification report
class_report = classification_report(y_test, predictions)
print("Classification Report:\n", class_report)
print("Best Hyperparameters:", best_params)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)


In [4]:
# Report the dimensions of the scaled train/test matrices and fail loudly if
# they do not have the same number of feature columns.
print("Shape of X_train_scaled:", X_train_scaled.shape)
print("Shape of X_test_scaled:", X_test_scaled.shape)
assert X_train_scaled.shape[1] == X_test_scaled.shape[1], \
    "train and test matrices have a different number of feature columns"

# Verify feature names consistency: print both column lists and assert that
# they match in content and order, rather than relying on visual comparison.
print("Training Features:", X_train.columns.tolist())
print("Test Features:", X_test.columns.tolist())
assert X_train.columns.tolist() == X_test.columns.tolist(), \
    "train and test feature columns differ"

# Verify model consistency (ensure clf is the correct model instance)
print("Model Details:", clf)

Shape of X_train_scaled: (734, 21)
Shape of X_test_scaled: (184, 21)
Training Features: ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'Sex_F', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA', 'FastingBS_0', 'FastingBS_1', 'RestingECG_LVH', 'RestingECG_Normal', 'RestingECG_ST', 'ExerciseAngina_N', 'ExerciseAngina_Y', 'ST_Slope_Down', 'ST_Slope_Flat', 'ST_Slope_Up']
Test Features: ['Age', 'RestingBP', 'Cholesterol', 'MaxHR', 'Oldpeak', 'Sex_F', 'Sex_M', 'ChestPainType_ASY', 'ChestPainType_ATA', 'ChestPainType_NAP', 'ChestPainType_TA', 'FastingBS_0', 'FastingBS_1', 'RestingECG_LVH', 'RestingECG_Normal', 'RestingECG_ST', 'ExerciseAngina_N', 'ExerciseAngina_Y', 'ST_Slope_Down', 'ST_Slope_Flat', 'ST_Slope_Up']
Model Details: RandomForestClassifier(min_samples_split=5, random_state=42)


In [5]:
# Check model predictions for the test data: run the fitted classifier `clf`
# over the scaled hold-out features and print the full array of predicted labels.
# This is the same call that produces `predictions` in the training cell; it is
# repeated here only to inspect the raw output.
model_predictions = clf.predict(X_test_scaled)
print("Model Predictions:", model_predictions)


Model Predictions: [0 1 1 1 0 1 1 0 1 1 1 0 1 0 1 1 0 1 1 0 1 1 0 0 1 1 0 1 0 0 1 0 1 1 1 0 1
 0 1 1 1 1 1 0 0 1 1 1 1 1 0 0 1 1 0 0 0 1 1 1 0 0 0 1 0 1 1 1 1 1 0 0 1 1
 0 1 0 1 0 1 0 1 1 0 1 1 0 1 0 0 0 1 1 0 1 0 1 0 1 1 0 1 1 0 1 1 0 1 1 0 1
 1 0 0 1 0 1 0 1 1 1 1 1 1 0 0 0 0 0 0 0 1 1 0 1 0 1 1 0 1 0 0 1 1 1 1 1 0
 0 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 1 1 1 1 0 0 1 1 0 1 0 1 1 1 0 1 1 1 0 1]


In [None]:
import shap
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Explain the fitted Random Forest with SHAP.
#
# The trained model is `clf` (no `rf_classifier` is defined in this notebook),
# and it was fitted on the standardized features, so SHAP must be given the
# same scaled representation rather than the raw `X_test`. The scaled matrix
# is wrapped in a DataFrame so the original column names and row index are
# available to the plot.
X_test_shap = pd.DataFrame(X_test_scaled, columns=X_test.columns, index=X_test.index)

# Initialize the SHAP explainer with the trained model; the second argument is
# the background data the explainer uses as its reference distribution.
explainer = shap.Explainer(clf, X_test_shap)

# Calculate SHAP values for all the test samples
shap_values = explainer(X_test_shap)

# For a classifier SHAP can return one set of values per class, giving a 3-D
# result (samples, features, classes). Keep only the slice at class index 1
# (i.e. clf.classes_[1], presumably HeartDisease == 1 -- confirm) so the
# summary plot receives a 2-D matrix matching the feature frame.
if shap_values.values.ndim == 3:
    shap_values = shap_values[:, :, 1]

# Summary plot: global view of each feature's impact across all test samples
shap.summary_plot(shap_values, X_test_shap)


In [1]:
import pandas as pd

# Side-by-side comparison of the evaluated models.
#
# The metric values are hard-coded strings -- presumably transcribed by hand
# from earlier runs; only the Random Forest training is visible in this
# notebook, so the other rows cannot be re-derived here (TODO confirm source).
#
# The table is built inline instead of through an intermediate `data`
# variable so that running this cell does not overwrite the `data` DataFrame
# that holds the dataset loaded from the CSV.
model_comparison = pd.DataFrame({
    'Model': ['Random Forest', 'MLP', 'Decision Tree', 'KNN'],
    'Accuracy': ['87.5%', '85.3%', '85.3%', '70.7%'],
    'Precision': ['85%', '79%', '84%', '63%'],
    'Recall': ['89%', '83%', '89%', '70%'],
    'F1-Score': ['87%', '81%', '87%', '67%']
})

# Print table
print(model_comparison)


           Model Accuracy Precision Recall F1-Score
0  Random Forest    87.5%       85%    89%      87%
1            MLP    85.3%       79%    83%      81%
2  Decision Tree    85.3%       84%    89%      87%
3            KNN    70.7%       63%    70%      67%
