In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


df = pd.read_csv("heart (1).csv")


categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)


X = df.drop("HeartDisease", axis=1)
y = df["HeartDisease"]


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

models = {
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier()
}

accuracy_before_pca = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy_before_pca[name] = accuracy_score(y_test, y_pred)

pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)

X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(X_pca, y, test_size=0.2, random_state=42)

accuracy_after_pca = {}
for name, model in models.items():
    model.fit(X_train_pca, y_train_pca)
    y_pred_pca = model.predict(X_test_pca)
    accuracy_after_pca[name] = accuracy_score(y_test_pca, y_pred_pca)

print("🔍 Accuracy BEFORE PCA:")
for name, acc in accuracy_before_pca.items():
    print(f"{name}: {acc:.4f}")

print("\n📉 Accuracy AFTER PCA:")
for name, acc in accuracy_after_pca.items():
    print(f"{name}: {acc:.4f}")

print(f"\nOriginal features: {X.shape[1]}")
print(f"Features after PCA: {X_pca.shape[1]}")


🔍 Accuracy BEFORE PCA:
SVM: 0.8750
Logistic Regression: 0.8533
Random Forest: 0.8641

📉 Accuracy AFTER PCA:
SVM: 0.8750
Logistic Regression: 0.8533
Random Forest: 0.8587

Original features: 15
Features after PCA: 13
