In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA

# Load the dataset
df = pd.read_csv("heart (1).csv")

# Step 1: Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

# Step 2: Label Encoding + One-Hot Encoding
# Label encode binary columns, one-hot encode multi-class categorical
df_encoded = df.copy()
label_encoders = {}
for col in categorical_cols:
    if df_encoded[col].nunique() == 2:
        le = LabelEncoder()
        df_encoded[col] = le.fit_transform(df_encoded[col])
        label_encoders[col] = le
    else:
        df_encoded = pd.get_dummies(df_encoded, columns=[col])

# Step 3: Feature/target split
X = df_encoded.drop(columns=['HeartDisease'])  # Assuming 'target' is the column to predict
y = df_encoded['HeartDisease']

# Step 4: Scaling
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 5: Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 6: Train classifiers and evaluate accuracy
models = {
    "SVM": SVC(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(random_state=42)
}

print("=== Accuracy without PCA ===")
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"{name}: {acc:.4f}")

# Step 7: Apply PCA
pca = PCA(n_components=0.95)  # Keep 95% variance
X_pca = pca.fit_transform(X_scaled)

# Train-test split for PCA-reduced data
X_train_pca, X_test_pca, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

print("\n=== Accuracy with PCA ===")
for name, model in models.items():
    model.fit(X_train_pca, y_train)
    y_pred_pca = model.predict(X_test_pca)
    acc_pca = accuracy_score(y_test, y_pred_pca)
    print(f"{name}: {acc_pca:.4f}")


=== Accuracy without PCA ===
SVM: 0.8804
Logistic Regression: 0.8533
Random Forest: 0.8804

=== Accuracy with PCA ===
SVM: 0.8804
Logistic Regression: 0.8533
Random Forest: 0.8424
