In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Load the dataset
# NOTE(review): absolute Colab-style path; adjust when running outside /content.
df = pd.read_csv('/content/heart.csv')

# Separate features and target
X = df.drop('HeartDisease', axis=1)
y = df['HeartDisease']

# Identify categorical and numerical columns.
# ColumnTransformer's default remainder='drop' discards any column of X that is
# not named in one of these two lists.
categorical_cols = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
numerical_cols = ['Age', 'RestingBP', 'Cholesterol', 'FastingBS', 'MaxHR', 'Oldpeak']

# Preprocessing pipeline: standardize numeric columns, one-hot encode categoricals.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        # handle_unknown='ignore': a category level that shows up only in the
        # test split is encoded as all zeros instead of raising at predict time
        # (the default handle_unknown='error' would abort the evaluation).
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Split data into train and test sets (20% held out; fixed seed for a
# reproducible split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Models to evaluate, keyed by the display name used in the printed reports.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
    "SVM": SVC(random_state=42)
}

# Function to evaluate models
def evaluate_models(X_train, X_test, y_train, y_test, preprocessor, models):
    """Fit every model behind ``preprocessor`` and report its test accuracy.

    For each ``name -> estimator`` entry in ``models`` a two-step pipeline
    (``preprocessor`` followed by the estimator) is fitted on the training
    data and scored on the test data with ``accuracy_score``. One line per
    model is printed in the form ``"<name> Accuracy: <value>"``.

    The pipeline is built from *clones* of ``preprocessor`` and of each
    estimator. ``Pipeline.fit`` fits its steps in place, so without cloning
    the caller's objects would be mutated and a later call (e.g. with a
    different preprocessor) would silently overwrite the fitted state held
    in ``models``.

    Args:
        X_train: Training features passed to ``pipeline.fit``.
        X_test: Test features passed to ``pipeline.predict``.
        y_train: Training labels.
        y_test: Test labels compared against the predictions.
        preprocessor: Unfitted (or fitted) scikit-learn transformer used as
            the first pipeline step; it is cloned, never fitted directly.
        models: Mapping of display name to scikit-learn estimator.

    Returns:
        dict mapping each model name to its accuracy on the test set.
    """
    # Imported locally so the function carries its own dependency.
    from sklearn.base import clone

    results = {}
    for name, model in models.items():
        # Fresh, unfitted copies with identical hyper-parameters (including
        # random_state), so scores match fitting the originals directly.
        pipeline = Pipeline(steps=[
            ('preprocessor', clone(preprocessor)),
            ('classifier', clone(model))
        ])

        # Train on the training split only; the preprocessor statistics are
        # learned inside the pipeline, so nothing leaks from the test split.
        pipeline.fit(X_train, y_train)

        # Score on the held-out split.
        y_pred = pipeline.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        results[name] = accuracy

        print(f"{name} Accuracy: {accuracy:.4f}")

    return results

# Baseline: every model on the scaled / one-hot encoded features.
print("Model Performance Without PCA:")
original_results = evaluate_models(
    X_train, X_test, y_train, y_test, preprocessor, models
)

# Same models again, with PCA appended to the preprocessing.
print("\nModel Performance With PCA:")

# A float n_components keeps as many components as are needed to reach that
# share of the variance (here 95%).
pca_preprocessor = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
])

pca_results = evaluate_models(
    X_train, X_test, y_train, y_test, pca_preprocessor, models
)

# Side-by-side accuracy table.
print("\nComparison of Results:")
header_row = "{:<20} {:<15} {:<15}".format('Model', 'Original Acc', 'PCA Acc')
print(header_row)
for model, original_acc in original_results.items():
    # Both result dicts are keyed by the names in `models`, in the same order.
    print("{:<20} {:<15.4f} {:<15.4f}".format(model, original_acc, pca_results[model]))

# Optional: inspect the full PCA decomposition (all components kept).
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA()),
])
full_pipeline.fit(X_train)
pca = full_pipeline.named_steps['pca']
variance_ratios = pca.explained_variance_ratio_

# Share of variance carried by each individual component.
print("\nPCA Explained Variance Ratio:")
for i, ratio in enumerate(variance_ratios):
    print(f"Component {i+1}: {ratio:.4f}")

print(f"\nTotal Variance Explained by all components: {sum(pca.explained_variance_ratio_):.4f}")

# Running total, to read off how many components reach a given variance share.
cumulative_variance = variance_ratios.cumsum()
print("\nCumulative Explained Variance:")
for i, variance in enumerate(cumulative_variance):
    print(f"Components {i+1}: {variance:.4f}")

Model Performance Without PCA:
Logistic Regression Accuracy: 0.8533
Random Forest Accuracy: 0.8804
SVM Accuracy: 0.8641

Model Performance With PCA:
Logistic Regression Accuracy: 0.8533
Random Forest Accuracy: 0.8533
SVM Accuracy: 0.8587

Comparison of Results:
Model                Original Acc    PCA Acc        
Logistic Regression  0.8533          0.8533         
Random Forest        0.8804          0.8533         
SVM                  0.8641          0.8587         

PCA Explained Variance Ratio:
Component 1: 0.2614
Component 2: 0.1528
Component 3: 0.1110
Component 4: 0.0948
Component 5: 0.0787
Component 6: 0.0708
Component 7: 0.0421
Component 8: 0.0372
Component 9: 0.0364
Component 10: 0.0323
Component 11: 0.0276
Component 12: 0.0214
Component 13: 0.0187
Component 14: 0.0088
Component 15: 0.0061
Component 16: 0.0000
Component 17: 0.0000
Component 18: 0.0000
Component 19: 0.0000
Component 20: 0.0000

Total Variance Explained by all components: 1.0000

Cumulative Explained Variance:
