In [4]:
from google.colab import files
# Open Colab's interactive upload widget and bind its result to `heart`.
# NOTE(review): per the Colab docs the result is a dict of {filename: file bytes};
# the cell blocks here until the upload completes — confirm against the Colab version in use.
heart = files.upload()

import io

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.svm import SVC

# 1. Load Data
# Parse the CSV directly from the bytes returned by files.upload() rather than
# from a hard-coded "heart (1).csv". That suffixed name only exists when an
# earlier upload already left a "heart.csv" in the runtime, so the hard-coded
# path fails with FileNotFoundError after a fresh Restart & Run All.
if not heart:
    raise ValueError(
        "No file was uploaded; re-run the upload step and select the heart CSV."
    )
# If several files were selected, the first one in the upload mapping is used.
uploaded_bytes = next(iter(heart.values()))
df1 = pd.read_csv(io.BytesIO(uploaded_bytes))
print(df1.head())

# 2. Encode any object-dtype columns
# Every object column gets its own fitted LabelEncoder, kept in `label_encoders`
# (column name -> encoder). Reusing a single encoder would overwrite its fitted
# classes on each iteration, leaving only the last column decodable; with one
# encoder per column the integer codes can be mapped back via
# `label_encoders[col].inverse_transform(...)` or re-applied to new data.
text_cols = df1.select_dtypes(include=['object']).columns
label_encoders = {}
# `label_encoder` stays defined for any later code that refers to it; after the
# loop it is bound to the encoder fitted on the last object column (or remains
# unfitted when there are no object columns).
label_encoder = LabelEncoder()

for col in text_cols:
    label_encoder = LabelEncoder()
    df1[col] = label_encoder.fit_transform(df1[col])
    label_encoders[col] = label_encoder

print(df1.head())

# 3. Separate the label column from the predictors
y = df1['HeartDisease']
X = df1.drop(columns='HeartDisease')

# 4. Hold out 20% of the rows for testing, preserving the class ratio of y
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_test, y_train, y_test = split_parts

# 5. Standardise features: statistics are learned from the training rows only
#    and then applied unchanged to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

def _fit_predict_score(estimator):
    """Fit `estimator` on the scaled training split and score it on the test split.

    Returns a `(predictions, accuracy)` pair, where `predictions` are the
    estimator's outputs for `X_test` and `accuracy` is `accuracy_score`
    computed against `y_test`.
    """
    estimator.fit(X_train, y_train)
    predicted = estimator.predict(X_test)
    return predicted, accuracy_score(y_test, predicted)


# 6. Linear-kernel support vector machine
svm_model = SVC(kernel='linear', random_state=42)
svm_predictions, svm_accuracy = _fit_predict_score(svm_model)
print(f"SVM Accuracy: {svm_accuracy:.4f}")

# 7. Logistic regression (iteration cap raised to 200)
lr_model = LogisticRegression(random_state=42, max_iter=200)
lr_predictions, lr_accuracy = _fit_predict_score(lr_model)
print(f"Logistic Regression Accuracy: {lr_accuracy:.4f}")

# 8. Random forest with default hyper-parameters
rf_model = RandomForestClassifier(random_state=42)
rf_predictions, rf_accuracy = _fit_predict_score(rf_model)
print(f"Random Forest Accuracy: {rf_accuracy:.4f}")

# 9. Pick the model with the highest test accuracy (first one wins on ties)
models = dict(
    zip(
        ("SVM", "Logistic Regression", "Random Forest"),
        (svm_accuracy, lr_accuracy, rf_accuracy),
    )
)
best_model = max(models.items(), key=lambda entry: entry[1])[0]
print(f"\nBest Model: {best_model} with accuracy {models[best_model]:.4f}")

# 10. Project onto the principal components that retain 95% of the variance.
#     The projection is learned from the training split and reused for the test split.
pca = PCA(n_components=0.95, random_state=42)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

# 11. Refit the same three classifiers on the reduced features and score each
#     one on the reduced test split.
svm_pca = SVC(kernel='linear', random_state=42).fit(X_train_pca, y_train)
svm_pca_predictions = svm_pca.predict(X_test_pca)
svm_acc_pca = accuracy_score(y_test, svm_pca_predictions)
print(f"SVM Accuracy (with PCA): {svm_acc_pca:.4f}")

lr_pca = LogisticRegression(random_state=42, max_iter=200).fit(X_train_pca, y_train)
lr_pca_predictions = lr_pca.predict(X_test_pca)
lr_acc_pca = accuracy_score(y_test, lr_pca_predictions)
print(f"Logistic Regression Accuracy (with PCA): {lr_acc_pca:.4f}")

rf_pca = RandomForestClassifier(random_state=42).fit(X_train_pca, y_train)
rf_pca_predictions = rf_pca.predict(X_test_pca)
rf_acc_pca = accuracy_score(y_test, rf_pca_predictions)
print(f"Random Forest Accuracy (with PCA): {rf_acc_pca:.4f}")

# Collect the PCA scores and report the top performer (first one wins on ties).
models_pca = {}
models_pca["SVM"] = svm_acc_pca
models_pca["Logistic Regression"] = lr_acc_pca
models_pca["Random Forest"] = rf_acc_pca

best_pca = max(models_pca.items(), key=lambda entry: entry[1])[0]
print(f"\nBest Model (with PCA): {best_pca} with accuracy {models_pca[best_pca]:.4f}")


Saving heart.csv to heart (1).csv
   Age Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
0   40   M           ATA        140          289          0     Normal    172   
1   49   F           NAP        160          180          0     Normal    156   
2   37   M           ATA        130          283          0         ST     98   
3   48   F           ASY        138          214          0     Normal    108   
4   54   M           NAP        150          195          0     Normal    122   

  ExerciseAngina  Oldpeak ST_Slope  HeartDisease  
0              N      0.0       Up             0  
1              N      1.0     Flat             1  
2              N      0.0       Up             0  
3              Y      1.5     Flat             1  
4              N      0.0       Up             0  
   Age  Sex  ChestPainType  RestingBP  Cholesterol  FastingBS  RestingECG  \
0   40    1              1        140          289          0           1   
1   49    0        