In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Compare three classifiers on the heart-disease dataset, first on the
# standardized features and then on a 5-component PCA projection of them.
#
# All preprocessing (scaling, PCA) is fitted on the training partition only
# and then applied to the test partition, so no test-set statistics leak
# into the models being evaluated.

# Step 1: Load the dataset
data = pd.read_csv("/content/heart_disease_data.csv")
print(data.head())
print("\nMissing values:\n", data.isnull().sum())

# Step 2: Separate features and target
X = data.drop('target', axis=1)
y = data['target']

# Step 3: Train-test split on the RAW features.
# Splitting before any fitting keeps the test rows unseen by the scaler/PCA.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Standardization (fit on the training rows only)
scaler = StandardScaler()
x_train = scaler.fit_transform(X_train_raw)
x_test = scaler.transform(X_test_raw)
# Full scaled matrix kept under its original name for any later cell that
# uses it; it is produced with the train-fitted scaler, not a refit.
X_scaled = scaler.transform(X)

# Step 5: Train models (Before PCA)
# Tree-based models are seeded so the reported accuracies are reproducible.
model1 = LogisticRegression()
model2 = DecisionTreeClassifier(random_state=42)
model3 = RandomForestClassifier(random_state=42)

models = [model1, model2, model3]

for model in models:
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"{model.__class__.__name__} Accuracy (Before PCA): {accuracy:.4f}")

# Step 6: Apply PCA (fit on the scaled training rows only)
pca = PCA(n_components=5)
X_train = pca.fit_transform(x_train)
X_test = pca.transform(x_test)
# Full projected matrix kept under its original name, using the train-fitted PCA.
X_pca = pca.transform(X_scaled)

# Step 7: Reuse the same split for the PCA experiment.
# The rows are identical to the "Before PCA" partition, so the two sets of
# accuracies are directly comparable.
Y_train, Y_test = y_train, y_test

# Step 8: Train models (After PCA)
# The same estimator objects are refitted; fit() discards the earlier fit.
for model in models:
    model.fit(X_train, Y_train)
    Y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    print(f"{model.__class__.__name__} Accuracy (After PCA): {accuracy:.4f}")


   age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  slope  \
0   63    1   3       145   233    1        0      150      0      2.3      0   
1   37    1   2       130   250    0        1      187      0      3.5      0   
2   41    0   1       130   204    0        0      172      0      1.4      2   
3   56    1   1       120   236    0        1      178      0      0.8      2   
4   57    0   0       120   354    0        1      163      1      0.6      2   

   ca  thal  target  
0   0     1       1  
1   0     2       1  
2   0     2       1  
3   0     2       1  
4   0     2       1  

Missing values:
 age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64
LogisticRegression Accuracy (Before PCA): 0.8525
DecisionTreeClassifier Accuracy (Before PCA): 0.8361
RandomForestClassifier Accuracy (Before PCA): 0.8525