In [6]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv("/Users/zhangxijing/MasterNEU/INFO6105DataScienceEngineeringMethodsandTools/Dataset/Heart_Failure.csv", index_col=0)

df = pd.get_dummies(df, columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='int')

print(df)

# Define features and label
X = df.drop('HeartDisease', axis=1)  # Assuming 'development_status' is the label
y = df['HeartDisease']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Forward feature selection with SFS to select the best three features
knn = KNeighborsClassifier(n_neighbors=5)
sfs = SFS(knn, k_features=3, forward=True, scoring='accuracy', cv=5)
sfs.fit(X_scaled, y)

selected_features_names = list(sfs.k_feature_names_)
print("Selected features by SFS:", selected_features_names)

# PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

# Explaining each PC
print("PCA components:\n", pca.components_)
print("Explained variance ratio:", pca.explained_variance_ratio_)

# LDA
lda = LDA(n_components=2)  # As we are aiming for 2 components
X_lda = lda.fit_transform(X_scaled, y)

# Accuracy comparison

# Original selected features by SFS
X_selected_sfs = X_scaled[:, sfs.k_feature_idx_]
knn.fit(X_selected_sfs, y)
y_pred_sfs = knn.predict(X_selected_sfs)
accuracy_sfs = accuracy_score(y, y_pred_sfs)
print("Accuracy with SFS selected features:", accuracy_sfs)

# PCA
knn.fit(X_pca, y)
y_pred_pca = knn.predict(X_pca)
accuracy_pca = accuracy_score(y, y_pred_pca)
print("Accuracy with PCA:", accuracy_pca)

# LDA
knn.fit(X_lda, y)
y_pred_lda = knn.predict(X_lda)
accuracy_lda = accuracy_score(y, y_pred_lda)
print("Accuracy with LDA:", accuracy_lda)

    Sex ChestPainType  RestingBP  Cholesterol  FastingBS RestingECG  MaxHR  \
Age                                                                          
40    M           ATA        140          289          0     Normal    172   
49    F           NAP        160          180          0     Normal    156   
37    M           ATA        130          283          0         ST     98   
48    F           ASY        138          214          0     Normal    108   
54    M           NAP        150          195          0     Normal    122   
..   ..           ...        ...          ...        ...        ...    ...   
45    M            TA        110          264          0     Normal    132   
68    M           ASY        144          193          1     Normal    141   
57    M           ASY        130          131          0     Normal    115   
57    F           ATA        130          236          0        LVH    174   
38    M           NAP        138          175          0     Nor

ValueError: could not convert string to float: 'M'