In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv("/Users/zhangxijing/MasterNEU/INFO6105DataScienceEngineeringMethodsandTools/Dataset/Heart_Failure.csv", index_col=0)

df = pd.get_dummies(df, columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='int')

# print(df)

# Define features and label
X = df.drop('HeartDisease', axis=1)  # Assuming 'development_status' is the label
y = df['HeartDisease']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Forward feature selection with SFS to select the best three features
knn = KNeighborsClassifier(n_neighbors=5)
sfs = SFS(knn, k_features=3, forward=True, scoring='accuracy', cv=5)
sfs.fit(X_scaled, y)

selected_features_names = list(sfs.k_feature_names_)
print("Selected features by SFS:", selected_features_names)

# PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

# Explaining each PC
print("PCA components:\n", pca.components_)
print("Explained variance ratio:", pca.explained_variance_ratio_)

# LDA
# lda = LDA(n_components=2)  # As we are aiming for 2 components
# X_lda = lda.fit_transform(X_scaled, y)

# Accuracy comparison

# Original selected features by SFS
X_selected_sfs = X_scaled[:, sfs.k_feature_idx_]
knn.fit(X_selected_sfs, y)
y_pred_sfs = knn.predict(X_selected_sfs)
accuracy_sfs = accuracy_score(y, y_pred_sfs)
print("Accuracy with SFS selected features:", accuracy_sfs)

# PCA
knn.fit(X_pca, y)
y_pred_pca = knn.predict(X_pca)
accuracy_pca = accuracy_score(y, y_pred_pca)
print("Accuracy with PCA:", accuracy_pca)

# LDA
# knn.fit(X_lda, y)
# y_pred_lda = knn.predict(X_lda)
# accuracy_lda = accuracy_score(y, y_pred_lda)
# print("Accuracy with LDA:", accuracy_lda)

Selected features by SFS: ['8', '10', '18']
PCA components:
 [[-0.09324412  0.09939916 -0.11635086  0.27978011 -0.26795242  0.20624598
  -0.20624598 -0.32517502  0.25811594  0.12368314  0.04485663 -0.00157466
   0.08268319 -0.10079303  0.37980878 -0.37980878 -0.10004217 -0.30915059
   0.36328386]
 [ 0.1639544   0.3183473  -0.11172457  0.08068066  0.19107343  0.52084114
  -0.52084114  0.00448619 -0.03674938  0.00503757  0.0460434   0.33972863
  -0.30200692  0.02725133 -0.11324497  0.11324497  0.01400912  0.131713
  -0.14016547]
 [ 0.08120647 -0.07013973  0.17983085  0.0762649   0.0170955  -0.2470701
   0.2470701  -0.10192139 -0.08040087  0.10353436  0.17999627  0.38839085
  -0.62165061  0.37344789  0.14113788 -0.14113788  0.13198307 -0.15899736
   0.09317133]]
Explained variance ratio: [0.22644516 0.10490544 0.09330978]
Accuracy with SFS selected features: 0.8137254901960784
Accuracy with PCA: 0.8812636165577342
