In [13]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv("/Users/zhangxijing/MasterNEU/INFO6105DataScienceEngineeringMethodsandTools/Dataset/Heart_Failure.csv", index_col=0)

df = pd.get_dummies(df, columns=['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope'], dtype='int')

# print(df)

# Define features and label
X = df.drop('HeartDisease', axis=1)  # Assuming 'development_status' is the label
y = df['HeartDisease']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Forward feature selection with SFS to select the best three features
knn = KNeighborsClassifier(n_neighbors=5)
sfs = SFS(knn, k_features=3, forward=True, scoring='accuracy', cv=5)
sfs.fit(X_scaled, y)

selected_features_names = list(sfs.k_feature_names_)
print("Selected features by SFS:", selected_features_names)

# PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

# Explaining each PC
print("PCA components:\n", pca.components_)
print("Explained variance ratio:", pca.explained_variance_ratio_)

# LDA
# lda = LDA(n_components=2)  # As we are aiming for 2 components
# X_lda = lda.fit_transform(X_scaled, y)

# Accuracy comparison

# Original selected features by SFS
X_selected_sfs = X_scaled[:, sfs.k_feature_idx_]
knn.fit(X_selected_sfs, y)
y_pred_sfs = knn.predict(X_selected_sfs)
accuracy_sfs = accuracy_score(y, y_pred_sfs)
print("Accuracy with SFS selected features:", accuracy_sfs)

# PCA
knn.fit(X_pca, y)
y_pred_pca = knn.predict(X_pca)
accuracy_pca = accuracy_score(y, y_pred_pca)
print("Accuracy with PCA:", accuracy_pca)

# LDA
# knn.fit(X_lda, y)
# y_pred_lda = knn.predict(X_lda)
# accuracy_lda = accuracy_score(y, y_pred_lda)
# print("Accuracy with LDA:", accuracy_lda)

Selected features by SFS: ['8', '10', '18']
PCA components:
 [[-0.09324274  0.09940585 -0.11635291  0.27977789 -0.26795968  0.20624516
  -0.20624516 -0.32517407  0.25811419  0.12368352  0.04485685 -0.00157574
   0.08268335 -0.10079213  0.37980991 -0.37980991 -0.10003786 -0.30914969
   0.36328074]
 [ 0.1635159   0.31953927 -0.11116907  0.07894405  0.19170496  0.52077719
  -0.52077719  0.00418757 -0.03668683  0.00527595  0.04615999  0.33978646
  -0.30188516  0.0270415  -0.11309998  0.11309998  0.01365842  0.13142136
  -0.13969187]
 [ 0.0812404  -0.0730512   0.17997803  0.07845403  0.01805407 -0.24653121
   0.24653121 -0.10175836 -0.08003093  0.10310219  0.17978285  0.38845338
  -0.6217868   0.37355273  0.1406656  -0.1406656   0.13149367 -0.15898776
   0.09341153]]
Explained variance ratio: [0.22644516 0.10490441 0.09331312]
Accuracy with SFS selected features: 0.8137254901960784
Accuracy with PCA: 0.8801742919389978
