In [24]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from sklearn.neighbors import KNeighborsClassifier
from matplotlib import pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Load data
df = pd.read_csv("/Users/zhangxijing/MasterNEU/INFO6105DataScienceEngineeringMethodsandTools/Dataset/world_ds.csv", index_col=0)

# Define features and label
X = df.drop('development_status', axis=1)  # Assuming 'development_status' is the label
y = df['development_status']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Forward feature selection with SFS to select the best three features
knn = KNeighborsClassifier(n_neighbors=5)
sfs = SFS(knn, k_features=3, forward=True, scoring='accuracy', cv=5)
sfs.fit(X_scaled, y)

selected_features_names = list(sfs.k_feature_names_)
print("Selected features by SFS:", selected_features_names)

# PCA
pca = PCA(n_components=3)
X_pca = pca.fit_transform(X_scaled)

# Explaining each PC
print("PCA components:\n", pca.components_)
print("Explained variance ratio:", pca.explained_variance_ratio_)

# LDA
lda = LDA(n_components=2)  # As we are aiming for 2 components
X_lda = lda.fit_transform(X_scaled, y)

# Accuracy comparison

# Original selected features by SFS
X_selected_sfs = X_scaled[:, sfs.k_feature_idx_]
knn.fit(X_selected_sfs, y)
y_pred_sfs = knn.predict(X_selected_sfs)
accuracy_sfs = accuracy_score(y, y_pred_sfs)
print("Accuracy with SFS selected features:", accuracy_sfs)

# PCA
knn.fit(X_pca, y)
y_pred_pca = knn.predict(X_pca)
accuracy_pca = accuracy_score(y, y_pred_pca)
print("Accuracy with PCA:", accuracy_pca)

# LDA
knn.fit(X_lda, y)
y_pred_lda = knn.predict(X_lda)
accuracy_lda = accuracy_score(y, y_pred_lda)
print("Accuracy with LDA:", accuracy_lda)

Selected features by SFS: ['3', '5', '8']
PCA components:
 [[ 4.02787214e-01 -8.97491488e-02  3.92004483e-01 -1.14429471e-01
  -1.10656218e-01  3.97935371e-01 -4.05388894e-01  3.62036324e-01
  -2.84697121e-01 -3.38682807e-01 -3.43427240e-02 -4.49017163e-04]
 [ 2.53662510e-02  6.01685161e-01  2.73033024e-02 -1.07972610e-01
   5.52985747e-01  5.20818243e-02 -5.24225229e-02  4.50005929e-02
  -4.60862826e-02 -8.68103356e-02  5.49522563e-01  3.74307683e-03]
 [-1.04076160e-01 -2.52269844e-02 -1.80239095e-01 -6.48749814e-01
  -8.47981246e-02 -1.19314555e-01 -1.37715606e-02 -2.22526769e-01
  -4.44212659e-01 -6.52660113e-02 -2.35888555e-02  5.11755628e-01]]
Explained variance ratio: [0.46187799 0.21175161 0.10063814]
Accuracy with SFS selected features: 0.8776595744680851
Accuracy with PCA: 0.8138297872340425
Accuracy with LDA: 0.8617021276595744
