In [1]:
import numpy as np
import  pandas as pd
from sklearn.neighbors import NearestNeighbors

In [2]:
def fsfs_feature_selection(X, k):
    """
    FSFS (Feature Selection using Feature Similarity) algorithm implementation for discrete features.

    Parameters:
        X (numpy array): The input feature matrix, where each row represents a sample and each column represents a feature.
        k (int): The number of nearest neighbors to consider for feature similarity.

    Returns:
        selected_features (list): List of indices of the selected features after feature selection.
    """

    # Step 1: Calculate feature similarity and cluster the features
    n_features = X.shape[1]
    similarity_matrix = np.zeros((n_features, n_features))

    # Calculate feature similarity using Hamming distance
    for i in range(n_features):
        for j in range(n_features):
            if i != j:
                similarity_matrix[i, j] = np.sum(X[:, i] != X[:, j]) / X.shape[0]

    # Step 2: Calculate k-NN features and select the most compact subset
    selected_features = []
    for i in range(n_features):
        # Calculate k-NN for the current feature
        k_neighbors = np.argsort(similarity_matrix[i])[:k]

        # Check if the current feature is the most compact within its k-NN features
        compactness = np.sum(similarity_matrix[i, k_neighbors]) / k
        is_compact = True
        for neighbor in k_neighbors:
            neighbor_k_neighbors = np.argsort(similarity_matrix[neighbor])[:k]
            neighbor_compactness = np.sum(similarity_matrix[neighbor, neighbor_k_neighbors]) / k
            if neighbor_compactness < compactness:
                is_compact = False
                break

        if is_compact:
            selected_features.append(i)

    return selected_features

In [4]:
data = pd.read_excel(r"all_samples_clean.xlsx", sheet_name="Sheet1")
data = data.drop(['BASIC_entity_name', 'BASIC_year'], axis=1)

In [5]:
X =data.values

In [9]:
k_neighbors = 3
selected_features = fsfs_feature_selection(X, k_neighbors)

In [10]:
selected_features

[1]