In [1]:
import os
import warnings
import random
import numpy as np
import pandas as pd

from sklearn.feature_selection import chi2, f_classif, mutual_info_classif, RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from deap import base, creator, tools, algorithms
import mlrose_hiive as mlrose
from pyswarm import pso

# Prefer the (hypothetical) pyants ACO implementation when it is installed.
try:
    from pyants import AntColony  # hypothetical library
except ImportError:
    class AntColony:
        """Placeholder used when ``pyants`` cannot be imported.

        It only stores its constructor arguments; ``run`` performs no search
        and never calls ``fitness_func``.
        """

        def __init__(self, n_ants, n_iterations, decay, fitness_func):
            self.n_ants, self.n_iterations = n_ants, n_iterations
            self.decay, self.fitness_func = decay, fitness_func

        def run(self):
            """Return ``(solution, fitness)``: ten random bits and a fitness of 0."""
            # Dummy: no optimisation happens, just draw ten random bits.
            bits = []
            for _ in range(10):
                bits.append(random.randint(0, 1))
            return bits, 0

In [2]:
# Name of the label column; the selection functions in this notebook read it.
TARGET = 'cardio'

# Read the preprocessed dataset and preview its first rows.
df = pd.read_csv('data/preprocessed_data.csv')
print('Sample Data')
display(df.head())

Sample Data


Unnamed: 0,age,height,weight,systolic,diastolic,bmi,pulse_pressure,map,cholesterol,gluc,gender,smoke,alco,active,cardio
0,50,168,62.0,110,80,21.97,30,90.0,0,0,1,0,0,1,0
1,55,156,85.0,140,90,34.93,50,106.67,2,0,0,0,0,1,1
2,51,165,64.0,130,70,23.51,60,90.0,2,0,0,0,0,0,1
3,48,169,82.0,150,100,28.71,50,116.67,0,0,1,0,0,1,1
4,60,151,67.0,120,80,29.38,40,93.33,1,1,0,0,0,0,0


In [3]:
# ------------------------------------------------------------
# 1. Chi-Square
def chi_square_selection(df):
    """Keep the features whose chi-square p-value against TARGET is below 0.05.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.

    Returns
    -------
    tuple
        ``(selected, frame)``: the names of the retained (dummy-encoded)
        columns and the encoded feature frame restricted to those columns.
    """
    # One-hot encode categorical predictors, if there are any.
    features = pd.get_dummies(df.drop(TARGET, axis=1))
    labels = LabelEncoder().fit_transform(df[TARGET])

    scores, pvals = chi2(features, labels)
    table = pd.DataFrame({
        'Feature': features.columns,
        'Chi2 Score': scores,
        'p-value': pvals
    })

    significant = table['p-value'] < 0.05
    selected = table.loc[significant, 'Feature'].tolist()
    return selected, features[selected]

# ------------------------------------------------------------
# 2. ANOVA F-Test
def anova_selection(df):
    """Keep the features whose ANOVA F-test p-value against TARGET is below 0.05.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.

    Returns
    -------
    tuple
        ``(selected, frame)``: retained column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    labels = LabelEncoder().fit_transform(df[TARGET])

    scores, pvals = f_classif(predictors, labels)
    table = pd.DataFrame({
        'Feature': predictors.columns,
        'F Score': scores,
        'p-value': pvals
    })

    significant = table['p-value'] < 0.05
    selected = table.loc[significant, 'Feature'].tolist()
    return selected, predictors[selected]

# ------------------------------------------------------------
# 3. Mutual Information
def mutual_info_selection(df, top_n=10, random_state=42):
    """Rank features by mutual information with TARGET and keep the best ``top_n``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    top_n : int, default 10
        Number of highest-scoring features to return.
    random_state : int or None, default 42
        Seed forwarded to ``mutual_info_classif``. The estimator is
        randomised, so without a seed the ranking can change between runs.

    Returns
    -------
    tuple
        ``(selected, frame)``: feature names ordered by decreasing score and
        ``X`` restricted to those columns.
    """
    X = df.drop(TARGET, axis=1)
    y = df[TARGET]
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    mi_scores = mutual_info_classif(
        X, y_encoded, discrete_features='auto', random_state=random_state
    )
    mi_df = pd.DataFrame({
        'Feature': X.columns,
        'Mutual Information': mi_scores
    })
    ranked = mi_df.sort_values(by='Mutual Information', ascending=False)
    selected = ranked['Feature'].head(top_n).tolist()
    return selected, X[selected]

# ------------------------------------------------------------
# 4. Forward Selection (SFS forward)
def forward_selection(df, k_features=10):
    """Forward sequential selection scored by 5-fold logistic-regression accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    k_features : int, default 10
        Value passed to the selector's ``k_features`` argument.

    Returns
    -------
    tuple
        ``(selected, frame)``: chosen column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    selector = SFS(
        LogisticRegression(max_iter=1000),
        k_features=k_features,
        forward=True,
        floating=False,
        scoring='accuracy',
        cv=5,
    )
    selector.fit(predictors, df[TARGET])
    chosen = list(selector.k_feature_names_)
    return chosen, predictors[chosen]

# ------------------------------------------------------------
# 5. Backward Elimination (SFS backward)
def backward_elimination(df, k_features=10):
    """Backward sequential selection scored by 5-fold logistic-regression accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    k_features : int, default 10
        Value passed to the selector's ``k_features`` argument.

    Returns
    -------
    tuple
        ``(selected, frame)``: chosen column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    selector = SFS(
        LogisticRegression(max_iter=1000),
        k_features=k_features,
        forward=False,
        floating=False,
        scoring='accuracy',
        cv=5,
    )
    selector.fit(predictors, df[TARGET])
    chosen = list(selector.k_feature_names_)
    return chosen, predictors[chosen]

# ------------------------------------------------------------
# 6. Stepwise Selection (SFS dengan floating)
def stepwise_selection(df, k_features=10):
    """Floating forward sequential selection (``forward=True, floating=True``).

    Candidate subsets are scored by 5-fold logistic-regression accuracy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    k_features : int, default 10
        Value passed to the selector's ``k_features`` argument.

    Returns
    -------
    tuple
        ``(selected, frame)``: chosen column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    selector = SFS(
        LogisticRegression(max_iter=1000),
        k_features=k_features,
        forward=True,
        floating=True,
        scoring='accuracy',
        cv=5,
    )
    selector.fit(predictors, df[TARGET])
    chosen = list(selector.k_feature_names_)
    return chosen, predictors[chosen]

# ------------------------------------------------------------
# 7. Genetic Algorithms for Feature Selection
def genetic_algorithm_selection(df):
    """Search for a feature subset with a DEAP genetic algorithm.

    Each individual is a list of 0/1 bits, one per feature. Its fitness is the
    mean 5-fold cross-validated accuracy of a logistic regression trained on
    the features whose bit is 1 (0 when no feature is switched on).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.

    Returns
    -------
    tuple
        ``(selected, frame)``: names of the features switched on in the best
        individual seen during the run, and a DataFrame of those columns.
    """
    X = df.drop(TARGET, axis=1).values
    y = df[TARGET].values
    feature_names = df.drop(TARGET, axis=1).columns.tolist()

    # creator.create registers classes on the shared `creator` module, so
    # guard it: re-running the function would otherwise re-create them and
    # make DEAP emit a warning each time.
    if not hasattr(creator, "FitnessMax"):
        creator.create("FitnessMax", base.Fitness, weights=(1.0,))
    if not hasattr(creator, "Individual"):
        creator.create("Individual", list, fitness=creator.FitnessMax)

    toolbox = base.Toolbox()
    toolbox.register("attr_bool", random.randint, 0, 1)
    toolbox.register("individual", tools.initRepeat, creator.Individual,
                     toolbox.attr_bool, n=len(feature_names))
    # DEAP has no `tools.initPopulation`; a population is a list of
    # individuals built with `tools.initRepeat`.
    toolbox.register("population", tools.initRepeat, list, toolbox.individual)

    def evalOneMax(individual):
        selected_idx = [i for i, bit in enumerate(individual) if bit == 1]
        if len(selected_idx) == 0:
            return 0,
        X_selected = X[:, selected_idx]
        model = LogisticRegression(max_iter=1000)
        scores = cross_val_score(model, X_selected, y, cv=5, scoring='accuracy')
        # DEAP expects fitness values as a tuple.
        return scores.mean(),

    toolbox.register("evaluate", evalOneMax)
    toolbox.register("mate", tools.cxTwoPoint)
    toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
    toolbox.register("select", tools.selTournament, tournsize=3)

    population = toolbox.population(n=50)
    # eaSimple has no elitism, so the best individual can be lost before the
    # last generation; the hall of fame keeps the best one ever evaluated.
    hall_of_fame = tools.HallOfFame(1)
    algorithms.eaSimple(population, toolbox, cxpb=0.5, mutpb=0.2, ngen=40,
                        halloffame=hall_of_fame, verbose=False)
    best_ind = hall_of_fame[0]

    best_idx = [i for i, bit in enumerate(best_ind) if bit == 1]
    selected = [feature_names[i] for i in best_idx]
    return selected, pd.DataFrame(X[:, best_idx], columns=selected)

# ------------------------------------------------------------
# 8. Simulated Annealing for Feature Selection
def simulated_annealing_selection(df):
    """Search for a feature subset with mlrose simulated annealing.

    The state is a 0/1 vector with one entry per feature. Its fitness is the
    mean 5-fold cross-validated accuracy of a logistic regression trained on
    the features whose entry is 1 (0 when none is switched on).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.

    Returns
    -------
    tuple
        ``(selected, frame)``: names of the features switched on in the best
        state found and ``X`` restricted to those columns.
    """
    X = df.drop(TARGET, axis=1)
    y = df[TARGET]

    def fitness_func(features):
        selected_idx = [i for i, bit in enumerate(features) if bit == 1]
        if len(selected_idx) == 0:
            return 0
        X_selected = X.iloc[:, selected_idx]
        model = LogisticRegression(max_iter=1000)
        scores = cross_val_score(model, X_selected, y, cv=5, scoring='accuracy')
        return scores.mean()

    problem = mlrose.DiscreteOpt(length=X.shape[1],
                                 fitness_fn=mlrose.CustomFitness(fitness_func),
                                 maximize=True, max_val=2)
    result = mlrose.simulated_annealing(problem, schedule=mlrose.GeomDecay(),
                                        max_attempts=100, max_iters=1000, random_state=42)
    # mlrose_hiive returns (best_state, best_fitness, fitness_curve), so
    # unpacking into two names raises ValueError. Indexing works for both the
    # two-value and the three-value return shape.
    best_state = np.asarray(result[0])
    selected = X.columns[best_state.astype(bool)].tolist()
    return selected, X[selected]

# ------------------------------------------------------------
# 9. Particle Swarm Optimization for Feature Selection
def particle_swarm_selection(df, swarmsize=50, maxiter=100):
    """Search for a feature subset with particle swarm optimisation (pyswarm).

    Each particle is a vector in ``[0, 1]^n``; a coordinate above 0.5 switches
    the matching feature on. The objective is the negated mean 5-fold
    cross-validated accuracy of a logistic regression on the switched-on
    features, because ``pso`` minimises.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    swarmsize : int, default 50
        Number of particles, forwarded to ``pso``.
    maxiter : int, default 100
        Maximum number of iterations, forwarded to ``pso``.

    Returns
    -------
    tuple
        ``(selected, frame)``: names of the features switched on in the best
        position found and a DataFrame of those columns.
    """
    features = df.drop(TARGET, axis=1)
    X = features.values
    y = df[TARGET].values
    feature_names = features.columns.tolist()

    # Many continuous positions threshold to the same feature subset; cache
    # the score per subset so each one is cross-validated only once.
    score_cache = {}

    def fitness(x):
        selected_idx = tuple(i for i, xi in enumerate(x) if xi > 0.5)
        if len(selected_idx) == 0:
            return 1  # High penalty: worse than any achievable -accuracy.
        if selected_idx not in score_cache:
            X_selected = X[:, list(selected_idx)]
            model = LogisticRegression(max_iter=1000)
            scores = cross_val_score(model, X_selected, y, cv=5, scoring='accuracy')
            score_cache[selected_idx] = -scores.mean()  # Minimisation
        return score_cache[selected_idx]

    lb = [0] * X.shape[1]
    ub = [1] * X.shape[1]
    xopt, fopt = pso(fitness, lb, ub, swarmsize=swarmsize, maxiter=maxiter, debug=False)

    sel_idx = [i for i, bit in enumerate(xopt) if bit > 0.5]
    selected = [feature_names[i] for i in sel_idx]
    df_sel = pd.DataFrame(X[:, sel_idx], columns=selected)
    return selected, df_sel

# ------------------------------------------------------------
# 10. Ant Colony Optimization for Feature Selection (Ilustrasi)
def ant_colony_selection(df):
    """Select features from the bit vector returned by ``AntColony.run()``.

    A fitness callback (mean 5-fold logistic-regression accuracy on the
    switched-on features, 0 for an empty subset) is handed to ``AntColony``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.

    Returns
    -------
    tuple
        ``(selected, frame)``: names of the features whose bit is 1 in the
        returned solution and ``X`` restricted to those columns.
    """
    X = df.drop(TARGET, axis=1)
    y = df[TARGET]
    feature_names = X.columns.tolist()

    def names_for(bits):
        # Translate a 0/1 sequence into the matching feature names.
        return [feature_names[pos] for pos, flag in enumerate(bits) if flag == 1]

    def fitness(solution):
        chosen = names_for(solution)
        if not chosen:
            return 0
        accuracy = cross_val_score(
            LogisticRegression(max_iter=1000), X[chosen], y, cv=5, scoring='accuracy'
        )
        return accuracy.mean()

    colony = AntColony(n_ants=50, n_iterations=100, decay=0.5, fitness_func=fitness)
    best_solution, _best_fitness = colony.run()
    selected = names_for(best_solution)
    return selected, X[selected]

# ------------------------------------------------------------
# 11. Greedy Feature Selection (Menggunakan mlxtend)
def greedy_selection(df, k_features=10):
    """Greedy (plain forward) sequential feature selection.

    This is the same procedure as ``forward_selection``, so it delegates to
    it rather than keeping a second copy of the selector configuration.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    k_features : int, default 10
        Forwarded to ``forward_selection``; the default keeps the previously
        hard-coded value.

    Returns
    -------
    tuple
        ``(selected, frame)`` exactly as returned by ``forward_selection``.
    """
    return forward_selection(df, k_features=k_features)

# ------------------------------------------------------------
# 12. Sequential Feature Selection (Menggunakan mlxtend, floating=True)
def sequential_selection(df, k_features=10):
    """Floating forward sequential feature selection.

    This is the same procedure as ``stepwise_selection``, so it delegates to
    it rather than keeping a second copy of the selector configuration.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    k_features : int, default 10
        Forwarded to ``stepwise_selection``; the default keeps the previously
        hard-coded value.

    Returns
    -------
    tuple
        ``(selected, frame)`` exactly as returned by ``stepwise_selection``.
    """
    return stepwise_selection(df, k_features=k_features)

# ------------------------------------------------------------
# 13. Lasso Regression
def lasso_selection(df):
    """Keep the features that receive a non-zero Lasso (alpha=0.01) coefficient.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.

    Returns
    -------
    tuple
        ``(selected, frame)``: retained column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    # NOTE(review): features are fitted on their raw scale here, so the L1
    # penalty acts differently on columns with different units.
    fitted = Lasso(alpha=0.01)
    fitted.fit(predictors, df[TARGET])
    nonzero = fitted.coef_ != 0
    selected = predictors.columns[nonzero].tolist()
    return selected, predictors[selected]

# ------------------------------------------------------------
# 14. Ridge Regression (menampilkan koefisien)
def ridge_regression_info(df):
    """Fit Ridge (alpha=1.0) on all features and return its coefficients.

    No feature is discarded; the function only reports the fitted weights.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.

    Returns
    -------
    pandas.Series
        Ridge coefficients indexed by feature name.
    """
    predictors = df.drop(TARGET, axis=1)
    fitted = Ridge(alpha=1.0)
    fitted.fit(predictors, df[TARGET])
    return pd.Series(fitted.coef_, index=predictors.columns)

# ------------------------------------------------------------
# 15. Elastic Net
def elastic_net_selection(df):
    """Keep the features with a non-zero ElasticNet coefficient.

    The model is fitted with ``alpha=0.1`` and ``l1_ratio=0.5``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.

    Returns
    -------
    tuple
        ``(selected, frame)``: retained column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    # NOTE(review): features are fitted on their raw scale here, so the
    # penalty acts differently on columns with different units.
    fitted = ElasticNet(alpha=0.1, l1_ratio=0.5)
    fitted.fit(predictors, df[TARGET])
    nonzero = fitted.coef_ != 0
    selected = predictors.columns[nonzero].tolist()
    return selected, predictors[selected]

# ------------------------------------------------------------
# 16. Decision Tree-Based Feature Selection
def decision_tree_selection(df):
    """Keep the features whose decision-tree importance exceeds 0.01.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.

    Returns
    -------
    tuple
        ``(selected, frame)``: retained column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    fitted = DecisionTreeClassifier(random_state=42)
    fitted.fit(predictors, df[TARGET])
    weights = pd.Series(fitted.feature_importances_, index=predictors.columns)
    above_threshold = weights > 0.01
    selected = weights.index[above_threshold].tolist()
    return selected, predictors[selected]

# ------------------------------------------------------------
# 17. Random Forest Feature Importance
def random_forest_selection(df):
    """Keep the features whose random-forest importance exceeds 0.01.

    The forest uses 100 trees and ``random_state=42``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.

    Returns
    -------
    tuple
        ``(selected, frame)``: retained column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    fitted = RandomForestClassifier(n_estimators=100, random_state=42)
    fitted.fit(predictors, df[TARGET])
    weights = pd.Series(fitted.feature_importances_, index=predictors.columns)
    above_threshold = weights > 0.01
    selected = weights.index[above_threshold].tolist()
    return selected, predictors[selected]

# ------------------------------------------------------------
# 18. Gradient Boosting Feature Importance
def gradient_boosting_selection(df):
    """Keep the features whose gradient-boosting importance exceeds 0.01.

    The ensemble uses 100 estimators and ``random_state=42``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.

    Returns
    -------
    tuple
        ``(selected, frame)``: retained column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    fitted = GradientBoostingClassifier(n_estimators=100, random_state=42)
    fitted.fit(predictors, df[TARGET])
    weights = pd.Series(fitted.feature_importances_, index=predictors.columns)
    above_threshold = weights > 0.01
    selected = weights.index[above_threshold].tolist()
    return selected, predictors[selected]

# ------------------------------------------------------------
# 19. RFE dengan Linear Models (Logistic Regression)
def rfe_linear_selection(df, k_features=10):
    """Recursive feature elimination driven by logistic regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    k_features : int, default 10
        Passed to RFE as ``n_features_to_select``.

    Returns
    -------
    tuple
        ``(selected, frame)``: retained column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    eliminator = RFE(
        estimator=LogisticRegression(max_iter=1000),
        n_features_to_select=k_features,
    )
    eliminator.fit(predictors, df[TARGET])
    kept_mask = eliminator.support_
    selected = predictors.columns[kept_mask].tolist()
    return selected, predictors[selected]

# ------------------------------------------------------------
# 20. RFE dengan Tree-Based Models (Random Forest)
def rfe_tree_selection(df, k_features=10):
    """Recursive feature elimination driven by a random forest.

    The forest uses 100 trees and ``random_state=42``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    k_features : int, default 10
        Passed to RFE as ``n_features_to_select``.

    Returns
    -------
    tuple
        ``(selected, frame)``: retained column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    eliminator = RFE(
        estimator=RandomForestClassifier(n_estimators=100, random_state=42),
        n_features_to_select=k_features,
    )
    eliminator.fit(predictors, df[TARGET])
    kept_mask = eliminator.support_
    selected = predictors.columns[kept_mask].tolist()
    return selected, predictors[selected]

# ------------------------------------------------------------
# 21. RFE dengan SVM
def rfe_svm_selection(df, k_features=10):
    """Recursive feature elimination driven by a linear-kernel SVC.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    k_features : int, default 10
        Passed to RFE as ``n_features_to_select``.

    Returns
    -------
    tuple
        ``(selected, frame)``: retained column names and ``X`` restricted to them.
    """
    predictors = df.drop(TARGET, axis=1)
    eliminator = RFE(
        estimator=SVC(kernel="linear"),
        n_features_to_select=k_features,
    )
    eliminator.fit(predictors, df[TARGET])
    kept_mask = eliminator.support_
    selected = predictors.columns[kept_mask].tolist()
    return selected, predictors[selected]

# ------------------------------------------------------------
# 22. Backward elimination with K-Nearest Neighbors (wrapper-based stand-in for RFE)
def rfe_knn_selection(df, k_features=10):
    """Backward feature elimination scored by a 5-nearest-neighbours classifier.

    scikit-learn's ``RFE`` ranks features through the estimator's ``coef_``
    or ``feature_importances_``. ``KNeighborsClassifier`` exposes neither,
    so ``RFE(...).fit`` raises ``ValueError`` for it. This function instead
    uses mlxtend's sequential selector in backward mode, which only needs a
    cross-validated score. That is the closest equivalent for a model without
    feature weights.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    k_features : int, default 10
        Value passed to the selector's ``k_features`` argument.

    Returns
    -------
    tuple
        ``(selected, frame)``: retained column names and ``X`` restricted to them.
    """
    X = df.drop(TARGET, axis=1)
    y = df[TARGET]
    model = KNeighborsClassifier(n_neighbors=5)
    selector = SFS(model,
                   k_features=k_features,
                   forward=False,
                   floating=False,
                   scoring='accuracy',
                   cv=5)
    selector = selector.fit(X, y)
    selected = list(selector.k_feature_names_)
    return selected, X[selected]

# ------------------------------------------------------------
# 23. RFE dengan Logistic Regression (varian lain)
def rfe_lr_selection(df, k_features=10):
    """Recursive feature elimination with logistic regression (alias).

    The configuration is identical to ``rfe_linear_selection``, so this
    delegates to it rather than keeping a second copy that could drift.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the TARGET column together with the candidate features.
    k_features : int, default 10
        Forwarded to ``rfe_linear_selection``.

    Returns
    -------
    tuple
        ``(selected, frame)`` exactly as returned by ``rfe_linear_selection``.
    """
    return rfe_linear_selection(df, k_features=k_features)



In [6]:
# ------------------------------------------------------------
# Main: run every enabled method and collect the results in a dictionary
methods = {
    "Chi-Square": chi_square_selection,
    "ANOVA": anova_selection,
    "Mutual Information": mutual_info_selection,
  #  "Forward Selection": forward_selection,
  #  "Backward Elimination": backward_elimination,
  # "Stepwise Selection": stepwise_selection,
  #  "Genetic Algorithm": genetic_algorithm_selection,
  #  "Simulated Annealing": simulated_annealing_selection,
    "Particle Swarm": particle_swarm_selection,
    "Ant Colony": ant_colony_selection,
    "Greedy Selection": greedy_selection,
    "Sequential Selection": sequential_selection,
    "Lasso": lasso_selection,
    "Elastic Net": elastic_net_selection,
    "Decision Tree": decision_tree_selection,
    "Random Forest": random_forest_selection,
    "Gradient Boosting": gradient_boosting_selection,
    "RFE (Logistic Regression)": rfe_linear_selection,
    "RFE (Tree-Based)": rfe_tree_selection,
    "RFE (SVM)": rfe_svm_selection,
    "RFE (KNN)": rfe_knn_selection,
    "RFE (LR Variant)": rfe_lr_selection
    # Ridge Regression only reports coefficients, so it is not part of the selection dictionary
}

# method name -> set(selected_features)
results = {}
for method_name, func in methods.items():
    try:
        selected, _ = func(df)
    except Exception as e:
        # Best effort: report the failure and continue with the next method.
        print(f"{method_name} gagal dijalankan: {e}")
    else:
        # Stored as a set so that methods can be compared regardless of order.
        results[method_name] = set(selected)
        print(f"{method_name}: {selected}")

# ------------------------------------------------------------
# Group the methods that produced the same feature set
grouped = {}
for method, features in results.items():
    key = tuple(sorted(features))
    grouped.setdefault(key, []).append(method)

print("\n--- Kelompok Metode dengan Hasil Seleksi Fitur yang Sama ---")
for features, methods_list in grouped.items():
    print(f"Fitur: {list(features)} => Metode: {methods_list}")

# Example: show Ridge Regression (coefficients only)
print("\n--- Ridge Regression (Koefisien) ---")
ridge_coeff = ridge_regression_info(df)
print(ridge_coeff)


Chi-Square: ['age', 'height', 'weight', 'systolic', 'diastolic', 'bmi', 'pulse_pressure', 'map', 'cholesterol', 'gluc', 'smoke', 'active']
ANOVA: ['age', 'height', 'weight', 'systolic', 'diastolic', 'bmi', 'pulse_pressure', 'map', 'cholesterol', 'gluc', 'smoke', 'active']
Mutual Information: ['systolic', 'map', 'pulse_pressure', 'diastolic', 'bmi', 'cholesterol', 'weight', 'age', 'gluc', 'active']


KeyboardInterrupt: 