In [5]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import f1_score
from deap import base, creator, tools, algorithms
import random
import numpy as np
import pandas as pd
import xgboost as xgb  # Import XGBoost
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression model
import lightgbm as lgb  # Import LightGBM
from sklearn.neighbors import KNeighborsClassifier  # Import KNN model
from sklearn.neural_network import MLPClassifier  # Import MLPClassifier

# Set random seed so the random draws made through `random` / `np.random`
# (e.g. the GA's initial population) are repeatable across runs
random.seed(42)
np.random.seed(42)

# ------------------- Step 1: Load Data -------------------

# Load the dataset
data = pd.read_excel(r'AMI-DATA-imputed.xlsx')

# Ensure all columns are numeric; values that cannot be parsed become NaN
data = data.apply(pd.to_numeric, errors='coerce')

# Drop rows where 'labels' is missing. This runs AFTER the numeric coercion so
# that labels which were present but non-numeric (and are NaN now) are removed
# as well, instead of leaking NaN targets into y.
data = data.dropna(subset=['labels'])

# Separate features and labels
X = data.drop(columns=['labels'])
y = data['labels']

# ------------------- Step 3: Define Genetic Algorithm for Feature Selection -------------------
class FeatureSelectionGA:
    """Genetic-algorithm feature selection built on DEAP.

    An individual is a list of 0/1 genes, one per column of ``X``; a gene of 1
    keeps the matching column. The fitness of an individual is the mean
    cross-validated score (``scoring='f1'``) of the chosen classifier trained
    on the kept columns only.

    Args:
        X: Feature table; must expose ``.columns`` and support selection by a
            list of column names (e.g. a pandas DataFrame).
        y: Target labels aligned with the rows of ``X``.
            NOTE(review): ``scoring='f1'`` presumably requires binary labels —
            confirm against the dataset.
        model_name: One of 'XGBoost', 'SVM', 'RandomForest',
            'LogisticRegression', 'LightGBM', 'KNN', 'MLP'.
        num_folds: Number of shuffled K-fold splits used for each evaluation.
        population_size: Number of individuals per generation.
        generations: Number of generations to evolve.
        cx_prob: Probability of mating two individuals.
        mut_prob: Probability of mutating an individual.

    Raises:
        ValueError: If ``model_name`` is not one of the supported names.
    """

    def __init__(self, X, y, model_name, num_folds=3, population_size=20, generations=10, cx_prob=0.8, mut_prob=0.05):
        self.X = X
        self.y = y
        self.num_folds = num_folds
        self.population_size = population_size
        self.generations = generations
        self.cx_prob = cx_prob
        self.mut_prob = mut_prob
        # Fixed random_state: every individual is scored on the same splits,
        # so fitness values are comparable with each other.
        self.kfold = KFold(n_splits=self.num_folds, shuffle=True, random_state=42)

        # Select model based on model name
        if model_name == 'XGBoost':
            self.model = xgb.XGBClassifier(n_estimators=100, random_state=42)
        elif model_name == 'SVM':
            self.model = SVC(kernel='rbf', gamma=0.01, random_state=42)
        elif model_name == 'RandomForest':
            self.model = RandomForestClassifier(n_estimators=50, random_state=42)
        elif model_name == 'LogisticRegression':
            # Large max_iter so the solver has room to converge
            self.model = LogisticRegression(max_iter=5000, random_state=42)
        elif model_name == 'LightGBM':
            self.model = lgb.LGBMClassifier(n_estimators=100, random_state=42)
        elif model_name == 'KNN':
            self.model = KNeighborsClassifier(n_neighbors=5)
        elif model_name == 'MLP':
            self.model = MLPClassifier(hidden_layer_sizes=(10,), max_iter=1000, random_state=42)
        else:
            raise ValueError("Invalid model name")

        # Define DEAP fitness and individual. `creator.create` registers the
        # classes globally on the `creator` module; creating them again would
        # overwrite the existing ones (DEAP warns about it), so only create
        # them when they are not there yet. This keeps repeated instantiation
        # safe without requiring callers to delete the classes by hand.
        if not hasattr(creator, "FitnessMax"):
            creator.create("FitnessMax", base.Fitness, weights=(1.0,))
        if not hasattr(creator, "Individual"):
            creator.create("Individual", list, fitness=creator.FitnessMax)

        n_features = len(X.columns)

        self.toolbox = base.Toolbox()
        # One random 0/1 gene per feature column
        self.toolbox.register("attr_bool", random.randint, 0, 1)
        self.toolbox.register("individual", tools.initRepeat, creator.Individual, self.toolbox.attr_bool, n=n_features)
        self.toolbox.register("population", tools.initRepeat, list, self.toolbox.individual)

        # Selection, crossover, mutation operations
        self.toolbox.register("mate", tools.cxTwoPoint)
        # indpb = 1/n_features, i.e. about one flipped gene per mutated individual
        self.toolbox.register("mutate", tools.mutFlipBit, indpb=1.0 / n_features)
        self.toolbox.register("select", tools.selTournament, tournsize=3)
        self.toolbox.register("evaluate", self.evaluate)

    def evaluate(self, individual):
        """Score one feature mask.

        Args:
            individual: Sequence of 0/1 genes, one per column of ``self.X``.

        Returns:
            A 1-tuple holding the mean cross-validated score
            (``scoring='f1'``) of ``self.model`` on the selected columns, or
            ``(0.0,)`` when the mask selects no column at all.
        """
        selected_features = [feature for feature, bit in zip(self.X.columns, individual) if bit == 1]
        if not selected_features:
            # A model cannot be fitted on zero features: give the worst fitness
            return (0.0,)

        X_selected = self.X[selected_features]

        scores = cross_val_score(self.model, X_selected, self.y, cv=self.kfold, scoring='f1')
        return (scores.mean(),)  # DEAP expects the fitness as a tuple

    def run(self):
        """Run the genetic algorithm and report the best feature subset.

        Evolves the population with ``algorithms.eaSimple``, prints the
        per-generation max/avg fitness (``verbose=True``) followed by the best
        feature subset found.

        Returns:
            List of column names selected by the best individual recorded in
            the hall of fame.
        """
        population = self.toolbox.population(n=self.population_size)
        hof = tools.HallOfFame(1)  # Keeps the single best individual seen
        stats = tools.Statistics(lambda ind: ind.fitness.values)
        stats.register("max", np.max)
        stats.register("avg", np.mean)

        population, logbook = algorithms.eaSimple(population, self.toolbox, cxpb=self.cx_prob, mutpb=self.mut_prob,
                                                  ngen=self.generations, stats=stats, halloffame=hof, verbose=True)

        best_solution = hof[0]
        best_features = [feature for feature, bit in zip(self.X.columns, best_solution) if bit == 1]

        print("\nOptimal feature selection scheme (predefined):")
        print(f"Number of selected features: {len(best_features)}")
        print(f"Selected features: {best_features}")

        return best_features

# Smoke run: execute the GA feature selection once for every supported classifier
models = ['XGBoost', 'SVM', 'RandomForest', 'LogisticRegression', 'LightGBM', 'KNN', 'MLP']
for model_name in models:
    banner = f"------------------- {model_name} -------------------"
    print(banner)
    ga = FeatureSelectionGA(X, y, model_name)
    ga.run()
    # Remove the classes DEAP registered on the `creator` module (if present)
    # so the next iteration starts from a clean slate
    for created_name in ('FitnessMax', 'Individual'):
        vars(creator).pop(created_name, None)


------------------- RandomForest -------------------
Optimal feature selection scheme (predefined):
Number of selected features: 11
Selected features: ['sex', 'Hbp', 'CVD', 'CVD_inheritance', 'MAP', 'LDL-C', 'K', 'NT_proBNP', 'cTnI', 'CK-MB', 'MYO']

------------------- SVM -------------------
Optimal feature selection scheme (predefined):
Number of selected features: 8
Selected features: ['sex', 'CVD', 'Diabetes', 'CVD_inheritance', 'LDL-C', 'glucose', 'NT_proBNP', 'MYO']

------------------- XGBoost -------------------
Optimal feature selection scheme (predefined):
Number of selected features: 12
Selected features: ['sex', 'Hbp', 'CVD', 'heart_rate_odd', 'CVD_inheritance', 'smoking_index', 'glucose', 'K', 'NT_proBNP', 'cTnI', 'CK-MB', 'MYO']

------------------- LightGBM -------------------
Optimal feature selection scheme (predefined):
Number of selected features: 11
Selected features: ['age', 'BMI', 'Ki_level', 'heart_rate_odd', 'CVD_inheritance', 'smoking_index', 'MAP', 'NT_proBN