In [1]:
!pip install deap pyswarms matplotlib pandas seaborn numpy scikit-learn

Collecting deap
  Downloading deap-1.4.1.tar.gz (1.1 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
[?25hCollecting pyswarms
  Downloading pyswarms-1.3.0-py2.py3-none-any.whl.metadata (33 kB)
Collecting matplotlib
  Downloading matplotlib-3.10.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting pandas
  Downloading pandas-2.2.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (89 kB)
Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl.metadata (5.4 kB)
Collecting numpy
  Downloading numpy-2.2.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.6.0-cp312-cp312-manylinux_2_17_x86_64.man

In [1]:
from Dataset import Dataset
from Evolution import Evolution
from sklearn.tree import DecisionTreeClassifier 
from sklearn.datasets import make_classification
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import logging
from pathlib import Path
import os

# Raise the threshold of the logger registered under the name "imported_module"
# so that only CRITICAL records pass through it.
# NOTE(review): "imported_module" looks like a placeholder; confirm it matches
# the logger name used by the module whose output should be silenced.
noisy_logger = logging.getLogger("imported_module")
noisy_logger.setLevel(logging.CRITICAL)

# Disabled: filter for NumPy's VisibleDeprecationWarning.
# NOTE(review): recent NumPy releases appear to no longer expose `np.warnings`,
# so this line would need rewriting before being re-enabled; verify.
#np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)

# Instructions

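In this notebook, feature selection can be carried out for the census-income dataset from the UCI repository (note that the data-loading cell below currently points at `diabetes_012_health_indicators_BRFSS2015.csv`; the census-income line is commented out). This code can be modified to load a different dataset. Four wrapper feature selection methods are used to identify the informative features using a Decision Tree classifier: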

* ## CHC$_{QX}$: 
The hyper-parameter choices of CHC$_{QX}$ are based on the paper “Fast Genetic Algorithm For Feature Selection - A Qualitative Approximation Approach”. The values are set to $q=10$ and $f=10$.

* ## PSO$_{QX}$: 
The hyper-parameter choices of PSO$_{QX}$ are based on the paper “Fast Genetic Algorithm For Feature Selection - A Qualitative Approximation Approach”. The values are set to $q=10$ and $f=10$.

* ## CHC: 
The CHC algorithm is implemented according to the paper: “The CHC Adaptive Search Algorithm: How to Have Safe Search When Engaging in Nontraditional Genetic Recombination”. The population size is 50, the diversity parameter is set to $(d = \frac{k}{4})$, where $k$ is the length of the individual (number of features), and the divergence rate is $(div = 0.35)$.

* ## PSO:
The global version of PSO with a topology connecting all particles to one another. The following options are used \{c1: 1.49618, c2: 1.49618, w: 0.7298\}, while the number of particles is set to 50.

In [3]:
# Data directory: "<parent of the current working directory>/data/".
# Path("") is the relative current directory; .absolute() anchors it at the cwd.
path = Path("")
work_dir = f"{path.parent.absolute().parent.absolute()}/data/"
print(work_dir)

# Dataset selection: file name, column separator and label column index.
# `label` is used below as a positional index into dataset.df.columns, so -1
# selects the last column. Swap the active assignment to switch datasets.
#file_name, sep, label = 'census-income.data', ',', -1
file_name = 'diabetes_012_health_indicators_BRFSS2015.csv'
sep = ','
label = -1

# NOTE(review): header=None is passed through to Dataset; if the CSV has a
# header line it may be read as a data row. Confirm this, and confirm that the
# last column is the intended target for this file.
dataset = Dataset(
    work_dir + file_name,
    sep,
    label,
    divide_dataset=False,
    header=None,
)

# Discard rows whose label value is missing (mutates dataset.df in place).
label_column = dataset.df.columns[label]
dataset.df.dropna(subset=[label_column], inplace=True)

# Fixed random_state so the decision tree is reproducible across runs.
classifier = DecisionTreeClassifier(random_state=0)

dataset.divide_dataset(
    classifier,
    normalize=True,
    shuffle=False,
    all_features=True,
    all_instances=True,
    evaluate=True,
    partial_sample=False,
)

# Settings shared by the algorithm cells below.
task = 'feature_selection'
target_dataset = 'validation'
population_size = 50
ind_size = dataset.X_train.shape[1]  # one position per column of X_train

# One row per algorithm is added to this table by the result cells.
results_df = pd.DataFrame(columns=['algorithm', 'time', 'test'])

/mnt/U01/git/labs/mestrado/genetic-algorithm-feature-selection/data/


  df = pd.read_csv(file_path, header=header, sep=',')


## CHC$_{QX}$

In [4]:
# Run CHC_QX; the call returns the run log and `baseline_full_data`.
# NOTE(review): the text above quotes q=10 and f=10; confirm against
# Evolution.CHCqx which positional argument is which and what the 2 stands for.
log, baseline_full_data = Evolution.CHCqx(
    dataset,
    10, 10, 2,
    population_size,
    verbose=1,
)

Meta-model sample size: 19026
Best Individual =  0.3737 , Gen =  20 

In [5]:
# Read the solution and the elapsed time from the last row of the run log.
final_entry = log.iloc[-1]
feature_subset = np.array(final_entry['ind'])

elapsed_time = np.round(final_entry['time'], 2)
# First element of the evaluation on the 'test' target, scaled by 100 and rounded.
accuracy = np.round(100*Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0], 2)

print()
print('Test accuracy: %', accuracy)
print('Solution found in: ', elapsed_time, 'sec')
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

# Record the result. Any row already stored for this algorithm is dropped first,
# so re-running this cell replaces the row instead of appending a duplicate.
algorithm_name = 'CHC$_{QX}$'
row = [algorithm_name, elapsed_time, accuracy]
results_df = results_df[results_df['algorithm'] != algorithm_name].reset_index(drop=True)
results_df.loc[len(results_df)] = row


Test accuracy: % 37.3
Solution found in:  34.72 sec
Selected features indexes:  [11 12 13 17 19 20]


## PSO$_{QX}$

In [6]:
# Hyper-parameters handed to Evolution.PSOqx.
# NOTE(review): 'k' and 'p' look like pyswarms neighbourhood settings; confirm
# how Evolution.PSOqx uses them.
options = dict(c1=1.49618, c2=1.49618, w=0.7298, k=population_size, p=2)

# Run PSO_QX; the call returns the run log and `baseline_full_data`.
# NOTE(review): confirm against Evolution.PSOqx what the positional 10, 10, 2 mean
# (the text above quotes q=10 and f=10).
log, baseline_full_data = Evolution.PSOqx(
    dataset,
    options,
    10, 10, 2,
    population_size,
    verbose=1,
)

Meta-model sample size: 76104
Best Individual =  0.3728 , Step =  20 

In [7]:
# Read the solution and the elapsed time from the last row of the run log.
final_entry = log.iloc[-1]
feature_subset = np.array(final_entry['ind'])

elapsed_time = np.round(final_entry['time'], 2)
# First element of the evaluation on the 'test' target, scaled by 100 and rounded.
accuracy = np.round(100*Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0], 2)

print()
print('Test accuracy: %', accuracy)
print('Solution found in: ', elapsed_time, 'sec')
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

# Record the result. Any row already stored for this algorithm is dropped first,
# so re-running this cell replaces the row instead of appending a duplicate.
algorithm_name = 'PSO$_{QX}$'
row = [algorithm_name, elapsed_time, accuracy]
results_df = results_df[results_df['algorithm'] != algorithm_name].reset_index(drop=True)
results_df.loc[len(results_df)] = row


Test accuracy: % 37.2
Solution found in:  91.08 sec
Selected features indexes:  [ 6 10 12 13 17 19 20]


## CHC

In [8]:
# Individual length = number of columns of the training matrix.
ind_size = dataset.X_train.shape[1]
# NOTE: `baseline_full_data` is not defined in this cell; it is whatever the most
# recently executed QX cell (CHCqx / PSOqx) returned, so one of them must run first.
toolbox = Evolution.create_toolbox(task, target_dataset, dataset, baseline_full_data)
population = Evolution.create_population(population_size, ind_size)
# Initial value of d: a quarter of the individual length (integer division),
# matching d = k/4 from the text above.
d = ind_size // 4
# The call returns the run log plus an updated population and d, which overwrite
# the variables above. NOTE(review): max_no_change=10 is presumably the
# stagnation-based stopping criterion; confirm in Evolution.CHC.
log, population, d = Evolution.CHC(dataset, toolbox, d, population, verbose=1, max_no_change=10)

Best Individual =  0.949 , Gen =  22 

In [9]:
# Read the solution and the elapsed time from the last row of the run log.
# The CHC log stores the solution under 'best_solution' (not 'ind').
final_entry = log.iloc[-1]
feature_subset = np.array(final_entry['best_solution'])

elapsed_time = np.round(final_entry['time'], 2)
# First element of the evaluation on the 'test' target, scaled by 100 and rounded.
accuracy = np.round(100*Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0], 2)

print()
print('Test accuracy: %', accuracy)
print('Solution found in: ', elapsed_time, 'sec')
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

# Record the result. Any row already stored for this algorithm is dropped first,
# so re-running this cell replaces the row instead of appending a duplicate.
algorithm_name = 'CHC'
row = [algorithm_name, elapsed_time, accuracy]
results_df = results_df[results_df['algorithm'] != algorithm_name].reset_index(drop=True)
results_df.loc[len(results_df)] = row


Test accuracy: % 94.94
Solution found in:  94.59 sec
Selected features indexes:  [12 16 17]


## PSO

In [10]:
# Hyper-parameters handed to Evolution.PSO.
# NOTE(review): 'k' and 'p' look like pyswarms neighbourhood settings; confirm
# how Evolution.PSO uses them.
options = dict(c1=1.49618, c2=1.49618, w=0.7298, k=population_size, p=2)

# Run PSO; only the run log is returned here.
log = Evolution.PSO(
    dataset,
    options,
    population_size,
    steps_no_change=10,
    verbose=1,
)

Best Individual =  -0.949 , Step =  29  

In [11]:
# Read the solution and the elapsed time from the last row of the run log.
final_entry = log.iloc[-1]
feature_subset = np.array(final_entry['ind'])

elapsed_time = np.round(final_entry['time'], 2)
# First element of the evaluation on the 'test' target, scaled by 100 and rounded.
accuracy = np.round(100*Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0], 2)

print()
print('Test accuracy: %', accuracy)
print('Solution found in: ', elapsed_time, 'sec')
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

# Record the result. Any row already stored for this algorithm is dropped first,
# so re-running this cell replaces the row instead of appending a duplicate.
algorithm_name = 'PSO'
row = [algorithm_name, elapsed_time, accuracy]
results_df = results_df[results_df['algorithm'] != algorithm_name].reset_index(drop=True)
results_df.loc[len(results_df)] = row


Test accuracy: % 94.95
Solution found in:  223.87 sec
Selected features indexes:  [ 6 12 16 17 19 30 31]


# Saving results

In [None]:
# Writing the table to disk is currently disabled; un-comment the next line to
# save it as "<parent of cwd>/results/results.csv".
#results_df.to_csv(str(path.parent.absolute().parent.absolute())+'/results/results.csv')

# Show the first rows of the collected results (columns: algorithm, time, test).
results_df.head()

Unnamed: 0,algorithm,time,test
0,CHC$_{QX}$,32.05,94.94
1,PSO$_{QX}$,23.69,94.94
2,CHC,94.59,94.94
3,PSO,223.87,94.95
