In [16]:
from Dataset import Dataset
from Evolution import Evolution
from sklearn.tree import DecisionTreeClassifier 
from sklearn.datasets import make_classification
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import logging
from pathlib import Path
import os

# Raise the level of the "imported_module" logger to CRITICAL so that it
# ignores every record of lower severity.
# NOTE(review): "imported_module" looks like a placeholder logger name —
# confirm which library's logger was actually meant to be silenced.
noisy_logger = logging.getLogger("imported_module")
noisy_logger.setLevel(logging.CRITICAL)
#np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)  

# Instructions

In this notebook, feature selection is carried out for the census-income dataset from the UCI repository. The code can be modified to load a different dataset. Four wrapper feature selection methods are used to identify the informative features using a Decision Tree classifier:

* ## CHC$_{QX}$: 
The hyper-parameter choices of CHC$_{QX}$ are based on the paper “Fast Genetic Algorithm For Feature Selection - A Qualitative Approximation Approach”. The values are set to $q=10$ and $f=10$.

* ## PSO$_{QX}$: 
The hyper-parameter choices of PSO$_{QX}$ are based on the paper “Fast Genetic Algorithm For Feature Selection - A Qualitative Approximation Approach”. The values are set to $q=10$ and $f=10$.

* ## CHC: 
The CHC algorithm is implemented according to the paper: “The CHC Adaptive Search Algorithm: How to Have Safe Search When Engaging in Nontraditional Genetic Recombination”. The population size is 50, the diversity parameter is set to $(d = \frac{k}{4})$, where $k$ is the length of the individual (number of features), while the divergence rate is $(div = 0.35)$.

* ## PSO:
The global version of PSO with a topology connecting all particles to one another. The following options are used \{c1: 1.49618, c2: 1.49618, w: 0.7298\}, while the number of particles is set to 50.

In [17]:
# --- Paths -------------------------------------------------------------------
# The data folder is "<parent of the current working directory>/data/".
# `path` is kept as a notebook-level name because the final "Saving results"
# cell builds its output location from it.
path = Path("")
parent_dir = path.parent.absolute().parent.absolute()
work_dir = f"{parent_dir}/data/"

print(work_dir)

# --- Raw file settings -------------------------------------------------------
file_name = 'census-income.data'
sep = ','
label = -1  # used below to index dataset.df.columns; -1 is the last column

# Load without splitting yet (divide_dataset=False); the split happens below.
dataset = Dataset(work_dir + file_name, sep, label, divide_dataset=False, header=None)

# Drop the rows whose label column is missing.
label_column = dataset.df.columns[label]
dataset.df.dropna(subset=[label_column], inplace=True)

# --- Classifier and split ----------------------------------------------------
classifier = DecisionTreeClassifier(random_state=0)

split_options = dict(
    normalize=True,
    shuffle=False,
    all_features=True,
    all_instances=True,
    evaluate=True,
    partial_sample=False,
)
dataset.divide_dataset(classifier, **split_options)

# --- Settings shared by the algorithm sections below -------------------------
task, target_dataset = 'feature_selection', 'validation'
ind_size = dataset.X_train.shape[1]  # number of columns of the training matrix
population_size = 50

# One row per algorithm is appended to this table by the report cells.
results_df = pd.DataFrame(columns=['algorithm', 'time', 'test'])

/opt/git/labs/mestrado/Fast-Genetic-Algorithm-For-Feature-Selection/data/


AttributeError: `np.NaN` was removed in the NumPy 2.0 release. Use `np.nan` instead.

## CHC$_{QX}$

In [33]:
# Run CHC_QX. The call returns the search log and a second value that the
# plain CHC section later hands to Evolution.create_toolbox.
# NOTE(review): the two positional 10s presumably are the q=10 and f=10
# hyper-parameters described in the instructions; the meaning of the 2 is not
# visible from this notebook — confirm against Evolution.CHCqx.
log, baseline_full_data = Evolution.CHCqx(
    dataset,
    10,
    10,
    2,
    population_size,
    verbose=1,
)

Meta-model sample size: 14964
Best Individual =  0.9491 , Gen =  60 

In [35]:
# Report the final CHC_QX solution and append it to the results table.
final_entry = log.iloc[-1]  # last row of the search log
feature_subset = np.array(final_entry['ind'])

elapsed_time = np.round(final_entry['time'], 2)
# Evolution.evaluate is called with target 'test'; only its first returned
# element is used and scaled to a percentage.
test_score = Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0]
accuracy = np.round(100 * test_score, 2)

print()
# The percent sign follows the value ("94.96 %") instead of preceding it.
print(f'Test accuracy: {accuracy} %')
print(f'Solution found in: {elapsed_time} sec')
# Positions where the 0/1 mask equals 1.
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

row = ['CHC$_{QX}$', elapsed_time, accuracy]
results_df.loc[len(results_df)] = row


Test accuracy: % 94.96
Solution found in:  55.46 sec
Selected features indexes:  [12 16 17 19 40]


## PSO$_{QX}$

In [37]:
# Options handed to Evolution.PSOqx: c1, c2 and w are the values listed in the
# instructions; 'k' is set to the population size and 'p' to 2.
# NOTE(review): the exact meaning of 'k' and 'p' depends on the PSO backend
# used inside Evolution — confirm there.
options = dict(c1=1.49618, c2=1.49618, w=0.7298, k=population_size, p=2)

# NOTE(review): the two positional 10s presumably are q=10 and f=10 from the
# instructions; the meaning of the 2 is not visible here — confirm against
# Evolution.PSOqx.
log, baseline_full_data = Evolution.PSOqx(
    dataset,
    options,
    10,
    10,
    2,
    population_size,
    verbose=1,
)

Meta-model sample size: 29928
Best Individual =  0.9492 , Step =  10 

In [38]:
# Report the final PSO_QX solution and append it to the results table.
final_entry = log.iloc[-1]  # last row of the search log
feature_subset = np.array(final_entry['ind'])

elapsed_time = np.round(final_entry['time'], 2)
# Evolution.evaluate is called with target 'test'; only its first returned
# element is used and scaled to a percentage.
test_score = Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0]
accuracy = np.round(100 * test_score, 2)

print()
# The percent sign follows the value ("95.09 %") instead of preceding it.
print(f'Test accuracy: {accuracy} %')
print(f'Solution found in: {elapsed_time} sec')
# Positions where the 0/1 mask equals 1.
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

row = ['PSO$_{QX}$', elapsed_time, accuracy]
results_df.loc[len(results_df)] = row


Test accuracy: % 95.09
Solution found in:  33.95 sec
Selected features indexes:  [ 3 12 14 16 17 22 23 31]


## CHC

In [39]:
# Plain CHC run.
# Individual length = number of columns of the training matrix.
ind_size = dataset.X_train.shape[1]

# The toolbox is built with `baseline_full_data`, i.e. the value returned by
# the most recently executed *QX cell.
toolbox = Evolution.create_toolbox(
    task,
    target_dataset,
    dataset,
    baseline_full_data,
)
population = Evolution.create_population(population_size, ind_size)

# Diversity parameter d = k / 4 (integer division), k being the individual length.
d = ind_size // 4

# The call returns the log, the final population and an updated d; the latter
# two overwrite the notebook-level names.
log, population, d = Evolution.CHC(
    dataset,
    toolbox,
    d,
    population,
    verbose=1,
    max_no_change=10,
)

Best Individual =  0.9491 , Gen =  31 

In [40]:
# Report the final CHC solution and append it to the results table.
# Unlike the other algorithms, the CHC log keeps the mask under 'best_solution'.
final_entry = log.iloc[-1]  # last row of the search log
feature_subset = np.array(final_entry['best_solution'])

elapsed_time = np.round(final_entry['time'], 2)
# Evolution.evaluate is called with target 'test'; only its first returned
# element is used and scaled to a percentage.
test_score = Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0]
accuracy = np.round(100 * test_score, 2)

print()
# The percent sign follows the value ("94.93 %") instead of preceding it.
print(f'Test accuracy: {accuracy} %')
print(f'Solution found in: {elapsed_time} sec')
# Positions where the 0/1 mask equals 1.
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

row = ['CHC', elapsed_time, accuracy]
results_df.loc[len(results_df)] = row


Test accuracy: % 94.93
Solution found in:  119.52 sec
Selected features indexes:  [10 12 16 17]


## PSO

In [41]:
# Options handed to Evolution.PSO: c1, c2 and w are the values listed in the
# instructions; 'k' is set to the population size and 'p' to 2.
# NOTE(review): the exact meaning of 'k' and 'p' depends on the PSO backend
# used inside Evolution — confirm there.
options = dict(c1=1.49618, c2=1.49618, w=0.7298, k=population_size, p=2)

log = Evolution.PSO(
    dataset,
    options,
    population_size,
    steps_no_change=10,
    verbose=1,
)

Best Individual =  -0.9491 , Step =  33 

In [42]:
# Report the final PSO solution and append it to the results table.
final_entry = log.iloc[-1]  # last row of the search log
feature_subset = np.array(final_entry['ind'])

elapsed_time = np.round(final_entry['time'], 2)
# Evolution.evaluate is called with target 'test'; only its first returned
# element is used and scaled to a percentage.
test_score = Evolution.evaluate(feature_subset, 'feature_selection', 'test', dataset)[0]
accuracy = np.round(100 * test_score, 2)

print()
# The percent sign follows the value ("94.91 %") instead of preceding it.
print(f'Test accuracy: {accuracy} %')
print(f'Solution found in: {elapsed_time} sec')
# Positions where the 0/1 mask equals 1.
print('Selected features indexes: ', np.where(feature_subset == 1)[0])

row = ['PSO', elapsed_time, accuracy]
results_df.loc[len(results_df)] = row


Test accuracy: % 94.91
Solution found in:  230.08 sec
Selected features indexes:  [ 6 10 12 16 17 19 29 31 40]


# Saving results

In [21]:
# Write the collected results to "<parent of the current working directory>/results/results.csv".
results_dir = path.parent.absolute().parent.absolute() / 'results'
# DataFrame.to_csv does not create missing folders, so make sure the target
# directory exists before writing.
results_dir.mkdir(parents=True, exist_ok=True)
results_df.to_csv(str(results_dir / 'results.csv'))