# Feature Selection - Wrapper Approach using a Genetic Algorithm
In this notebook we implement a simple feature selection procedure that follows a wrapper approach: the search algorithm (a genetic algorithm in this case) is wrapped around the target classification/regression algorithm, which is used to score each candidate feature subset.

In [1]:
# install the evolutionary computation library
!pip install deap




In [2]:
conda install -c conda-forge/label/cf202003 deap


Collecting package metadata (current_repodata.json): ...working... done
Note: you may need to restart the kernel to use updated packages.

Solving environment: ...working... done

## Package Plan ##

  environment location: C:\ProgramData\Anaconda3

  added / updated specs:
    - deap


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    conda-4.9.2                |   py38haa95532_0         2.9 MB
    deap-1.3.1                 |   py38he350917_0         157 KB  conda-forge/label/cf202003
    ------------------------------------------------------------
                                           Total:         3.0 MB

The following NEW packages will be INSTALLED:

  deap               conda-forge/label/cf202003/win-64::deap-1.3.1-py38he350917_0

The following packages will be UPDATED:

  conda                                        4.8.5-py38_0 --> 4.9.2-py38haa95532_0



Downloading and Extr

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random

from sklearn import datasets
from sklearn import linear_model
from sklearn import naive_bayes

from deap import algorithms
from deap import base
from deap import creator
from deap import tools

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [4]:
# NOTE(review): datasets.load_boston was deprecated in scikit-learn 1.0 and
# removed in 1.2; this cell needs scikit-learn < 1.2 (or a different
# regression dataset) to run — confirm against the installed version.
data = datasets.load_boston()

X = data["data"]
y = data["target"]

number_of_variables = X.shape[1]
input_variables = data.feature_names
target_variable = 'MEDV'

# Seed both random number generators used in this notebook: NumPy's global
# generator and Python's `random` module. The genetic algorithm's attribute
# generator is random.randint, so without seeding `random` the evolutionary
# runs differ on every execution.
seed = 1234
np.random.seed(seed)
random.seed(seed)

# 10-fold shuffled splitter shared by every fitness evaluation
kfolds = KFold(10,shuffle=True,random_state=seed)

# let's create also a pandas data frame: one column per input variable,
# followed by the target as the last column
df = pd.DataFrame(data.data, columns=data.feature_names)
df[target_variable] = y

# Last expression of the cell so that the preview is actually displayed
df.head()


In [5]:
def EvaluateFeatureSubsetSingleObjective(individual):
    '''Fitness of a feature subset for the single-objective search.

    Parameters
    ----------
    individual : sequence of alleles
        Position ``i`` equal to 1 selects column ``df.columns[i]``.

    Returns
    -------
    tuple
        One-element tuple holding the mean ``cross_val_score`` of a
        ``LinearRegression`` fitted on the selected columns of ``df`` against
        ``y``, using the shared ``kfolds`` splitter. An individual that
        selects no column gets ``0.0`` and no model is fitted.
    '''
    selected_positions = [i for i, allele in enumerate(individual) if allele == 1]

    # An all-zero individual would hand an empty feature frame to the
    # regressor, which cannot be fitted. Score it with the same zero
    # performance that EvaluateFeatureSubsetMultipleObjective uses for this case.
    if not selected_positions:
        return 0.0,

    selected_columns = [df.columns[i] for i in selected_positions]

    model = linear_model.LinearRegression()
    scores = cross_val_score(model, df[selected_columns], y, cv=kfolds)
    # Trailing comma: the fitness is returned as a one-element tuple
    return scores.mean(),

## Simple Genetic Algorithm
It looks for the feature subset that maximizes the overall performance.

In [6]:
# Fitness with a single objective whose weight is positive (+1.0)
creator.create("FitnessMax", base.Fitness, weights=(1.0,))
# An individual is a plain list that carries a FitnessMax attribute
creator.create("Individual", list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()

# Gene generator: each call yields a random integer in {0, 1}
toolbox.register("attr_bool", random.randint, 0, 1)

# Initializers: an individual repeats the gene generator once per input
# variable; a population is a list of such individuals
toolbox.register(
    "individual",
    tools.initRepeat,
    creator.Individual,
    toolbox.attr_bool,
    number_of_variables,
)
toolbox.register(
    "population",
    tools.initRepeat,
    list,
    toolbox.individual,
)

In [7]:
# Fitness function and genetic operators for the single-objective search.
# The "individual" initializer is already registered in the cell that creates
# this toolbox, so it is not registered a second time here.
toolbox.register("evaluate", EvaluateFeatureSubsetSingleObjective)
# Two-point crossover between pairs of individuals
toolbox.register("mate", tools.cxTwoPoint)
# Bit-flip mutation with a per-gene probability of 0.05
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
# Tournament selection with tournaments of size 3
toolbox.register("select", tools.selTournament, tournsize=3)

In [8]:
# Start from 100 random individuals; the hall of fame (size 1) tracks the
# best individual found over the whole run.
pop = toolbox.population(n=100)
hof = tools.HallOfFame(1)

# Statistics computed on the fitness values and logged for each generation
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("std", np.std)
stats.register("min", np.min)
stats.register("max", np.max)

pop, log = algorithms.eaSimple(pop, toolbox, cxpb=0.5, mutpb=0.2, ngen=40, stats=stats, halloffame=hof, verbose=True)

# Report the outcome of the search: the hall of fame was collected above but
# is otherwise never shown. Alleles equal to 1 are mapped back to the names
# of the input variables.
best_individual = hof[0]
best_features = [name for name, allele in zip(input_variables, best_individual) if allele == 1]
print("BEST " + str(best_individual))
print("Score: " + str(best_individual.fitness.values[0]))
print("Selected features (" + str(len(best_features)) + "): " + str(best_features))

gen	nevals	avg     	std     	min     	max     
0  	100   	0.508149	0.128339	0.191513	0.673682
1  	64    	0.59219 	0.0706103	0.34377 	0.673847
2  	55    	0.641121	0.0277252	0.561777	0.67879 
3  	51    	0.65754 	0.0185759	0.583299	0.680314
4  	57    	0.668995	0.00890741	0.631287	0.680314
5  	53    	0.674252	0.00925815	0.620726	0.686291
6  	65    	0.67493 	0.0130646 	0.621156	0.686291
7  	64    	0.680429	0.00785544	0.609939	0.686291
8  	54    	0.678795	0.0292791 	0.400361	0.686291
9  	70    	0.683179	0.00841339	0.625077	0.686291
10 	60    	0.683066	0.0121621 	0.594381	0.686291
11 	61    	0.684648	0.0081441 	0.61548 	0.686291
12 	71    	0.682693	0.0121567 	0.604922	0.686291
13 	50    	0.684461	0.00913979	0.61548 	0.686291
14 	60    	0.684789	0.00681876	0.640342	0.686291
15 	59    	0.684689	0.00676871	0.645059	0.686291
16 	52    	0.685605	0.00396661	0.648256	0.686291
17 	60    	0.684907	0.00673871	0.644976	0.686291
18 	61    	0.6836  	0.0147637 	0.573584	0.686291
19 	69    	0.681009	0.01868

## Multi-objective Version
It applies a multi-objective genetic algorithm that tries to maximize the performance while minimizing the number of features involved.

In [8]:
def EvaluateFeatureSubsetMultipleObjective(individual):
    '''Score a feature subset on two objectives.

    Returns a pair ``(performance, size)``. When at least one allele equals 1,
    ``performance`` is the mean cross-validated score of a linear regression
    fitted on the selected columns of ``df`` and ``size`` is the sum of the
    alleles divided by the length of the individual (the fraction of selected
    features when alleles are 0/1). When no allele equals 1 the pair is
    ``(0, len(individual))`` and no model is fitted.
    '''
    chosen = [df.columns[pos] for pos, gene in enumerate(individual) if gene == 1]

    # Guard clause: nothing selected, so there is nothing to cross-validate
    if not chosen:
        return 0, len(individual)

    regressor = linear_model.LinearRegression()
    fold_scores = cross_val_score(regressor, df[chosen], y, cv=kfolds)
    selected_fraction = sum(individual) / float(len(individual))
    return fold_scores.mean(), selected_fraction

In [9]:
# Two objectives: the first is maximised (weight 1.0), the second is
# minimised (weight -1.0)
creator.create("FitnessMulti", base.Fitness, weights=(1.0, -1.0))
# Use a dedicated class name. Re-creating "Individual" here would overwrite
# the class built for the single-objective run (DEAP emits a warning when a
# creator class is redefined).
creator.create("IndividualMulti", list, fitness=creator.FitnessMulti)

toolbox = base.Toolbox()
# Attribute generator: a random integer in {0, 1} per gene
toolbox.register("attr_bool", random.randint, 0, 1)
# Structure initializers: one gene per input variable, population as a list
toolbox.register("individual", tools.initRepeat, creator.IndividualMulti, toolbox.attr_bool, number_of_variables)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Operator registering
toolbox.register("evaluate", EvaluateFeatureSubsetMultipleObjective)
toolbox.register("mate", tools.cxUniform, indpb=0.1)
toolbox.register("mutate", tools.mutFlipBit, indpb=0.05)
# NSGA-II selection for the multi-objective search
toolbox.register("select", tools.selNSGA2)



In [10]:
# Re-seed Python's `random` module (used by the random.randint attribute
# generator) with the notebook-wide seed so that this run is repeatable.
random.seed(seed)

# MU individuals survive each generation; LAMBDA offspring are produced
MU, LAMBDA = 100, 200
pop = toolbox.population(n=MU)
# Archive of the non-dominated individuals found during the run
hof = tools.ParetoFront()

# axis=0 keeps the two objectives separate in the per-generation statistics
stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean, axis=0)
stats.register("std", np.std, axis=0)
stats.register("min", np.min, axis=0)
stats.register("max", np.max, axis=0)

pop, logbook = algorithms.eaMuPlusLambda(pop, toolbox, mu=MU, lambda_=LAMBDA,
                                         cxpb=0.7, mutpb=0.3, ngen=40, 
                                         stats=stats, halloffame=hof)

# hof[0] is only the first entry of the front, not an overall winner
print("BEST "+str(hof[0]))

# A Pareto front is a set of trade-offs, so list every non-dominated solution
# with its two objective values and the names of the selected input variables.
for solution in hof:
    score, feature_fraction = solution.fitness.values
    features = [name for name, allele in zip(input_variables, solution) if allele == 1]
    print("score=%.4f  fraction=%.2f  features=%s" % (score, feature_fraction, features))


gen	nevals	avg                    	std                    	min                    	max                    
0  	100   	[0.53352882 0.52153846]	[0.11126337 0.13785337]	[0.22216453 0.15384615]	[0.67384659 0.84615385]
1  	200   	[0.5985008  0.48846154]	[0.07881769 0.16583124]	[0.21474826 0.15384615]	[0.67384659 0.92307692]
2  	200   	[0.61645502 0.48      ]	[0.07371555 0.19095524]	[0.22539109 0.15384615]	[0.67384659 0.92307692]
3  	200   	[0.62589405 0.49230769]	[0.07872359 0.23204774]	[0.21082521 0.07692308]	[0.67596593 0.92307692]
4  	200   	[0.59025659 0.36692308]	[0.10562454 0.22789362]	[0.21082521 0.07692308]	[0.68031402 0.92307692]
5  	200   	[0.61461188 0.38692308]	[0.09938336 0.21330174]	[0.12079571 0.07692308]	[0.68201157 1.        ]
6  	200   	[0.60246301 0.35538462]	[0.1354925  0.19844961]	[0.19012883 0.07692308]	[0.68331854 1.        ]
7  	200   	[0.63422572 0.34153846]	[0.05473387 0.17004002]	[0.4993403  0.07692308]	[0.68331854 0.92307692]
8  	200   	[0.59140027 0.23692308]	[0

## Discussion
Note that we applied the genetic algorithm using the entire dataset to evaluate each feature subset. In a real scenario we should first split the data into training and test sets, run the genetic algorithm using only the training set, and then assess the selected feature subset on the held-out test set.