# Bloque 1 — Feature Selection con GA
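Selecciona un subconjunto de características con un algoritmo genético (GA): **DEAP** implementa la evolución y cada individuo se evalúa con la accuracy de **LogisticRegression** en validación cruzada (CV), menos una penalización por el número de características seleccionadas.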


In [None]:
from utils.data import load_data
from utils.ga_tools import set_seed, cv_score, penalty_k
import numpy as np
import random, os, joblib, matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from deap import base, creator, tools

# Fix the seed through the project helper before any random draw happens.
# NOTE(review): presumably seeds both `random` and NumPy — confirm in utils.ga_tools.
set_seed(42)
# load_data() is a project helper; its five return values are unpacked here.
X_train, X_test, y_train, y_test, feature_names = load_data()
# Column count of the training matrix; used below as the length of each individual.
n_features = X_train.shape[1]
# Bare expression: displayed as the output of this notebook cell.
n_features

## Configuración del GA

In [None]:
# Hyperparameters of the genetic algorithm, read by the cells below.
# Number of individuals requested from toolbox.population().
POP_SIZE = 50
# Number of iterations of the evolution loop.
N_GEN = 50
# Probability that a pair of offspring is passed to the crossover operator.
CX_PB = 0.8
# Probability that an offspring is passed to the mutation operator.
MUT_PB = 0.2
# Value given to tools.mutFlipBit as its `indpb` argument.
INDPB = 0.05
# Value given to tools.selTournament as its `tournsize` argument.
TOUR_SIZE = 3
# Second argument of penalty_k(k, ALPHA), which is subtracted from the accuracy.
ALPHA = 0.005
# Number of best individuals carried over unchanged each generation;
# also used as the HallOfFame size.
ELITISM = 1

def make_estimator():
    """Build a fresh, unfitted classifier for one evaluation.

    Returns:
        A new ``LogisticRegression`` configured with the ``lbfgs`` solver and
        an iteration cap of 2000.
    """
    settings = {'solver': 'lbfgs', 'max_iter': 2000}
    return LogisticRegression(**settings)

def ensure_one_active(individual):
    """Repair an all-zero individual in place by switching on one random gene.

    Individuals whose genes sum to a non-zero value are left untouched and no
    random number is drawn for them.

    Args:
        individual: Mutable sequence of genes; modified in place.

    Returns:
        None.
    """
    if sum(individual) != 0:
        return
    position = random.randrange(len(individual))
    individual[position] = 1


## DEAP: espacio de búsqueda y fitness

In [None]:
# Create the DEAP classes only once. creator.create() stores each class on the
# `creator` module, and calling it again with an existing name (for example when
# this cell is re-run) makes DEAP emit a RuntimeWarning and overwrite the class.
if not hasattr(creator, 'FitnessMax'):
    # Single-objective fitness; the positive weight means it is maximised.
    creator.create('FitnessMax', base.Fitness, weights=(1.0,))
if not hasattr(creator, 'Individual'):
    # An individual is a plain list of genes carrying a FitnessMax instance.
    creator.create('Individual', list, fitness=creator.FitnessMax)

toolbox = base.Toolbox()
# Gene generator: a random integer that is either 0 or 1.
toolbox.register('attr_bool', random.randint, 0, 1)
# Individual generator: n_features random genes, one per column of X_train.
toolbox.register('individual', tools.initRepeat, creator.Individual, toolbox.attr_bool, n=n_features)
# Population generator: a list of individuals; the size is given at call time.
toolbox.register('population', tools.initRepeat, list, toolbox.individual)

def eval_individual(individual):
    """Score one feature mask: cross-validation score minus a size penalty.

    The individual is first repaired in place so that at least one gene is
    active, then used as a boolean column mask over ``X_train``.

    Args:
        individual: Sequence of 0/1 genes, one per column of ``X_train``.

    Returns:
        A one-element tuple ``(fitness,)`` where ``fitness`` is the value
        returned by ``cv_score`` minus ``penalty_k(k, ALPHA)``, with ``k`` the
        number of active genes.
    """
    ensure_one_active(individual)
    selected = np.array(individual, dtype=bool)
    n_selected = int(selected.sum())
    # cv_score / penalty_k are project helpers from utils.ga_tools.
    # NOTE(review): presumably 5-fold CV accuracy and a penalty growing with k — confirm.
    accuracy = cv_score(
        make_estimator(),
        X_train[:, selected],
        y_train,
        cv=5,
        scoring='accuracy',
        random_state=42,
    )
    return (accuracy - penalty_k(n_selected, ALPHA),)

# Wire the fitness function and the genetic operators into the toolbox.
toolbox.register('evaluate', eval_individual)
# Crossover operator: DEAP's cxOnePoint.
toolbox.register('mate', tools.cxOnePoint)
# Mutation operator: DEAP's mutFlipBit, with INDPB as its `indpb` argument.
toolbox.register('mutate', tools.mutFlipBit, indpb=INDPB)
# Selection operator: DEAP's selTournament, with TOUR_SIZE as its `tournsize` argument.
toolbox.register('select', tools.selTournament, tournsize=TOUR_SIZE)


## Evolución

In [None]:
# Initial population of POP_SIZE randomly generated individuals.
pop = toolbox.population(n=POP_SIZE)
# Hall of fame sized by ELITISM.
# NOTE(review): nothing in the visible code calls hof.update(), so it appears
# to stay empty — confirm whether it is used elsewhere or can be removed.
hof = tools.HallOfFame(ELITISM)

def pop_stats(p):
    """Summarise the fitness of a population.

    Args:
        p: Iterable of individuals whose ``fitness.values[0]`` is set.

    Returns:
        Tuple ``(mean, minimum, maximum)`` of the first fitness value of every
        individual, each converted to a built-in ``float``.
    """
    scores = np.array([member.fitness.values[0] for member in p])
    return float(scores.mean()), float(scores.min()), float(scores.max())

# Score every member of the initial population before evolution starts.
for member in pop:
    member.fitness.values = toolbox.evaluate(member)

# One record per generation with the keys 'gen', 'avg', 'min' and 'max'.
log = []
for gen in range(N_GEN):
    # Elitism: keep references to the current best individuals so they enter
    # the next generation untouched.
    elites = tools.selBest(pop, ELITISM) if ELITISM > 0 else []

    # Select parents for the remaining slots and clone them, so the operators
    # below never modify members of the current population.
    offspring = [
        toolbox.clone(parent)
        for parent in toolbox.select(pop, len(pop) - ELITISM)
    ]

    # Crossover on consecutive pairs (0, 1), (2, 3), ...; with an odd count the
    # last individual stays unpaired and no random number is drawn for it.
    for first, second in zip(offspring[::2], offspring[1::2]):
        if random.random() < CX_PB:
            toolbox.mate(first, second)
            # Both children changed, so their stored fitness is stale.
            del first.fitness.values
            del second.fitness.values

    # Mutation: one random draw per offspring decides whether it is mutated.
    for child in offspring:
        if random.random() < MUT_PB:
            toolbox.mutate(child)
            del child.fitness.values

    # Re-evaluate only the offspring whose fitness was invalidated above.
    invalid = [ind for ind in offspring if not ind.fitness.valid]
    for ind in invalid:
        ind.fitness.values = toolbox.evaluate(ind)

    # Next generation: elites first, then the new offspring.
    pop = elites + offspring
    avg, mn, mx = pop_stats(pop)
    log.append({'gen': gen+1, 'avg': avg, 'min': mn, 'max': mx})

# Best individual of the final population according to tools.selBest.
best = tools.selBest(pop, 1)[0]
# Boolean mask built from the genes: True wherever the gene is non-zero.
best_mask = np.array(best, dtype=bool)
# Number of True entries in the mask, i.e. how many features were selected.
best_k = int(best_mask.sum())
# First (and only registered) fitness value of the best individual.
best_fitness = float(best.fitness.values[0])
# Bare expression: displayed as the output of this notebook cell.
best_k, best_fitness

## Resultados y comparación

In [None]:
# Compare the cross-validation score obtained with all features against the
# score obtained with the GA-selected subset, both computed on the training data.
# Both estimators come from make_estimator() so this comparison always uses the
# same model configuration that the GA fitness function evaluated.
est_base = make_estimator()
baseline_cv = cv_score(est_base, X_train, y_train, cv=5, scoring='accuracy', random_state=42)
X_sel = X_train[:, best_mask]
est_sel = make_estimator()
selected_cv = cv_score(est_sel, X_sel, y_train, cv=5, scoring='accuracy', random_state=42)
# np.asarray lets the boolean-mask lookup work even when load_data() returns
# the feature names as a plain list instead of an array.
selected_features = list(np.asarray(feature_names)[best_mask])
print(f'Features seleccionadas (k={best_k}):')
for f in selected_features:
    print('-', f)
print(f'Baseline CV accuracy (todas): {baseline_cv:.4f}')
print(f'Seleccionadas CV accuracy:   {selected_cv:.4f}')


## Curva de mejor fitness por generación

In [None]:
import matplotlib.pyplot as plt

# x axis: generation numbers; y axis: the 'max' fitness logged for each one.
gens = [entry['gen'] for entry in log]
best_curve = [entry['max'] for entry in log]

plt.figure()
plt.plot(gens, best_curve, marker='o')
plt.title('Evolución del mejor fitness')
plt.xlabel('Generación')
plt.ylabel('Mejor fitness')
plt.grid(True)
plt.show()


## Guardado opcional con joblib

In [None]:
import os
import joblib

# Bundle the selection outcome, both CV scores and the per-generation log
# into a single object to persist.
out = {
    'best_mask': best_mask,
    'selected_features': selected_features,
    'best_fitness': best_fitness,
    'baseline_cv': baseline_cv,
    'selected_cv': selected_cv,
    'log': log,
}

# Create the output directory if needed; an existing one is not an error.
os.makedirs('results', exist_ok=True)
joblib.dump(out, 'results/bloque1_selection.joblib')
# Bare string: displayed as the output of this notebook cell.
"Guardado en results/bloque1_selection.joblib"