#### 데이터 및 모델

In [28]:
import pandas as pd
# Read wdbc.csv and shuffle its rows with a fixed seed, then separate the
# 'y' column (used as the target) from the remaining columns (the features).
df = pd.read_csv("../../data/classification/wdbc.csv").sample(
    frac=1, random_state=2022
)
y = df['y']
X = df.drop(columns='y')

In [29]:
from sklearn.linear_model import LogisticRegression
# Logistic-regression estimator with a fixed random_state.
model_instance = LogisticRegression(random_state=2022)

In [30]:
from sklearn.model_selection import cross_val_score

In [34]:
# Baseline: per-fold F1 from 5-fold cross-validation on all columns of X.
result = cross_val_score(
    model_instance,
    X,
    y,
    scoring="f1",
    cv=5,
)
display(result)

array([0.88888889, 0.91358025, 0.91566265, 0.95348837, 0.92857143])

In [33]:
import warnings
# Install a catch-all "ignore" filter so no warnings are shown from here on.
warnings.filterwarnings(action='ignore')

### 유전 알고리즘 연산자 정의

#### 해 표현 및 초기 해 집단 생성

In [35]:
import numpy as np
def initialize(n, m):
    """Create a random initial population of boolean feature masks.

    Parameters
    ----------
    n : int
        Number of individuals (rows).
    m : int
        Number of genes per individual (columns).

    Returns
    -------
    numpy.ndarray
        Boolean array of shape ``(n, m)``. When ``m > 0`` every row holds
        at least one ``True`` entry.
    """
    Z = np.random.choice([0, 1], (n, m)).astype(bool)
    if m > 0:
        # An all-False mask selects no column at all, so it cannot be used
        # to fit a model; switch on one randomly chosen gene in such rows.
        empty_rows = np.flatnonzero(~Z.any(axis=1))
        if empty_rows.size:
            Z[empty_rows, np.random.randint(m, size=empty_rows.size)] = True
    return Z

In [40]:
def fitness(X, y, model, z):
    """Score a feature mask by its mean cross-validated F1.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; columns are picked with ``X.loc[:, z]``.
    y : array-like
        Target values handed to ``cross_val_score``.
    model : estimator
        Estimator handed to ``cross_val_score``.
    z : array-like of bool
        Column mask applied to ``X``.

    Returns
    -------
    float
        Mean of the 5 fold scores, or ``0.0`` when ``z`` selects no column.
    """
    # With no selected column there is nothing to fit; report the lowest
    # F1 value instead of passing a zero-column table to the estimator.
    if not np.any(z):
        return 0.0
    score = cross_val_score(model, X.loc[:, z], y, cv=5, scoring="f1")
    return score.mean()

#### 선택 연산자

In [75]:
def selection(Z, S, k):
    """Pick ``k`` distinct individuals with probability proportional to score.

    Individuals are drawn one at a time; after each draw the chosen
    individual's weight is set to zero so it cannot be drawn again.

    Parameters
    ----------
    Z : numpy.ndarray
        Population, one individual per row.
    S : array-like of float
        Non-negative score per individual. NaN scores are treated as 0.
    k : int
        Number of individuals to pick.

    Returns
    -------
    numpy.ndarray
        The ``k`` selected rows of ``Z``, in draw order.

    Raises
    ------
    ValueError
        If ``k`` exceeds the number of scores.
    """
    weights = np.array(S, dtype=float)  # private copy; S is left untouched
    if k > len(weights):
        raise ValueError("k must not exceed the number of individuals")
    weights[np.isnan(weights)] = 0.0

    # Tracks which individuals have not been drawn yet, for the fallback below.
    available = np.ones(len(weights), dtype=bool)
    selected_index = []
    for _ in range(k):
        total = weights.sum()
        if total > 0:
            probs = weights / total
        else:
            # Every remaining weight is zero, so the scores carry no
            # preference; draw uniformly among the individuals still unpicked
            # rather than dividing by zero.
            probs = available / available.sum()
        z_idx = np.random.multinomial(1, probs).argmax()
        selected_index.append(z_idx)
        weights[z_idx] = 0
        available[z_idx] = False
    return Z[selected_index]

#### 교차 연산자

In [None]:
def crossover(X1, X2):
    """Build a child by one-point crossover of two parents.

    A cut position is drawn uniformly from ``1 .. len(X1) - 1``. The child
    takes the genes of ``X1`` before the cut and the genes of ``X2`` from the
    cut onward, and is returned as an integer array.
    """
    cut_candidates = range(1, len(X1))
    cut = np.random.choice(cut_candidates)
    head = X1[:cut]
    tail = X2[cut:]
    child = np.hstack((head, tail))
    return child.astype(int)

#### 돌연변이 연산자

In [62]:
def bit_flip(z, p):
    """Return a copy of ``z`` in which each gene is flipped with probability ``p``.

    Parameters
    ----------
    z : array-like
        Individual with 0/1 (or boolean) genes. It is not modified.
    p : float
        Per-gene flip probability.

    Returns
    -------
    numpy.ndarray
        New array with the same dtype as ``np.array(z)``.
    """
    # Work on a copy so the caller's array is never changed behind its back.
    flipped = np.array(z)
    # One uniform draw per gene; the mask is computed once and reused.
    mask = np.random.random(len(flipped)) < p
    flipped[mask] = 1 - flipped[mask]
    return flipped

### 메인 코드

In [97]:
def main(n, m, k, p, q, num_generation):
    """Run the genetic algorithm and return the best feature mask found.

    Scoring reads the module-level ``X``, ``y`` and ``model_instance``.

    Parameters
    ----------
    n : int
        Population size.
    m : int
        Number of genes per individual.
    k : int
        Number of individuals kept by ``selection`` in each generation;
        each child's two parents are drawn from these.
    p : float
        Per-gene flip probability passed to ``bit_flip``.
    q : float
        Probability that a child is mutated at all.
    num_generation : int
        Number of generations to evaluate.

    Returns
    -------
    tuple
        ``(best_features, best_score)``: the best mask seen and its score.
        ``(None, -1)`` if no individual ever scored above ``-1`` (for
        example when ``num_generation`` is 0).
    """
    best_score = -1
    best_features = None  # defined up front so the return below cannot fail
    Z = initialize(n, m)  # initial population
    for _generation in range(num_generation):
        # Evaluate every individual.
        S = np.array([fitness(X, y, model_instance, z) for z in Z])
        current_best_score = S.max()

        # Update the best solution seen so far.
        if current_best_score > best_score:
            best_score = current_best_score
            # Copy so the result does not alias a row of the population.
            best_features = Z[S.argmax()].copy()

        # Keep k individuals.
        survivors = selection(Z, S, k)

        # Breed n - k children by crossover; mutate each with probability q.
        children = []
        for _child in range(n - k):
            parent_idx = np.random.choice(range(k), 2, replace=False)
            child = crossover(survivors[parent_idx[0]], survivors[parent_idx[1]])
            if np.random.random() < q:
                child = bit_flip(child, p)
            children.append(child)

        # Stack once instead of re-growing the array inside the loop.
        Z = np.vstack([survivors, *children]).astype(bool)

    return best_features, best_score

In [100]:
# Settings passed to main().
n, k = 20, 10            # population size; individuals kept per generation
m = X.shape[1]           # one gene per column of X
num_generation = 100     # number of generations
p, q = 0.1, 0.1          # per-gene flip probability; per-child mutation probability

In [101]:
# Run the search, then print the names of the selected columns and the score.
best_features, best_score = main(
    n=n, m=m, k=k, p=p, q=q, num_generation=num_generation
)
print(X.columns[best_features], best_score)

Index(['x1', 'x3', 'x4', 'x5', 'x6', 'x7', 'x8', 'x9', 'x11', 'x13', 'x14',
       'x16', 'x17', 'x18', 'x19', 'x22', 'x23', 'x24', 'x26', 'x27', 'x28',
       'x29'],
      dtype='object') 0.9420215882509126
