## evaluation metric 만들기 : seach model에서 여러 예측 값을 뽑고, test set x와 가장 유사한 값과의 비교

In [6]:
import src.datasets as datasets
import src.surrogate as surrogate
import src.search as search

In [7]:
load_data_func = getattr(datasets, f'cement_data')
X_train, X_test, y_train, y_test = load_data_func('./data/concrete_processed.csv')

load_data_loader_func = getattr(datasets, f'lightgbm_load_data')
train_loader, val_loader = load_data_loader_func(X_train, X_test, y_train, y_test)

train_func = getattr(surrogate, f'lightgbm_train')
pred_func = getattr(surrogate, f'lightgbm_predict')
model = train_func(train_loader, val_loader)





Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 0.0448095	valid's rmse: 0.0588017
[200]	train's rmse: 0.0342117	valid's rmse: 0.0521947
[300]	train's rmse: 0.0290689	valid's rmse: 0.050235
Early stopping, best iteration is:
[305]	train's rmse: 0.0288801	valid's rmse: 0.0501455


In [8]:
import random
import torch
from tqdm import tqdm
import numpy as np
from deap import base, creator, tools

def adaptive_niche_size(gen, max_gen, initial_sigma, min_sigma, decay_constant=5.0):
    """
    진화 단계에 따라 적응적으로 니치 크기를 조정하는 함수
    - gen: 현재 세대
    - max_gen: 최대 세대 수
    - initial_sigma: 초기 니치 크기
    - min_sigma: 최소 니치 크기
    - decay_constant: 감소율 상수
    """
    sigma = initial_sigma * np.exp(-decay_constant * gen / max_gen) # gen/max_gen : 현재 세대 비율(정규화한 값)
    return max(sigma, min_sigma)  # 최소 니치 크기 보장

def fitness_sharing(population, sigma, alpha):
    """
    적응적 니치 크기를 적용한 적합도 공유 함수
    목적 : 개체들이 특정 니치에 너무 몰리지 않도록 적합도를 조정
    - population: 현재 개체군
    - sigma: 니치 크기
    - alpha: 거리의 중요도 조정 파라미터
    """
    # 개체군을 numpy 배열로 변환
    population_array = np.array([np.array(ind).squeeze() for ind in population])
    # print(population_array.shape) # (50, 8)

    # 개체 간 거리 계산 (모든 개체 쌍의 거리)
    distances = np.linalg.norm(
        population_array[:, np.newaxis, :] - population_array[np.newaxis, :, :], axis=2
    )
    # print(distances.shape) # (50, 50)

    # 공유 함수 값 계산 : dist < sigma일 때만 적합도를 공유하도록 설계
    sh_values = np.where(
        distances < sigma, 1 - (distances / sigma) ** alpha, 0
    )
    # print(sh_values.shape) # (50, 50)

    # 다른 개체들과 적합도를 나누는 정도를 나타냄
    sharing_factors = np.sum(sh_values, axis=1)  # 각 개체별 sharing factor 계산
    # print(sharing_factors.shape) # (50,)

    # 적합도 조정
    for ind, sharing_factor in zip(population, sharing_factors):
        if sharing_factor > 0.0:  # 근처에 다른 개체가 있는 경우
            ind.fitness.values = (ind.fitness.values[0] / sharing_factor,)

    # for ind in population:
    #     # ind와 다른 모든 개체 사이의 거리 계산(자기 자신 포함)
    #     distances = [
    #         np.linalg.norm(np.array(ind) - np.array(other)) for other in population
    #     ]
    #     # print(len(distances)) # 50

    #     # 공유 함수 값 계산 : dist < sigma일 때만 적합도를 공유하도록 설계
    #     sh_values = [
    #         1 - (dist / sigma) ** alpha if dist < sigma else 0
    #         for dist in distances
    #     ] # distances를 기반으로 적합도 공유 비율을 나타내는 리스트

    #     # 다른 개체들과 적합도를 나누는 정도를 나타냄
    #     # sharing_factor가 크다 = 해당 개체와 가까운 개체가 많다
    #     sharing_factor = sum(sh_values)
    #     # print(sharing_factor) # 12.976303413814554
        
    #     # 적합도 조정
    #     if sharing_factor > 0: # 근처에 다른 개체가 있는 경우
    #         # print(ind.fitness.values)
    #         ind.fitness.values = (ind.fitness.values[0] / sharing_factor,)

def ga_adaptive_niching_search(model, pred_func, X_train, X_test, y_test, max_gen=10, initial_sigma=2.0, min_sigma=0.5, decay_constant=5.0):
    """
    - model: 예측에 사용되는 딥러닝 모델 또는 함수
    - pred_func: 입력(X_test)에 대해 model의 예측값을 반환하는 함수
    - X_train: 입력값의 학습 데이터
    - X_test: 최적화를 수행할 테스트 데이터
    - y_test: 테스트 데이터에 대한 ground truth
    - max_gen: GA의 최대 세대 수 (디폴트값: 100)
    - initial_sigma: 초기 니치 크기 (디폴트값: 2.0)
    - min_sigma: 최소 니치 크기 (디폴트값: 0.5)
    - decay_constant: 니치 크기 감소율 상수 (디폴트값: 5.0)
    """
    test = X_test
    gt_ys = y_test

    x_min = np.min(X_train, axis=0)
    x_max = np.max(X_train, axis=0)

    res = []
    for gt_y in tqdm(gt_ys):
        
        # 적합도 함수 정의
        def fitness(population):
            population = np.concatenate(population, axis=0)
            y_pred = pred_func(model=model, X_test=population)
            y_pred_tensor = torch.tensor(y_pred, dtype=torch.float32)
            fit_fun = -(y_pred_tensor - gt_y) ** 2
            return fit_fun
        
        # GA 기본 설정
        creator.create('FitnessMax', base.Fitness, weights=(1.0,)) # 적합도 최대화 문제를 정의
        creator.create('Individual', list, fitness=creator.FitnessMax) # 개체 클래스 정의
        toolbox = base.Toolbox()
        toolbox.register('attr_float', random.uniform, x_min, x_max) # 각 유전자 값은 x_min과 x_max 사이의 실수로 초기화
        toolbox.register('individual', tools.initRepeat, creator.Individual, toolbox.attr_float, n=1) # 유전자를 모아 개체 생성
        toolbox.register('population', tools.initRepeat, list, toolbox.individual) # 개체를 모아 개체군을 생성

        # GA 연산 등록
        toolbox.register('evaluate', fitness) # 적합도 평가함수로 fitness 사용
        toolbox.register('select', tools.selBest, k=5)  # Best Selection : 상위 5개의 개체를 선택하는 selBest 방식
        toolbox.register('mate', tools.cxBlend, alpha=0.5) # crossover : 개체 간 교배(cxBlend)로 새로운 개체 생성
        toolbox.register('mutate', tools.mutGaussian, mu=0, sigma=1, indpb=0.2) # mutation : 개체의 일부 유전자를 가우시안 노이즈로 변형

        pop_size = 50
        population = toolbox.population(n=pop_size)

        # GA 루프
        for gen in range(max_gen):

            # 적합도 계산
            fitness_scores = toolbox.evaluate(population)
            for ind, fit in zip(population, fitness_scores):
                ind.fitness.values = (fit,)

            if len(population) == 1:
                break
            
            # 다음 세대 생성
            parents = toolbox.select(population, k=len(population))
            offspring = tools.selBest(parents, k=len(population))
            offspring = list(map(toolbox.clone, offspring))

            # mate
            for i in range(1, len(offspring), 2):
                if random.random() < 0.7:
                    toolbox.mate(offspring[i - 1], offspring[i])

            # mutation
            for child in offspring:
                if random.random() < 0.2:
                    toolbox.mutate(child)

            for ind in offspring:
                # del ind.fitness.values
                if not ind.fitness.valid:
                    fitness_scores = toolbox.evaluate([ind])
                    ind.fitness.values = (fitness_scores[0],)

            # 니치 크기 조정
            sigma = adaptive_niche_size(gen, max_gen, initial_sigma, min_sigma, decay_constant)
            fitness_sharing(offspring, sigma, alpha=1.0)

            population[:] = offspring

        # 최적 결과 반환
        best_individual = tools.selBest(population, k=1)[0]
        best_individual = best_individual[0]

        x_pred = np.array(best_individual)
        x_pred = x_pred.reshape(1, 8)
        res.append(x_pred)
        
    return np.concatenate(res, axis=0)

In [9]:
ga_adaptive_niching_search(model, pred_func, X_train, X_test, y_test, max_gen=10, initial_sigma=2.0, min_sigma=0.5, decay_constant=5.0)

100%|██████████| 186/186 [00:10<00:00, 17.75it/s]


array([[ 0.30028415,  0.30028415,  0.30028415, ...,  0.30028415,
         0.30028415, -1.19664022],
       [-0.24855632, -0.24855632, -0.24855632, ..., -0.24855632,
        -0.24855632, -1.01446206],
       [ 0.87095575,  0.87095575,  0.87095575, ...,  0.87095575,
         0.87095575,  3.64970165],
       ...,
       [ 0.0562841 ,  0.0562841 ,  0.0562841 , ...,  0.0562841 ,
         0.0562841 , -0.77929245],
       [ 0.67711016,  0.67711016,  0.67711016, ...,  0.67711016,
         0.67711016,  2.5771396 ],
       [ 0.01313539,  0.01313539,  0.01313539, ...,  0.01313539,
         0.01313539, -1.22329402]])

### best individual을 1개가 아닌 50개 전부 가져오기

In [67]:
x_min = np.min(X_train, axis=0)
x_max = np.max(X_train, axis=0)

In [68]:
one_x, one_y = X_test[0], y_test[0]
print(one_x, one_y)

[0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.26453488
 0.51470588 0.        ] [0.27715064]


In [69]:
def fitness(population):
    population = np.concatenate(population, axis=0)
    y_pred = pred_func(model=model, X_test=population)
    y_pred_tensor = torch.tensor(y_pred, dtype=torch.float32)
    fit_fun = -(y_pred_tensor - one_y) ** 2
    return fit_fun

In [70]:
creator.create('FitnessMax', base.Fitness, weights=(1.0,)) # 적합도 최대화 문제를 정의
creator.create('Individual', list, fitness=creator.FitnessMax) # 개체 클래스 정의
toolbox = base.Toolbox()
toolbox.register('attr_float', random.uniform, x_min, x_max) # 각 유전자 값은 x_min과 x_max 사이의 실수로 초기화
toolbox.register('individual', tools.initRepeat, creator.Individual, toolbox.attr_float, n=1) # 유전자를 모아 개체 생성
toolbox.register('population', tools.initRepeat, list, toolbox.individual) # 개체를 모아 개체군을 생성

# GA 연산 등록
toolbox.register('evaluate', fitness) # 적합도 평가함수로 fitness 사용
toolbox.register('select', tools.selBest, k=5)  # Best Selection : 상위 5개의 개체를 선택하는 selBest 방식
toolbox.register('mate', tools.cxBlend, alpha=0.5) # crossover : 개체 간 교배(cxBlend)로 새로운 개체 생성
toolbox.register('mutate', tools.mutGaussian, mu=0, sigma=1, indpb=0.2) # mutation : 개체의 일부 유전자를 가우시안 노이즈로 변형



In [71]:
pop_size = 50
population = toolbox.population(n=pop_size)

In [72]:
# 적합도 계산
fitness_scores = toolbox.evaluate(population)
for ind, fit in zip(population, fitness_scores):
    ind.fitness.values = (fit,)

if len(population) == 1:
    pass

In [73]:
print(fitness_scores.shape)

torch.Size([50, 1])


In [74]:
# 다음 세대 생성
parents = toolbox.select(population, k=len(population))
offspring = tools.selBest(parents, k=len(population))
offspring = list(map(toolbox.clone, offspring))

In [75]:
# 다음 세대 생성
parents = toolbox.select(population, k=len(population))
offspring = tools.selBest(parents, k=len(population))
offspring = list(map(toolbox.clone, offspring))

# mate
for i in range(1, len(offspring), 2):
    if random.random() < 0.7:
        toolbox.mate(offspring[i - 1], offspring[i])

# mutation
for child in offspring:
    if random.random() < 0.2:
        toolbox.mutate(child)

for ind in offspring:
    # del ind.fitness.values
    if not ind.fitness.valid:
        fitness_scores = toolbox.evaluate([ind])
        ind.fitness.values = (fitness_scores[0],)

In [79]:
best_individual = tools.selBest(population, k=len(population))
print(len(best_individual))
print(len(best_individual[0][0]))
print(best_individual[0][0])

50
8
[ 0.02735379  0.02735379  0.02735379  0.03024283  0.02723001  0.02735379
  0.02735379 -1.1307095 ]


### gt x와의 유클리드 거리 계산하기 (for문)

In [32]:
type(one_x)


numpy.ndarray

In [49]:
best_individual = tools.selBest(population, k=1)[0]
print(best_individual)

[array([ 0.01434638,  0.01434638,  0.01434638,  0.01727406,  0.01428146,
        0.01434638,  0.01434638, -1.20441814])]


In [93]:
best_individual = tools.selBest(population, k=len(population))
print(len(best_individual))
print(len(best_individual[0][0]))

50
8


In [94]:
dist = []
for ind in best_individual:
    # print(type(ind)
    ind = np.array(ind)
    # print(ind.shape) # (1, 8)
    # distance1 = np.linalg.norm(one_x - ind)
    # print(one_x.shape) # (8, )
    distance = np.linalg.norm(one_x - ind, axis=1)
    # dist.append(distance1)
    dist.append(distance)
print(dist)
print(len(dist))
print(np.argmin(dist))
idx = np.argmin(dist)
print(best_individual[idx])


[array([1.56141621]), array([1.70833512]), array([1.60016184]), array([1.59917426]), array([1.70026852]), array([1.64508362]), array([1.39116122]), array([1.31275807]), array([1.31077764]), array([1.29785323]), array([1.16617573]), array([1.13802479]), array([0.90246977]), array([0.67232015]), array([0.8591729]), array([2.08292403]), array([2.08727548]), array([2.13314479]), array([3.97164566]), array([3.96024169]), array([3.72376516]), array([4.24076151]), array([4.51943871]), array([4.48325018]), array([4.54466483]), array([4.14111443]), array([4.11467768]), array([4.08395768]), array([1.97182677]), array([3.62961344]), array([2.2300868]), array([2.20168055]), array([1.96445686]), array([3.15394012]), array([3.46560778]), array([3.48362019]), array([3.31490768]), array([2.44547355]), array([2.4466868]), array([1.09732307]), array([1.31929938]), array([0.67413854]), array([0.66931077]), array([0.6755061]), array([0.87751231]), array([0.63322943]), array([0.93075508]), array([0.6231572

In [50]:
type(best_individual)

deap.creator.Individual

In [80]:
best_individual = tools.selBest(population, k=len(population))
print(len(best_individual))
print(len(best_individual[0][0]))

50
8


### gt x와의 유클리드 거리 계산하기 (numpy array)

In [81]:
best_individual = np.array(best_individual)
print(type(best_individual))
print(best_individual.shape)
print(best_individual)

<class 'numpy.ndarray'>
(50, 1, 8)
[[[ 2.73537859e-02  2.73537859e-02  2.73537859e-02  3.02428341e-02
    2.72300131e-02  2.73537859e-02  2.73537859e-02 -1.13070950e+00]]

 [[ 1.98631079e-03  1.98631079e-03  1.98631079e-03  4.95070789e-03
    1.97732296e-03  1.98631079e-03  1.98631079e-03 -1.27445852e+00]]

 [[ 2.06262774e-02  2.06262774e-02  2.06262774e-02  2.35353083e-02
    2.05329458e-02  2.06262774e-02  2.06262774e-02 -1.16883205e+00]]

 [[ 2.07973923e-02  2.07973923e-02  2.07973923e-02  2.37059148e-02
    2.07032864e-02  2.07973923e-02  2.07973923e-02 -1.16786240e+00]]

 [[ 3.36975927e-03  3.36975927e-03  3.36975927e-03  6.33004711e-03
    3.35451149e-03  3.36975927e-03  3.36975927e-03 -1.26661898e+00]]

 [[ 1.28615841e-02  1.28615841e-02  1.28615841e-02  1.57936784e-02
    1.28033869e-02  1.28615841e-02  1.28615841e-02 -1.21283198e+00]]

 [[ 5.73227244e-02  5.73227244e-02  5.73227244e-02  6.01227559e-02
    5.70633456e-02  5.73227244e-02  5.73227244e-02 -9.60885514e-01]]

 [[ 7.

In [82]:
squeezed_array = best_individual.squeeze(axis=1)  # axis=1은 1인 차원을 제거
print(squeezed_array.shape)
print(squeezed_array)



(50, 8)
[[ 2.73537859e-02  2.73537859e-02  2.73537859e-02  3.02428341e-02
   2.72300131e-02  2.73537859e-02  2.73537859e-02 -1.13070950e+00]
 [ 1.98631079e-03  1.98631079e-03  1.98631079e-03  4.95070789e-03
   1.97732296e-03  1.98631079e-03  1.98631079e-03 -1.27445852e+00]
 [ 2.06262774e-02  2.06262774e-02  2.06262774e-02  2.35353083e-02
   2.05329458e-02  2.06262774e-02  2.06262774e-02 -1.16883205e+00]
 [ 2.07973923e-02  2.07973923e-02  2.07973923e-02  2.37059148e-02
   2.07032864e-02  2.07973923e-02  2.07973923e-02 -1.16786240e+00]
 [ 3.36975927e-03  3.36975927e-03  3.36975927e-03  6.33004711e-03
   3.35451149e-03  3.36975927e-03  3.36975927e-03 -1.26661898e+00]
 [ 1.28615841e-02  1.28615841e-02  1.28615841e-02  1.57936784e-02
   1.28033869e-02  1.28615841e-02  1.28615841e-02 -1.21283198e+00]
 [ 5.73227244e-02  5.73227244e-02  5.73227244e-02  6.01227559e-02
   5.70633456e-02  5.73227244e-02  5.73227244e-02 -9.60885514e-01]
 [ 7.14075099e-02  7.14075099e-02  7.14075099e-02  7.41657054

In [83]:
one_x.shape

(8,)

In [84]:
expanded_one_x = np.tile(one_x, (50, 1))
print(expanded_one_x.shape)
print(expanded_one_x)

(50, 8)
[[0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.26453488
  0.51470588 0.        ]
 [0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.26453488
  0.51470588 0.        ]
 [0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.26453488
  0.51470588 0.        ]
 [0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.26453488
  0.51470588 0.        ]
 [0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.26453488
  0.51470588 0.        ]
 [0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.26453488
  0.51470588 0.        ]
 [0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.26453488
  0.51470588 0.        ]
 [0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.26453488
  0.51470588 0.        ]
 [0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.26453488
  0.51470588 0.        ]
 [0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.26453488
  0.51470588 0.        ]
 [0.10730594 0.40631394 0.54472764 0.65346535 0.27149321 0.264

In [89]:
# 두 배열의 차이를 계산
differences = squeezed_array - expanded_one_x  # (50, 8)
print(differences.shape)
print(differences)



(50, 8)
[[-7.99521502e-02 -3.78960157e-01 -5.17373850e-01 -6.23222512e-01
  -2.44263200e-01 -2.37181098e-01 -4.87352096e-01 -1.13070950e+00]
 [-1.05319625e-01 -4.04327633e-01 -5.42741325e-01 -6.48514639e-01
  -2.69515890e-01 -2.62548573e-01 -5.12719572e-01 -1.27445852e+00]
 [-8.66796587e-02 -3.85687666e-01 -5.24101359e-01 -6.29930038e-01
  -2.50960267e-01 -2.43908606e-01 -4.94079605e-01 -1.16883205e+00]
 [-8.65085438e-02 -3.85516551e-01 -5.23930244e-01 -6.29759432e-01
  -2.50789926e-01 -2.43737491e-01 -4.93908490e-01 -1.16786240e+00]
 [-1.03936177e-01 -4.02944184e-01 -5.41357877e-01 -6.47135299e-01
  -2.68138701e-01 -2.61165124e-01 -5.11336123e-01 -1.26661898e+00]
 [-9.44443519e-02 -3.93452359e-01 -5.31866052e-01 -6.37671668e-01
  -2.58689826e-01 -2.51673300e-01 -5.01844298e-01 -1.21283198e+00]
 [-4.99832116e-02 -3.48991219e-01 -4.87404912e-01 -5.93342591e-01
  -2.14429867e-01 -2.07212159e-01 -4.57383158e-01 -9.60885514e-01]
 [-3.58984262e-02 -3.34906433e-01 -4.73320126e-01 -5.79299641

In [92]:
# 각 행의 유클리드 거리 계산
distances = np.linalg.norm(differences, axis=1)  # 결과는 (50,)
print(distances.shape)
print(distances)
print(np.argmin(distances))
idx = np.argmin(distances)
print(best_individual[idx])


(50,)
[1.56141621 1.70833512 1.60016184 1.59917426 1.70026852 1.64508362
 1.39116122 1.31275807 1.31077764 1.29785323 1.16617573 1.13802479
 0.90246977 0.67232015 0.8591729  2.08292403 2.08727548 2.13314479
 3.97164566 3.96024169 3.72376516 4.24076151 4.51943871 4.48325018
 4.54466483 4.14111443 4.11467768 4.08395768 1.97182677 3.62961344
 2.2300868  2.20168055 1.96445686 3.15394012 3.46560778 3.48362019
 3.31490768 2.44547355 2.4466868  1.09732307 1.31929938 0.67413854
 0.66931077 0.6755061  0.87751231 0.63322943 0.93075508 0.62315728
 0.61544914 1.80015832]
48
[[0.25608177 0.25608177 0.25608177 0.25829143 0.25492303 0.25608177
  0.25608177 0.16541575]]


### rmse 계산하기

In [97]:
res = best_individual[idx]
print(len(res[0]))
print(one_x.shape)


8
(8,)


In [100]:
one_x

array([0.10730594, 0.40631394, 0.54472764, 0.65346535, 0.27149321,
       0.26453488, 0.51470588, 0.        ])

In [101]:
res

[array([0.25608177, 0.25608177, 0.25608177, 0.25829143, 0.25492303,
        0.25608177, 0.25608177, 0.16541575])]

In [102]:
one_x - res

array([[-0.14877583,  0.15023217,  0.28864587,  0.39517392,  0.01657018,
         0.00845311,  0.25862411, -0.16541575]])

In [99]:
rmse = np.sqrt(np.mean((one_x - res) ** 2))
print(rmse)

0.2175941297937622
