In [1]:
import src.datasets as datasets
import src.surrogate as surrogate
import src.search as search

In [2]:
load_data_func = getattr(datasets, f'cement_data')
X_train, X_test, y_train, y_test = load_data_func('./data/concrete_processed.csv')

load_data_loader_func = getattr(datasets, f'lightgbm_load_data')
train_loader, val_loader = load_data_loader_func(X_train, X_test, y_train, y_test)

train_func = getattr(surrogate, f'lightgbm_train')
model = train_func(train_loader, val_loader)

# search_func = getattr(search, f'ga_search')
# search_func(model, train_loader, val_loader)






Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 0.0448095	valid's rmse: 0.0588017
[200]	train's rmse: 0.0342117	valid's rmse: 0.0521947
[300]	train's rmse: 0.0290689	valid's rmse: 0.050235
Early stopping, best iteration is:
[305]	train's rmse: 0.0288801	valid's rmse: 0.0501455


In [3]:
from deap import base, creator, tools
import numpy as np
import random

In [4]:
x_min,x_max = np.min(X_train, axis=0), np.max(X_train, axis=0)

In [5]:
n_features = X_train.shape[1]

In [6]:
creator.create('FitnessMax', base.Fitness, weights=(1.0,))
creator.create('Individual', np.ndarray, fitness=creator.FitnessMax)

In [7]:
def generate_individual():
    return np.random.uniform(x_min, x_max)


In [8]:
toolbox = base.Toolbox()
toolbox.register('attr_float', generate_individual)
# min_max 차원이 8개이기에 n을 1로 설정 하면 8개의 변수를 가진 ind 생성!
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.attr_float)
toolbox.register('population', tools.initRepeat, list, toolbox.individual)


In [9]:
predict_func = getattr(surrogate, f'lightgbm_predict')

In [10]:
gt_y = y_test[0]

In [51]:
def fitness(population):

    population = np.array(population)
    print('population shape : ', population.shape)
    y_pred = predict_func(model=model, X_test=population)

    fit_fun = -(y_pred - gt_y)**2
    return fit_fun


In [12]:
toolbox.register('evaluate', fitness)


In [13]:
population = toolbox.population(n=100)


In [14]:
ETA_CX = 5.0 # 초기에 작게(탐색), 점점 크게(exploitation)
toolbox.register('mate', tools.cxSimulatedBinary, eta=ETA_CX)

In [15]:
MUTPB = 0.2          # 돌연변이 적용 확률(전역)
INDPB = 0.1          # 각 변수별 변이 확률
sigma_list = [(ub - lb)/10.0 for (lb,ub) in zip(x_min, x_max)]  # 변수 범위에 따른 sigma 값 

toolbox.register('mutate', tools.mutGaussian, mu=[0.0]*(len(x_min)), sigma=sigma_list, indpb=INDPB)


추후 selNSGA2(), selSPEA2(), selNSGA3() 등 파레토 최적화 알고리즘으로 변경 필요

In [69]:
toolbox.register('select', tools.selTournament)

In [70]:
population = toolbox.population(n=100)


In [71]:
cxpb = 0.5
mutpb = 0.5


In [72]:
offspring = [toolbox.clone(ind) for ind in population]

In [73]:
for i in range(1, len(offspring), 2):
    if random.random() < cxpb:
        offspring[i - 1], offspring[i] = toolbox.mate(offspring[i - 1],
                                                        offspring[i])
        del offspring[i - 1].fitness.values, offspring[i].fitness.values



In [74]:
offspring = [np.clip(individual, x_min, x_max) for individual in offspring]


In [75]:
for i in range(len(offspring)):
    if random.random() < mutpb:
        offspring[i], = toolbox.mutate(offspring[i])
        # del offspring[i].fitness


In [76]:
offspring = [np.clip(individual, x_min, x_max) for individual in offspring]


In [77]:
invalid_ind = [ind for ind in population if not ind.fitness.valid]

In [78]:
fitness_scores = toolbox.evaluate(invalid_ind)
for ind, fit in zip(invalid_ind, fitness_scores):
    ind.fitness.values = (fit,)

population shape :  (100, 8)


In [79]:
import faiss
def fast_nearest_neighbors(population, k):
    n, d = population.shape
    kmeans = faiss.Kmeans(d, k, niter=20, verbose=False)
    population = population.astype('float32')
    kmeans.train(population)

    cluster_labels = kmeans.index.search(population, 1)[1].flatten()  
    centroids = kmeans.centroids
    
    return cluster_labels, centroids


In [80]:
cluster_labels, centroids = fast_nearest_neighbors(np.array(population),k=len(population)//10)



In [81]:
len(population + offspring)

200

In [82]:
cluster_labels, centroids = fast_nearest_neighbors(np.array(population + offspring),k=len(population)//10)



In [83]:
cluster_labels

array([2, 7, 4, 4, 8, 0, 9, 1, 7, 5, 4, 7, 5, 5, 0, 6, 3, 7, 8, 2, 1, 6,
       7, 6, 1, 4, 3, 1, 9, 2, 5, 8, 6, 2, 9, 6, 2, 4, 3, 5, 2, 4, 4, 7,
       2, 1, 7, 5, 0, 6, 2, 0, 2, 7, 0, 8, 6, 3, 8, 2, 2, 4, 9, 3, 0, 3,
       1, 7, 8, 0, 7, 5, 1, 7, 2, 1, 7, 1, 1, 7, 2, 2, 8, 0, 8, 1, 2, 6,
       3, 5, 6, 7, 3, 8, 2, 4, 3, 5, 7, 9, 6, 7, 4, 4, 8, 0, 9, 1, 7, 5,
       4, 7, 5, 5, 0, 6, 3, 7, 8, 2, 1, 6, 7, 6, 1, 4, 3, 1, 9, 2, 5, 8,
       6, 2, 9, 6, 2, 1, 3, 5, 2, 7, 4, 7, 2, 1, 7, 5, 0, 6, 2, 0, 6, 7,
       0, 4, 2, 3, 8, 2, 2, 4, 9, 3, 0, 3, 1, 7, 8, 0, 7, 5, 1, 7, 2, 1,
       8, 1, 1, 7, 2, 2, 8, 0, 8, 1, 2, 6, 3, 7, 2, 4, 3, 8, 2, 4, 3, 2,
       7, 9])

In [84]:
i = 1
cluster_labels == i

array([False, False, False, False, False, False, False,  True, False,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False,  True, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
        True, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
        True, False, False,  True, False,  True,  True, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False,  True, False,
       False,  True,

In [85]:
cluster_idx = np.where(cluster_labels == i)[0]


In [86]:
cluster_idx

array([  7,  20,  24,  27,  45,  66,  72,  75,  77,  78,  85, 107, 120,
       124, 127, 137, 145, 166, 172, 175, 177, 178, 185])

In [87]:
k = [(population + offspring)[i] for i in cluster_idx]

In [88]:
toolbox.select

functools.partial(<function selTournament at 0x7ff110fa7a30>)

In [89]:
len(population + offspring)//2

100

In [90]:
len([(population + offspring)[i] for i in cluster_idx])


23

In [91]:
len(population)//10

10

In [94]:
pop = toolbox.select([(population + offspring)[i] for i in cluster_idx],k=len(population + offspring)//2,tournsize=3)


TypeError: '>' not supported between instances of 'MetaCreator' and 'MetaCreator'

In [42]:
# %%time
# offspring = list(map(toolbox.clone, population))


In [35]:
# selTournamentDCD는 4의 배수여야함 ... 
# offspring = toolbox.select(population, len(population))