In [1]:
import src.datasets as datasets
import src.surrogate as surrogate
import src.search as search

In [2]:
from deap import base, creator, tools
import numpy as np
import random

In [3]:
import faiss
from deap import algorithms

In [4]:
# Load the dataset from the processed CSV; the loader returns the
# train/test split of features and targets.
load_data_func = datasets.cement_data
X_train, X_test, y_train, y_test = load_data_func('./data/concrete_processed.csv')

# Build the loader objects consumed by the LightGBM trainer.
# NOTE(review): the test split is what gets passed in as the validation data
# here — confirm that using it for early stopping is intended.
load_data_loader_func = datasets.lightgbm_load_data
train_loader, val_loader = load_data_loader_func(X_train, X_test, y_train, y_test)

# Train the surrogate model that the evolutionary search queries later on.
train_func = surrogate.lightgbm_train
model = train_func(train_loader, val_loader)




Training until validation scores don't improve for 50 rounds
[100]	train's rmse: 0.0448095	valid's rmse: 0.0588017
[200]	train's rmse: 0.0342117	valid's rmse: 0.0521947
[300]	train's rmse: 0.0290689	valid's rmse: 0.050235
Early stopping, best iteration is:
[305]	train's rmse: 0.0288801	valid's rmse: 0.0501455


In [5]:
# Per-feature minimum and maximum over the training data (reduction along
# axis 0); generate_individual samples between these two bounds.
x_min = np.min(X_train, axis=0)
x_max = np.max(X_train, axis=0)

In [6]:
# Size of the second axis of X_train (one entry per input feature).
n_features = X_train.shape[1]

In [7]:
# Single-objective fitness with a positive weight, i.e. DEAP maximizes it.
creator.create('FitnessMax', base.Fitness, weights=(1.0,))
# Individuals are numpy arrays that carry a FitnessMax in their `fitness` attribute.
creator.create('Individual', np.ndarray, fitness=creator.FitnessMax)

In [8]:
def generate_individual():
    """Sample one candidate uniformly at random between the global bounds.

    Returns:
        The array produced by ``np.random.uniform`` with ``x_min`` as the lower
        and ``x_max`` as the upper bound (one value per entry of the bounds).
    """
    lower, upper = x_min, x_max
    return np.random.uniform(low=lower, high=upper)


In [9]:
# Toolbox holding the factories for individuals and populations.
toolbox = base.Toolbox()
# 'attr_float' returns a full vector of variables per call, not a single float.
toolbox.register('attr_float', generate_individual)
# Original note: the min/max bounds have 8 dimensions, so a single generator
# call already yields an individual with 8 variables.
toolbox.register('individual', tools.initIterate, creator.Individual, toolbox.attr_float)
# A population is a plain list built by calling toolbox.individual repeatedly.
toolbox.register('population', tools.initRepeat, list, toolbox.individual)


In [10]:
# Prediction entry point of the LightGBM surrogate; `fitness` calls it with
# the trained model and a batch of candidates.
predict_func = surrogate.lightgbm_predict

In [11]:
# Target value the search tries to match: the entry of y_test at index 0.
gt_y = y_test[0]

In [12]:
def fitness(population):
    """Score a batch of individuals by closeness of the surrogate output to ``gt_y``.

    Args:
        population: sequence of individuals; it is converted to a numpy array
            and passed to the surrogate in a single call.

    Returns:
        The negated squared difference between the surrogate predictions and
        ``gt_y`` (0 is the best achievable score, larger is better).
    """
    candidates = np.array(population)
    predictions = predict_func(model=model, X_test=candidates)

    squared_error = (predictions - gt_y) ** 2
    return -squared_error


In [13]:
# NOTE: `fitness` takes a whole list of individuals per call (it stacks them
# into one array) rather than scoring a single individual.
toolbox.register('evaluate', fitness)

In [14]:
# NOTE(review): the inline note describes a schedule, but only this fixed value is visible here.
ETA_CX = 5.0 # small at first (exploration), gradually larger (exploitation)
toolbox.register('mate', tools.cxSimulatedBinary, eta=ETA_CX)

# NOTE(review): MUTPB is not referenced in the visible cells; the evolution
# loop passes the separately defined `mutpb` — confirm which one is intended.
MUTPB = 0.2          # probability of applying mutation (global)
INDPB = 0.1          # per-variable mutation probability
sigma_list = [(ub - lb)/10.0 for (lb,ub) in zip(x_min, x_max)]  # sigma scaled to each variable's range (one tenth of it)

# Zero-mean Gaussian mutation with a per-variable sigma.
toolbox.register('mutate', tools.mutGaussian, mu=[0.0]*(len(x_min)), sigma=sigma_list, indpb=INDPB)


In [15]:
# Tournament selection; the tournament size is supplied where toolbox.select is called.
toolbox.register('select', tools.selTournament)

In [24]:
# Initial population: 100 individuals produced by toolbox.individual.
population = toolbox.population(n=100)


In [25]:
# Crossover and mutation probabilities handed to algorithms.varAnd.
cxpb = 0.5
# NOTE(review): an earlier cell defines MUTPB = 0.2, but this is the value the
# evolution loop actually uses — confirm which one is intended.
mutpb = 0.5

In [26]:
def kmeans_clustering(population, k):
    """Cluster the rows of ``population`` into ``k`` groups with faiss k-means.

    Args:
        population: 2-D numpy array, one row per individual.
        k: number of clusters to fit.

    Returns:
        Tuple ``(cluster_labels, centroids)``: a flat array holding, for each
        row, the index of its nearest centroid, and the centroids reported by
        the fitted faiss object.
    """
    _, dim = population.shape
    clusterer = faiss.Kmeans(dim, k, niter=20, verbose=False)

    # Cast to float32 before handing the data to faiss.
    points = population.astype('float32')
    clusterer.train(points)

    # 1-nearest-centroid lookup; the second element of the result holds the indices.
    _distances, nearest = clusterer.index.search(points, 1)
    return nearest.flatten(), clusterer.centroids


In [27]:
def k_means_selection(population, k):
    """Select survivors cluster by cluster so that distinct regions stay represented.

    The population is partitioned with ``kmeans_clustering`` and a size-3
    tournament (``toolbox.select``) is run inside each cluster.  The per-cluster
    quotas add up to exactly ``len(population) // 2``: flooring every cluster
    independently would lose one survivor for each pair of odd-sized clusters
    and make the population shrink from generation to generation.

    Args:
        population: list of individuals whose fitness has already been assigned.
        k: requested number of clusters; clamped to ``[1, len(population)]``.

    Returns:
        List of ``len(population) // 2`` selected individuals (tournament
        selection may pick the same individual more than once).  An empty
        population yields an empty list.
    """
    if not population:
        return []

    # k-means needs at least one cluster and no more clusters than points.
    k = max(1, min(k, len(population)))
    cluster_labels, _ = kmeans_clustering(np.array(population), k=k)

    res = []
    carry = 0  # half-slot left over from the previous odd-sized cluster
    for i in range(k):
        cluster_idx = np.where(cluster_labels == i)[0]
        cluster_population = [population[j] for j in cluster_idx]

        # Pass the remainder on to the next cluster instead of dropping it.
        # carry is 0 or 1, so the quota never exceeds the cluster size and is
        # 0 for an empty cluster.
        slots = len(cluster_population) + carry
        quota = slots // 2
        carry = slots % 2

        res.extend(toolbox.select(cluster_population, k=quota, tournsize=3))
    return res

In [30]:
%%capture
# Evolution loop: create offspring, score every individual that lacks a valid
# fitness, then reduce the merged parent + offspring pool by per-cluster
# tournament selection.
N_GENERATIONS = 100
for generation in range(N_GENERATIONS):
    offspring = algorithms.varAnd(population, toolbox, cxpb, mutpb)
    population = offspring + population

    # Everyone in the first generation, afterwards the offspring whose fitness
    # was invalidated by crossover/mutation.
    invalid_ind = [ind for ind in population if not ind.fitness.valid]
    if invalid_ind:
        # One batched surrogate call; skipped when nothing needs scoring so the
        # surrogate is never handed an empty batch.
        fitness_scores = toolbox.evaluate(invalid_ind)
        for ind, fit in zip(invalid_ind, fitness_scores):
            ind.fitness.values = (fit,)

    # Roughly one cluster per 10 individuals, but never fewer than one cluster.
    population = k_means_selection(population, k=max(1, len(population)//10))
    # print([i.fitness.values for i in population[:3]])

