# XGBoost hyperparameter tuning using Genetic Algorithm

In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
import warnings
warnings.filterwarnings('ignore')

## Initialize Population

In [2]:
def init_population(numberOfParents):
    learningRate = np.empty([numberOfParents, 1])
    nEstimators = np.empty([numberOfParents, 1], dtype=np.uint8)
    
    for i in range(numberOfParents):
        learningRate[i] = round(random.uniform(0.01,1), 2)
        nEstimators[i] = random.randrange(10, 1500, step=25)
        
    population = np.concatenate((learningRate, nEstimators), axis=1)
    
    return population

## Set up fitness metric

In [3]:
def fitness_f1score(y_true, y_pred):
    fitness = round((f1_score(y_true, y_pred, average='weighted')), 4)
    return fitness

## Obtain fitness of initial population

In [4]:
def train_population(population, train, test, y_test):
    fScore = []
    for i in range(population.shape[0]):
        param = {
#             'objective': 'binary:logistic',
            "learning_rate": population[i][0],
#             "n_estimators": population[i][1]
        }
        num_round = 100
        model = xgb.train(
            params=param, 
            dtrain=train, 
            num_boost_round=num_round
        )
        preds = model.predict(test) > 0.5
        fScore.append(fitness_f1score(y_test, preds))
        
    return fScore

## Select best parents for mating

In [5]:
def new_parents_selection(population, fitness, numParents):
    """
    Selection of mating candidates from parents
    """
    selectedParents = np.empty(shape=(numParents, population.shape[1]))
    
    for parentId in range(numParents):
        bestFitnessId = np.where(fitness==np.max(fitness))[0][0]
        selectedParents[parentId, :] = population[bestFitnessId, :]
        fitness[bestFitnessId] = -1
        
    return selectedParents

## Crossover

In [6]:
def crossover_uniform(parents, childrenSize):
    """
    Mate parents to create children with similar parameters
    using uniform crossover.
    """
    
    # Obtain all index of children
    crossoverPointIndex = np.arange(
        start=0, 
        stop=np.uint8(childrenSize[1]),
        step=1,
        dtype=np.uint8
    )
    
    # Sample half of index
    crossoverPointIndex1 = np.random.randint(
        low=0,
        high=np.uint8(childrenSize[1]),
        size=np.uint8(childrenSize[1]/2)
    )
    
    # Select leftover index
    crossoverPointIndex2 = np.array(
        list(set(crossoverPointIndex) - set(crossoverPointIndex1))
    )
    
    # Create children array
    children = np.empty(childrenSize)
    for i in range(childrenSize[0]):
        
        # Find parent_1 index
        parent1_index = i % parents.shape[0] 
        
        # Find parent_2 index
        parent2_index = (i+1) % parents.shape[0]
        
        # Insert parameters based on randomly selected indexes in parent_1
        children[i, crossoverPointIndex1] = parents[parent1_index, crossoverPointIndex1] 
        
        # Insert parameters based on randomly selected indexes in parent_2
        children[i, crossoverPointIndex2] = parents[parent2_index, crossoverPointIndex2]
    
    return children

## Mutation

In [7]:
def mutation(crossover, numberOfParameters):
    
    minMaxValue = np.zeros(shape=(numberOfParameters, 2))
    minMaxValue[0, :] = [0.01, 1.0] # learning_rate
    minMaxValue[1, :] = [10, 2000] # n_estimators
    
    mutationValue = 0
    parameterSelect = np.random.randint(0, 2, 1)

    if parameterSelect == 0:
        mutationValue = round(np.random.uniform(-0.5, 0.5), 2)
    if parameterSelect == 1:
        mutationValue = np.random.randint(-200, 200, 1)
        
    for idx in range(crossover.shape[0]):
        crossover[idx, parameterSelect] = crossover[idx, parameterSelect] + mutationValue
        if (crossover[idx, parameterSelect] > minMaxValue[parameterSelect, 1]):
            crossover[idx, parameterSelect] = minMaxValue[parameterSelect, 1]
        if (crossover[idx, parameterSelect] < minMaxValue[parameterSelect, 0]):
            crossover[idx, parameterSelect] = minMaxValue[parameterSelect, 0]
    
    return crossover

## Implementation

In [8]:
data = pd.read_csv(r'C:\Users\xBaka\Notebooks\genetic_algorithms\clean2.data', header=None)
data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,159,160,161,162,163,164,165,166,167,168
0,MUSK-211,211_1+1,46,-108,-60,-69,-117,49,38,-161,...,-308,52,-7,39,126,156,-50,-112,96,1.0
1,MUSK-211,211_1+10,41,-188,-145,22,-117,-6,57,-171,...,-59,-2,52,103,136,169,-61,-136,79,1.0
2,MUSK-211,211_1+11,46,-194,-145,28,-117,73,57,-168,...,-134,-154,57,143,142,165,-67,-145,39,1.0
3,MUSK-211,211_1+12,41,-188,-145,22,-117,-7,57,-170,...,-60,-4,52,104,136,168,-60,-135,80,1.0
4,MUSK-211,211_1+13,41,-188,-145,22,-117,-7,57,-170,...,-60,-4,52,104,137,168,-60,-135,80,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6593,NON-MUSK-jp13,jp13_2+5,51,-123,-23,-108,-117,134,-160,82,...,-66,164,-14,-29,107,171,-44,-115,118,0.0
6594,NON-MUSK-jp13,jp13_2+6,44,-104,-19,-105,-117,142,-165,68,...,-51,166,-9,150,129,158,-66,-144,-5,0.0
6595,NON-MUSK-jp13,jp13_2+7,44,-102,-19,-104,-117,72,-165,65,...,90,117,-8,150,130,159,-66,-144,-6,0.0
6596,NON-MUSK-jp13,jp13_2+8,51,-121,-23,-106,-117,63,-161,79,...,86,99,-14,-31,106,171,-44,-116,117,0.0


In [9]:
X = data.iloc[:, 2:168]
X

Unnamed: 0,2,3,4,5,6,7,8,9,10,11,...,158,159,160,161,162,163,164,165,166,167
0,46,-108,-60,-69,-117,49,38,-161,-8,5,...,-244,-308,52,-7,39,126,156,-50,-112,96
1,41,-188,-145,22,-117,-6,57,-171,-39,-100,...,-235,-59,-2,52,103,136,169,-61,-136,79
2,46,-194,-145,28,-117,73,57,-168,-39,-22,...,-238,-134,-154,57,143,142,165,-67,-145,39
3,41,-188,-145,22,-117,-7,57,-170,-39,-99,...,-236,-60,-4,52,104,136,168,-60,-135,80
4,41,-188,-145,22,-117,-7,57,-170,-39,-99,...,-236,-60,-4,52,104,137,168,-60,-135,80
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6593,51,-123,-23,-108,-117,134,-160,82,-230,-28,...,62,-66,164,-14,-29,107,171,-44,-115,118
6594,44,-104,-19,-105,-117,142,-165,68,-225,-32,...,60,-51,166,-9,150,129,158,-66,-144,-5
6595,44,-102,-19,-104,-117,72,-165,65,-219,-12,...,-226,90,117,-8,150,130,159,-66,-144,-6
6596,51,-121,-23,-106,-117,63,-161,79,-224,-30,...,-238,86,99,-14,-31,106,171,-44,-116,117


In [10]:
y = data.iloc[:, 168]
y

0       1.0
1       1.0
2       1.0
3       1.0
4       1.0
       ... 
6593    0.0
6594    0.0
6595    0.0
6596    0.0
6597    0.0
Name: 168, Length: 6598, dtype: float64

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [12]:
train = xgb.DMatrix(X_train, y_train)
test = xgb.DMatrix(X_test, y_test)

In [13]:
numberOfParents = 8 
numberOfParentsMating = 4
numberOfParameters = 2
numberOfGenerations = 4

populationSize = (numberOfParents, numberOfParameters)
population = init_population(numberOfParents)
fitnessHistory = np.empty([numberOfGenerations+1, numberOfParents])
populationHistory = np.empty([(numberOfGenerations+1)*numberOfParents, numberOfParameters])
populationHistory[0:numberOfParents, :] = population

for generation in range(numberOfGenerations):
    print("Generation", generation)
    
    fitnessValue = train_population(
        population=population, 
        train=train,
        test=test,
        y_test=y_test
    )
    
    fitnessHistory[generation, :] = fitnessValue
    
    print('Best F1 Score in generation =', np.max(fitnessHistory[generation, :]))
    
    parents = new_parents_selection(
        population=population,
        fitness=fitnessValue,
        numParents=numberOfParentsMating
    )
    
    children = crossover_uniform(
        parents=parents,
        childrenSize=(
            populationSize[0]-parents.shape[0], 
            numberOfParameters
        )
    )
    
    children_mutated = mutation(children, numberOfParameters)
    
    population[0:parents.shape[0], :] = parents
    population[parents.shape[0]:, :] = children_mutated
    populationHistory[(generation+1)*numberOfParents:(generation+1)*numberOfParents+numberOfParents, :] = population

Generation 0
Best F1 Score in this generation = 0.9847
Generation 1
Best F1 Score in this generation = 0.9847
Generation 2
Best F1 Score in this generation = 0.9847
Generation 3
Best F1 Score in this generation = 0.9847


In [14]:
fitness = train_population(
    population=population,
    train=train,
    test=test,
    y_test=y_test
)

fitnessHistory[generation+1, :] = fitness

bestFitnessIndex = np.where(fitness==np.max(fitness))[0][0]

print(bestFitnessIndex)

print('learning_rate', population[bestFitnessIndex][0])

0
learning_rate 0.26
