In [37]:
from random import random, sample, choice
from math import floor
from tqdm import tqdm
from numpy import array, dot, mean
from numpy.linalg import pinv

# Genetic Algorithm for Multiple Linear Regression

We have studied linear regression in the previous chapters. In this chapter, we will use a genetic algorithm to find the best fit for a multiple linear regression problem.

## Problem Statement

We will generate a dataset with four independent variables and one dependent variable. The independent variables will be generated randomly. The dependent variable will be generated using the following equation:

$$y = 0.4x_1 - 0.3x_2 + 0.2x_3 - 0.1x_4$$

In [38]:
# Generate data - 1000 data points with 4 independent variables
#
# x will be a 1000 x 4 matrix
# y will be a 1000 x 1 matrix

def generate_data(num_data_points=1000):
    coeff = [0.4, -0.3, 0.2, -0.1]
    x = [[random() for j in range(len(coeff))] for i in range(num_data_points)]
    y = [dot(i, coeff) for i in x]
    return array(x), array(y)

# Generating a coefficient of determination

We want to compare the performance of the genetic algorithm with the best possible solution. We will use the coefficient of determination to compare the performance of the genetic algorithm with the best possible solution. The coefficient of determination is defined as:
    
$$COD = 1 - \frac{SSR}{SST}$$

where $SSR$ is the sum of squared residuals and $SST$ is the total sum of squares. The $SSR$ is defined as:

$$SSR = \sum_{i=1}^{n}(y_i - \hat{y_i})^2$$

where $y_i$ is the actual value of the dependent variable and $\hat{y_i}$ is the predicted value of the dependent variable. The $SST$ is defined as:

$$SST = \sum_{i=1}^{n}(y_i - \bar{y})^2$$

where $\bar{y}$ is the mean of the dependent variable.

In [39]:
# inputs is a 1000 x 4 matrix
# outputs is a 1000 x 1 matrix
#
# These are the inputs and outputs generated in the previous step

def multiple_linear_regression(inputs, outputs):
    X, Y = array(inputs), array(outputs)
    X_t, Y_t = X.transpose(), Y.transpose()
    coeff = dot((pinv((dot(X_t, X)))), (dot(X_t, Y)))
    Y_p = dot(X, coeff)
    Y_mean = mean(Y)
    SST = array([(i - Y_mean) ** 2 for i in Y]).sum()
    SSR = array([(i - j) ** 2 for i, j in zip(Y, Y_p)]).sum()
    COD = (1 - (SSR / SST)) * 100.0
    av_error = (SSR / len(Y))
    return {'COD': COD, 'coeff': coeff, 'error': av_error}

# Test this function with the generated data

inputs, outputs = generate_data()
print(multiple_linear_regression(inputs, outputs))


{'COD': 100.0, 'coeff': array([ 0.4, -0.3,  0.2, -0.1]), 'error': 1.1712154761536559e-31}


# Terminating condition

We will terminate the algorithm when the coefficient of determination is greater than 99.9% or when the maximum number of generations is reached.

In [40]:
def check_termination_condition(best_individual):
    if ((best_individual['COD'] >= 99.9)
            or (generation_count == max_generations)):
        return True
    else:
        return False

# Creating an individual

We will create an individual as a list of random numbers. The length of the list will be equal to the number of independent variables (4 in this case).

An 'individual' is a potential solution to the problem. In this case, an individual is a set of coefficients for the multiple linear regression problem.

In [41]:
def create_individual(individual_size):
    return [random() for i in range(individual_size)]

# Twat this function

individual_size_test = 4
print(create_individual(individual_size_test))


[0.726559972858926, 0.8489435008408955, 0.18333288175468854, 0.2650957246691381]


# Creating a population

We will create a population as a list of individuals. The length of the list will be equal to the population size (1000 in this case).

A 'population' is a collection of individuals. In this case, a population is a collection of possible sets of coefficients for the multiple linear regression problem.

In [42]:
def create_population(individual_size, population_size):
    return [create_individual(individual_size) for i in range(population_size)]

# test this function

individual_size_test = 4
population_size_test = 3
print(create_population(individual_size_test, population_size_test))

[[0.336150347687028, 0.8296016633148237, 0.02217210111188639, 0.6891906025729413], [0.7105531358572809, 0.8398830637808755, 0.6665039054700508, 0.06527176757619568], [0.40814863410501323, 0.6146147031944521, 0.7909646137832319, 0.733381038511307]]


# Evaluating the fitness

We will evaluate the fitness of an individual using the coefficient of determination. The fitness will be a dictionary with the following keys:

- COD: The coefficient of determination
- error: The average error

In [43]:
def get_fitness(individual, inputs):
    predicted_outputs = dot(array(inputs), array(individual))
    output_mean = mean(outputs)
    SST = array(
        [(i - output_mean) ** 2 for i in outputs]
        ).sum()
    SSR = array(
        [(i - j) ** 2 for i, j in zip(outputs, predicted_outputs)]
        ).sum()
    COD = (1 - (SSR / SST)) * 100.0
    average_error = (SSR / len(outputs))
    return {'COD': COD, 'error': average_error, 'coeff': individual}


# Evaluating the (current) population

We will evaluate the fitness of all the individuals in the population.

The steps are as follows:

- We will use the `get_fitness` function to evaluate the fitness of each individual in the population.  Remember that the `get_fitness` function returns a dictionary with the following keys:
    - COD: The coefficient of determination
    - error: The average error
- So, we will get a list of dictionaries. We will sort this list based on the error. 
- The individuals with the least error will be the best individuals
- We will store the best individual in a list called `best_individuals_stash`
- We will return the best individuals (the individuals with the least error) as the output of this function ... we will use these individuals to create the next generation.  The variable called selection_size will determine the number of best individuals that will be returned.

In [44]:
# tqdm is used to display a progress bar

def evaluate_population(population):
    fitness_list = [get_fitness(individual, inputs)
                    for individual in tqdm(population)]
    error_list = sorted(fitness_list, key=lambda i: i['error'])
    best_individuals = error_list[: selection_size]
    best_individuals_stash.append(best_individuals[0]['coeff'])
    print('Error: ', best_individuals[0]['error'],
          'COD: ', best_individuals[0]['COD'])
    return best_individuals

# Crossover and mutation

We will use the following steps to create the next generation:

- We will select two parents from the best individuals
- We will create a child by randomly selecting half the genes from the first parent and the other half from the second parent
-   We will mutate the child by randomly changing some of the genes
-  We will repeat the above steps until we have the required number of children

In [45]:
def crossover(parent_1, parent_2):
    child = {}
    loci = [i for i in range(0, individual_size)]
    loci_1 = sample(loci, floor(0.5*(individual_size)))
    loci_2 = [i for i in loci if i not in loci_1]
    chromosome_1 = [[i, parent_1['coeff'][i]] for i in loci_1]
    chromosome_2 = [[i, parent_2['coeff'][i]] for i in loci_2]
    child.update({key: value for (key, value) in chromosome_1})
    child.update({key: value for (key, value) in chromosome_2})
    return [child[i] for i in loci]

# Mutation

We will mutate the child by randomly changing some of the genes. The steps are as follows:

- We will randomly select some genes to mutate
- We will randomly change the value of the selected genes
- We will return the mutated child

In [46]:
def mutate(individual):
    loci = [i for i in range(0, individual_size)]
    no_of_genes_mutated = floor(probability_of_gene_mutating*individual_size)
    loci_to_mutate = sample(loci, no_of_genes_mutated)
    for locus in loci_to_mutate:
        gene_transform = choice([-1, 1])
        change = gene_transform*random()
        individual[locus] = individual[locus] + change
    return individual

# Creating the next generation

We will create the next generation using the following steps:

- We will select the best individuals from the current population
- We will create a new population by crossing over the selected individuals
- We will mutate some of the individuals in the new population
- We will return the new population

In [47]:
def get_new_generation(selected_individuals):
    parent_pairs = [sample(selected_individuals, 2)
                    for i in range(population_size)]
    offspring = [crossover(pair[0], pair[1]) for pair in parent_pairs]
    offspring_indices = [i for i in range(population_size)]
    offspring_to_mutate = sample(
        offspring_indices,
        floor(probability_of_individual_mutating*population_size)
    )
    mutated_offspring = [[i, mutate(offspring[i])]
                         for i in offspring_to_mutate]
    for child in mutated_offspring:
        offspring[child[0]] = child[1]
    return offspring

# Main Program

We will use the following steps to create the next generation:

- We will create an initial population
- We will set some parameters for the genetic algorithm: population size (1000), selection size (10% of population), maximum number of generations (100), probability of individual mutating (10%), and probability of gene mutating (25%)
- We will run the genetic algorithm until the termination condition is met

In [48]:
# Generate initial population
inputs, outputs = generate_data()

# Set parameters
individual_size = len(inputs[0])
population_size = 1000
selection_size = floor(0.1*population_size)
max_generations = 100
probability_of_individual_mutating = 0.1
probability_of_gene_mutating = 0.25

# Run the genetic algorithm
best_possible = multiple_linear_regression(inputs, outputs)
best_individuals_stash = [create_individual(individual_size)]
initial_population = create_population(individual_size, 1000)
current_population = initial_population
termination = False
generation_count = 0
while termination is False:
    current_best_individual = get_fitness(best_individuals_stash[-1], inputs)
    print('Generation: ', generation_count)
    best_individuals = evaluate_population(current_population)
    current_population = get_new_generation(best_individuals)
    termination = check_termination_condition(current_best_individual)
    generation_count += 1
else:
    print(get_fitness(best_individuals_stash[-1], inputs))

Generation:  0


100%|██████████| 1000/1000 [00:00<00:00, 1486.25it/s]


Error:  0.043889053754070165 COD:  -80.30512364441604
Generation:  1


100%|██████████| 1000/1000 [00:00<00:00, 1530.29it/s]


Error:  0.01065633553500699 COD:  56.22161491559534
Generation:  2


100%|██████████| 1000/1000 [00:00<00:00, 1512.66it/s]


Error:  0.006917151731005501 COD:  71.58293944739077
Generation:  3


100%|██████████| 1000/1000 [00:00<00:00, 1505.38it/s]


Error:  0.0038685721815888475 COD:  84.1071218022321
Generation:  4


100%|██████████| 1000/1000 [00:00<00:00, 1511.32it/s]


Error:  0.0023203436794480056 COD:  90.46755812133193
Generation:  5


100%|██████████| 1000/1000 [00:00<00:00, 1495.53it/s]


Error:  0.001912595569100308 COD:  92.14266995819173
Generation:  6


100%|██████████| 1000/1000 [00:00<00:00, 1504.09it/s]


Error:  0.0014939000795506044 COD:  93.86275584647812
Generation:  7


100%|██████████| 1000/1000 [00:00<00:00, 1511.00it/s]


Error:  0.0007023119233588457 COD:  97.11476034804188
Generation:  8


100%|██████████| 1000/1000 [00:00<00:00, 1502.28it/s]


Error:  0.0007023119233588457 COD:  97.11476034804188
Generation:  9


100%|██████████| 1000/1000 [00:00<00:00, 1513.37it/s]


Error:  0.0007023119233588457 COD:  97.11476034804188
Generation:  10


100%|██████████| 1000/1000 [00:00<00:00, 1486.89it/s]


Error:  0.0007023119233588457 COD:  97.11476034804188
Generation:  11


100%|██████████| 1000/1000 [00:00<00:00, 1490.18it/s]


Error:  0.0007023119233588457 COD:  97.11476034804188
Generation:  12


100%|██████████| 1000/1000 [00:00<00:00, 1510.33it/s]


Error:  0.0007023119233588457 COD:  97.11476034804188
Generation:  13


100%|██████████| 1000/1000 [00:00<00:00, 1526.87it/s]


Error:  0.0006391815894017407 COD:  97.37411254856167
Generation:  14


100%|██████████| 1000/1000 [00:00<00:00, 1515.88it/s]


Error:  0.0006391815894017407 COD:  97.37411254856167
Generation:  15


100%|██████████| 1000/1000 [00:00<00:00, 1405.37it/s]


Error:  0.0006391815894017407 COD:  97.37411254856167
Generation:  16


100%|██████████| 1000/1000 [00:00<00:00, 1449.61it/s]


Error:  0.0005394682575568668 COD:  97.78375511520323
Generation:  17


100%|██████████| 1000/1000 [00:00<00:00, 1436.70it/s]


Error:  0.0005394682575568668 COD:  97.78375511520323
Generation:  18


100%|██████████| 1000/1000 [00:00<00:00, 1471.69it/s]


Error:  0.0005394682575568668 COD:  97.78375511520323
Generation:  19


100%|██████████| 1000/1000 [00:00<00:00, 1505.33it/s]


Error:  0.00015924161463548353 COD:  99.34580318871595
Generation:  20


100%|██████████| 1000/1000 [00:00<00:00, 1524.90it/s]


Error:  0.00015924161463548353 COD:  99.34580318871595
Generation:  21


100%|██████████| 1000/1000 [00:00<00:00, 1505.43it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  22


100%|██████████| 1000/1000 [00:00<00:00, 1500.16it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  23


100%|██████████| 1000/1000 [00:00<00:00, 1472.45it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  24


100%|██████████| 1000/1000 [00:00<00:00, 1508.22it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  25


100%|██████████| 1000/1000 [00:00<00:00, 1483.33it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  26


100%|██████████| 1000/1000 [00:00<00:00, 1498.26it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  27


100%|██████████| 1000/1000 [00:00<00:00, 1466.88it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  28


100%|██████████| 1000/1000 [00:00<00:00, 1495.31it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  29


100%|██████████| 1000/1000 [00:00<00:00, 1488.02it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  30


100%|██████████| 1000/1000 [00:00<00:00, 1441.28it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  31


100%|██████████| 1000/1000 [00:00<00:00, 1454.49it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  32


100%|██████████| 1000/1000 [00:00<00:00, 1501.03it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  33


100%|██████████| 1000/1000 [00:00<00:00, 1507.51it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  34


100%|██████████| 1000/1000 [00:00<00:00, 1503.04it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  35


100%|██████████| 1000/1000 [00:00<00:00, 1503.57it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  36


100%|██████████| 1000/1000 [00:00<00:00, 1520.91it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  37


100%|██████████| 1000/1000 [00:00<00:00, 1441.52it/s]


Error:  0.00015726733031985212 COD:  99.35391394862509
Generation:  38


100%|██████████| 1000/1000 [00:00<00:00, 1488.23it/s]


Error:  0.00012098016365750985 COD:  99.50298898014482
Generation:  39


100%|██████████| 1000/1000 [00:00<00:00, 1516.83it/s]


Error:  0.00012098016365750985 COD:  99.50298898014482
Generation:  40


100%|██████████| 1000/1000 [00:00<00:00, 1435.10it/s]


Error:  0.00012098016365750985 COD:  99.50298898014482
Generation:  41


100%|██████████| 1000/1000 [00:00<00:00, 1498.76it/s]


Error:  0.00011423239105639205 COD:  99.5307101969199
Generation:  42


100%|██████████| 1000/1000 [00:00<00:00, 1495.29it/s]


Error:  0.00011423239105639205 COD:  99.5307101969199
Generation:  43


100%|██████████| 1000/1000 [00:00<00:00, 1479.61it/s]


Error:  0.00011423239105639205 COD:  99.5307101969199
Generation:  44


100%|██████████| 1000/1000 [00:00<00:00, 1485.45it/s]


Error:  7.784637501385753e-05 COD:  99.68019132171786
Generation:  45


100%|██████████| 1000/1000 [00:00<00:00, 1472.67it/s]


Error:  7.1282520493407e-05 COD:  99.70715696576035
Generation:  46


100%|██████████| 1000/1000 [00:00<00:00, 1451.41it/s]


Error:  6.828476431260761e-05 COD:  99.71947235542136
Generation:  47


100%|██████████| 1000/1000 [00:00<00:00, 1509.62it/s]


Error:  4.001834747545355e-05 COD:  99.83559652185625
Generation:  48


100%|██████████| 1000/1000 [00:00<00:00, 1475.79it/s]


Error:  4.001834747545355e-05 COD:  99.83559652185625
Generation:  49


100%|██████████| 1000/1000 [00:00<00:00, 1480.86it/s]


Error:  4.001834747545355e-05 COD:  99.83559652185625
Generation:  50


100%|██████████| 1000/1000 [00:00<00:00, 1477.91it/s]


Error:  4.001834747545355e-05 COD:  99.83559652185625
Generation:  51


100%|██████████| 1000/1000 [00:00<00:00, 1493.05it/s]


Error:  4.001834747545355e-05 COD:  99.83559652185625
Generation:  52


100%|██████████| 1000/1000 [00:00<00:00, 1467.11it/s]


Error:  4.001834747545355e-05 COD:  99.83559652185625
Generation:  53


100%|██████████| 1000/1000 [00:00<00:00, 1449.88it/s]


Error:  3.129807459192248e-05 COD:  99.8714211693706
Generation:  54


100%|██████████| 1000/1000 [00:00<00:00, 1526.48it/s]


Error:  3.129807459192248e-05 COD:  99.8714211693706
Generation:  55


100%|██████████| 1000/1000 [00:00<00:00, 1439.38it/s]


Error:  3.129807459192248e-05 COD:  99.8714211693706
Generation:  56


100%|██████████| 1000/1000 [00:00<00:00, 1470.95it/s]


Error:  3.129807459192248e-05 COD:  99.8714211693706
Generation:  57


100%|██████████| 1000/1000 [00:00<00:00, 1501.42it/s]


Error:  3.129807459192248e-05 COD:  99.8714211693706
Generation:  58


100%|██████████| 1000/1000 [00:00<00:00, 1487.18it/s]


Error:  3.129807459192248e-05 COD:  99.8714211693706
Generation:  59


100%|██████████| 1000/1000 [00:00<00:00, 1505.50it/s]


Error:  2.970001406781938e-05 COD:  99.87798632573065
Generation:  60


100%|██████████| 1000/1000 [00:00<00:00, 1487.55it/s]


Error:  2.970001406781938e-05 COD:  99.87798632573065
Generation:  61


100%|██████████| 1000/1000 [00:00<00:00, 1445.58it/s]


Error:  1.5370266380925453e-05 COD:  99.93685583207628
Generation:  62


100%|██████████| 1000/1000 [00:00<00:00, 1439.70it/s]

Error:  1.5370266380925453e-05 COD:  99.93685583207628
{'COD': 99.93685583207628, 'error': 1.5370266380925453e-05, 'coeff': [0.4047252278048127, -0.2979875966258534, 0.20430589547127287, -0.10493415026627206]}



