In [24]:
from random import random, sample, choice
from math import floor
from tqdm import tqdm
from numpy import array, dot, mean
from numpy.linalg import pinv

# Genetic Algorithm for Multiple Linear Regression

We have studied linear regression in the previous chapters. In this chapter, we will use a genetic algorithm to find the best fit for a multiple linear regression problem.

## Problem Statement

We will generate a dataset with four independent variables and one dependent variable. The independent variables will be generated randomly. The dependent variable will be generated using the following equation:

$$y = 0.4x_1 - 0.3x_2 + 0.2x_3 - 0.1x_4$$

In [25]:
# Generate data - 1000 data points with 4 independent variables
#
# x will be a 1000 x 4 matrix
# y will be a 1000 x 1 matrix

def generate_data(num_data_points=1000):
    coeff = [0.4, -0.3, 0.2, -0.1]
    x = [[random() for j in range(len(coeff))] for i in range(num_data_points)]
    y = [dot(i, coeff) for i in x]
    return array(x), array(y)

# Generating a coefficient of determination

We want to compare the performance of the genetic algorithm with the best possible solution. We will use the coefficient of determination to compare the performance of the genetic algorithm with the best possible solution. The coefficient of determination is defined as:
    
$$COD = 1 - \frac{SSR}{SST}$$

where $SSR$ is the sum of squared residuals and $SST$ is the total sum of squares. The $SSR$ is defined as:

$$SSR = \sum_{i=1}^{n}(y_i - \hat{y_i})^2$$

where $y_i$ is the actual value of the dependent variable and $\hat{y_i}$ is the predicted value of the dependent variable. The $SST$ is defined as:

$$SST = \sum_{i=1}^{n}(y_i - \bar{y})^2$$

where $\bar{y}$ is the mean of the dependent variable.

In [26]:
# inputs is a 1000 x 4 matrix
# outputs is a 1000 x 1 matrix
#
# These are the inputs and outputs generated in the previous step

def multiple_linear_regression(inputs, outputs):
    X, Y = array(inputs), array(outputs)
    X_t, Y_t = X.transpose(), Y.transpose()
    coeff = dot((pinv((dot(X_t, X)))), (dot(X_t, Y)))
    Y_p = dot(X, coeff)
    Y_mean = mean(Y)
    SST = array([(i - Y_mean) ** 2 for i in Y]).sum()
    SSR = array([(i - j) ** 2 for i, j in zip(Y, Y_p)]).sum()
    COD = (1 - (SSR / SST)) * 100.0
    av_error = (SSR / len(Y))
    return {'COD': COD, 'coeff': coeff, 'error': av_error}

# Test this function with the generated data

inputs, outputs = generate_data()
print(multiple_linear_regression(inputs, outputs))


{'COD': 100.0, 'coeff': array([ 0.4, -0.3,  0.2, -0.1]), 'error': 2.1958920555810363e-30}


# Terminating condition

We will terminate the algorithm when the coefficient of determination is greater than 99.9% or when the maximum number of generations is reached.

In [27]:
def check_termination_condition(best_individual):
    if ((best_individual['COD'] >= 99.9)
            or (generation_count == max_generations)):
        return True
    else:
        return False

# Creating an individual

We will create an individual as a list of random numbers. The length of the list will be equal to the number of independent variables (4 in this case).

An 'individual' is a potential solution to the problem. In this case, an individual is a set of coefficients for the multiple linear regression problem.

In [28]:
def create_individual(individual_size):
    return [random() for i in range(individual_size)]

# Twat this function

individual_size_test = 4
print(create_individual(individual_size_test))


[0.07534345334034465, 0.9686574624790423, 0.6805131319959843, 0.7853769303619385]


# Creating a population

We will create a population as a list of individuals. The length of the list will be equal to the population size (1000 in this case).

A 'population' is a collection of individuals. In this case, a population is a collection of possible sets of coefficients for the multiple linear regression problem.

In [29]:
def create_population(individual_size, population_size):
    return [create_individual(individual_size) for i in range(population_size)]

# test this function

individual_size_test = 4
population_size_test = 3
print(create_population(individual_size_test, population_size_test))

[[0.07240200837911603, 0.8504505475910018, 0.9541438156731749, 0.7781262208958655], [0.36020532649772374, 0.3703028622582675, 0.3419847551080487, 0.7439211582733876], [0.7393500842479124, 0.9836237958580762, 0.2158781487118585, 0.31278908782579096]]


# Evaluating the fitness

We will evaluate the fitness of an individual using the coefficient of determination. The fitness will be a dictionary with the following keys:

- COD: The coefficient of determination
- error: The average error

In [30]:
def get_fitness(individual, inputs):
    predicted_outputs = dot(array(inputs), array(individual))
    output_mean = mean(outputs)
    SST = array(
        [(i - output_mean) ** 2 for i in outputs]
        ).sum()
    SSR = array(
        [(i - j) ** 2 for i, j in zip(outputs, predicted_outputs)]
        ).sum()
    COD = (1 - (SSR / SST)) * 100.0
    average_error = (SSR / len(outputs))
    return {'COD': COD, 'error': average_error, 'coeff': individual}


# Evaluating the (current) population

We will evaluate the fitness of all the individuals in the population.

The steps are as follows:

- We will use the `get_fitness` function to evaluate the fitness of each individual in the population.  Remember that the `get_fitness` function returns a dictionary with the following keys:
    - COD: The coefficient of determination
    - error: The average error
- So, we will get a list of dictionaries. We will sort this list based on the error. 
- The individuals with the least error will be the best individuals
- We will store the best individual in a list called `best_individuals_stash`
- We will return the best individuals (the individuals with the least error) as the output of this function ... we will use these individuals to create the next generation.  The variable called selection_size will determine the number of best individuals that will be returned.

In [32]:
# tqdm is used to display a progress bar

def evaluate_population(population):
    fitness_list = [get_fitness(individual, inputs)
                    for individual in tqdm(population)]
    error_list = sorted(fitness_list, key=lambda i: i['error'])
    best_individuals = error_list[: selection_size]
    best_individuals_stash.append(best_individuals[0]['coeff'])
    print('Error: ', best_individuals[0]['error'],
          'COD: ', best_individuals[0]['COD'])
    return best_individuals

# Crossover and mutation

We will use the following steps to create the next generation:

- We will select two parents from the best individuals
- We will create a child by randomly selecting half the genes from the first parent and the other half from the second parent
-   We will mutate the child by randomly changing some of the genes
-  We will repeat the above steps until we have the required number of children

In [33]:
def crossover(parent_1, parent_2):
    child = {}
    loci = [i for i in range(0, individual_size)]
    loci_1 = sample(loci, floor(0.5*(individual_size)))
    loci_2 = [i for i in loci if i not in loci_1]
    chromosome_1 = [[i, parent_1['coeff'][i]] for i in loci_1]
    chromosome_2 = [[i, parent_2['coeff'][i]] for i in loci_2]
    child.update({key: value for (key, value) in chromosome_1})
    child.update({key: value for (key, value) in chromosome_2})
    return [child[i] for i in loci]

# Mutation

We will mutate the child by randomly changing some of the genes. The steps are as follows:

- We will randomly select some genes to mutate
- We will randomly change the value of the selected genes
- We will return the mutated child

In [34]:
def mutate(individual):
    loci = [i for i in range(0, individual_size)]
    no_of_genes_mutated = floor(probability_of_gene_mutating*individual_size)
    loci_to_mutate = sample(loci, no_of_genes_mutated)
    for locus in loci_to_mutate:
        gene_transform = choice([-1, 1])
        change = gene_transform*random()
        individual[locus] = individual[locus] + change
    return individual

# Creating the next generation

We will create the next generation using the following steps:

- We will select the best individuals from the current population
- We will create a new population by crossing over the selected individuals
- We will mutate some of the individuals in the new population
- We will return the new population

In [35]:
def get_new_generation(selected_individuals):
    parent_pairs = [sample(selected_individuals, 2)
                    for i in range(population_size)]
    offspring = [crossover(pair[0], pair[1]) for pair in parent_pairs]
    offspring_indices = [i for i in range(population_size)]
    offspring_to_mutate = sample(
        offspring_indices,
        floor(probability_of_individual_mutating*population_size)
    )
    mutated_offspring = [[i, mutate(offspring[i])]
                         for i in offspring_to_mutate]
    for child in mutated_offspring:
        offspring[child[0]] = child[1]
    return offspring

# Main Program

We will use the following steps to create the next generation:

- We will create an initial population
- We will set some parameters for the genetic algorithm: population size (1000), selection size (10% of population), maximum number of generations (100), probability of individual mutating (10%), and probability of gene mutating (25%)
- We will run the genetic algorithm until the termination condition is met

In [36]:
# Generate initial population
inputs, outputs = generate_data()

# Set parameters
individual_size = len(inputs[0])
population_size = 1000
selection_size = floor(0.1*population_size)
max_generations = 100
probability_of_individual_mutating = 0.1
probability_of_gene_mutating = 0.25

# Run the genetic algorithm
best_possible = multiple_linear_regression(inputs, outputs)
best_individuals_stash = [create_individual(individual_size)]
initial_population = create_population(individual_size, 1000)
current_population = initial_population
termination = False
generation_count = 0
while termination is False:
    current_best_individual = get_fitness(best_individuals_stash[-1], inputs)
    print('Generation: ', generation_count)
    best_individuals = evaluate_population(current_population)
    current_population = get_new_generation(best_individuals)
    termination = check_termination_condition(current_best_individual)
    generation_count += 1
else:
    print(get_fitness(best_individuals_stash[-1], inputs))

Generation:  0


100%|██████████| 1000/1000 [00:00<00:00, 1476.71it/s]


Error:  0.03214711738316933 COD:  -26.958090408859235
Generation:  1


100%|██████████| 1000/1000 [00:00<00:00, 1489.01it/s]


Error:  0.005505603037824798 COD:  78.256811026005
Generation:  2


100%|██████████| 1000/1000 [00:00<00:00, 1473.05it/s]


Error:  0.0029317062411471635 COD:  88.42185998889448
Generation:  3


100%|██████████| 1000/1000 [00:00<00:00, 1458.12it/s]


Error:  0.0035157855530253183 COD:  86.11516501529806
Generation:  4


100%|██████████| 1000/1000 [00:00<00:00, 1439.73it/s]


Error:  0.0020291939812862258 COD:  91.98613705040539
Generation:  5


100%|██████████| 1000/1000 [00:00<00:00, 1438.28it/s]


Error:  0.001008875711328637 COD:  96.01566348100555
Generation:  6


100%|██████████| 1000/1000 [00:00<00:00, 1455.71it/s]


Error:  0.00045948867755275547 COD:  98.18534880215631
Generation:  7


100%|██████████| 1000/1000 [00:00<00:00, 1431.22it/s]


Error:  4.1083613607104885e-05 COD:  99.83774914968319
Generation:  8


100%|██████████| 1000/1000 [00:00<00:00, 1367.64it/s]


Error:  1.4492579385871958e-05 COD:  99.94276469078088
Generation:  9


100%|██████████| 1000/1000 [00:00<00:00, 1477.28it/s]

Error:  1.4492579385871958e-05 COD:  99.94276469078088
{'COD': 99.94276469078088, 'error': 1.4492579385871958e-05, 'coeff': [0.40476285457624106, -0.3105118815735305, 0.20517931408381418, -0.09761093583429781]}



