In [56]:
import pandas as pd
import numpy as np
import random

pd.set_option('display.width',1000)

## Data generation

In [57]:
def generate_weights_and_values(num_items, weight_range, value_range):
    weights = np.round(np.random.uniform(*weight_range, num_items), 1)
    values = np.random.randint(*value_range, num_items)
    return pd.DataFrame({
        'Name': [str(i) for i in range(num_items)],
        'Weight': weights,
        'Value': values
    })

df_addresses = pd.read_csv("full_addresses.csv")
df = generate_weights_and_values(len(df_addresses), (2, 10), (1, 5 + 1))
df = df.join(df_addresses)
df_20 = df.head(20)

print(df_20.head(4))

  Name  Weight  Value       address_line_1                          city  postcode   latitude  longitude
0    0     4.3      3    15 St Marys Close            North Lincolnshire  DN17 4NP  53.609854  -0.825200
1    1     4.6      4           Park Place  Bath and North East Somerset  BS39 4EQ  51.362625  -2.575737
2    2     4.4      4  128 Pershore Avenue                      Bradford   BD7 3JE  53.785587  -1.779740
3    3     7.4      4         6 Gala Drive                        Slough   SL1 5UB  51.510894  -0.644514


## Genetic Algorithm

### Pure python implementation

In [58]:
def genetic_algorithm(df, max_weight=30, population_size=100, num_generations=50, mutation_rate=0.5):
    weights = df["Weight"].tolist()
    values = df["Value"].tolist()
    n = len(values)
    
    # Normalise mutations, so that longer lists dont become more mutated.
    mutation_rate /= n

    average_weight = sum(weights) / n
    expected_items = max_weight / average_weight
    bit_flip_probability = expected_items / n

    # Initialise the population so that each individual has roughly the correct weight
    population = [[1 if random.random() < bit_flip_probability else 0 for _ in range(n)]
                  for _ in range(population_size)]
    
    def get_total_value(individual):
        return sum(value if individual[i] else 0 for i, value in enumerate(values))
    
    def get_total_weight(individual):
        return sum(weight if individual[i] else 0 for i, weight in enumerate(weights))

    def fitness(individual):
        total_weight = get_total_weight(individual)
        total_value = get_total_value(individual)
        if total_weight > max_weight:
            return 0
        return total_value

    def tournament_selection(population, k=3):
        return max(random.sample(population, k), key=fitness)

    def crossover(parent1, parent2):
        crossover_point = random.randint(1, len(parent1) - 1)
        return parent1[:crossover_point] + parent2[crossover_point:], parent2[:crossover_point] + parent1[crossover_point:]

    def mutate(individual):
        for i in range(len(individual)):
            if random.random() < mutation_rate:
                individual[i] = 1 - individual[i] # Flip bit
        return individual

    for _ in range(num_generations):
        new_population = []
        for _ in range(population_size // 2):
            parent1 = tournament_selection(population)
            parent2 = tournament_selection(population)
            offspring1, offspring2 = crossover(parent1, parent2)
            new_population.append(mutate(offspring1))
            new_population.append(mutate(offspring2))
        population = new_population

    best = max(population, key=fitness)
    return df[[bool(bit) for bit in best]], get_total_value(best), get_total_weight(best)


In [59]:
def show_genetic_algorithm(df, n_runs=4,
                           population_size=100, num_generations=50, mutation_rate=0.2):
    for _ in range(n_runs):
        items, value, weight = genetic_algorithm(df, population_size=population_size, num_generations=num_generations, mutation_rate=mutation_rate)
        print(f"Items: {', '.join(items['Name'].tolist())}")
        print(f"Total weight: {weight:.1f}")
        print(f"Total value: {value}")
        print()


In [60]:
show_genetic_algorithm(df_20)

Items: 2, 6, 8, 9, 16, 17, 19
Total weight: 28.4
Total value: 29

Items: 1, 2, 6, 8, 11, 16, 17, 19
Total weight: 29.9
Total value: 30

Items: 1, 2, 6, 8, 11, 16, 17, 19
Total weight: 29.9
Total value: 30

Items: 1, 6, 8, 9, 16, 17, 19
Total weight: 28.6
Total value: 29



In [61]:
ga_time_python = %timeit -o genetic_algorithm(df)

435 ms ± 28.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Numpy implementation

In [62]:
# Rewrite using numpy
def genetic_algorithm(df, max_weight=30, population_size=100, num_generations=50, mutation_rate=0.5):
    weights = df["Weight"].to_numpy()
    values = df["Value"].to_numpy()
    n = len(values)
    
    # Normalise mutations, so that longer lists dont become more mutated.
    mutation_rate /= n

    average_weight = np.mean(weights)
    expected_items = max_weight / average_weight
    bit_flip_probability = expected_items / n

    # Initialise the population so that each individual has roughly the correct weight
    population = np.random.rand(population_size, n) < bit_flip_probability
    
    def fitness(individual):
        total_weight = np.dot(weights, individual)
        total_value = np.dot(values, individual)
        return total_value if total_weight <= max_weight else 0

    def tournament_selection(population, k=3):
        return max(random.sample(list(population), k), key=fitness)

    def crossover(parent1, parent2):
        crossover_point = random.randint(1, n - 1)
        offspring1 = np.concatenate([parent1[:crossover_point], parent2[crossover_point:]])
        offspring2 = np.concatenate([parent2[:crossover_point], parent1[crossover_point:]])
        return offspring1, offspring2

    def mutate(individual):
        mutation_mask = np.random.rand(n) < mutation_rate
        individual[mutation_mask] = 1 - individual[mutation_mask] # Flip bit
        return individual

    for _ in range(num_generations):
        new_population = []
        for _ in range(population_size // 2):
            parent1 = tournament_selection(population)
            parent2 = tournament_selection(population)
            offspring1, offspring2 = crossover(parent1, parent2)
            new_population.append(mutate(offspring1))
            new_population.append(mutate(offspring2))
        population = np.array(new_population)

    best = max(population, key=fitness)
    best_items = df.iloc[best.astype(bool)]
    return best_items, np.dot(values, best), np.dot(weights, best)

In [63]:
# Not much faster with 200 items, but gets significantly faster with thousands of items.
ga_time_numpy = %timeit -o genetic_algorithm(df)
print(f"{ga_time_python.best/ga_time_numpy.best:.2f}x faster")

200 ms ± 3.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
2.14x faster


### Mutation rate tests

In [64]:
n_runs = 10
for mutation_rate in (0, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100):
    _, values, weights = list(zip(*[genetic_algorithm(df, mutation_rate=mutation_rate) for _ in range(n_runs)]))
    print(f"Mutation rate: {mutation_rate}".ljust(22) + 
        f"Max weight: {max(weights):.1f}  "
        f"Average value: {sum(values)/n_runs:.1f}")

Mutation rate: 0      Max weight: 30.0  Average value: 37.0
Mutation rate: 0.001  Max weight: 29.8  Average value: 40.0
Mutation rate: 0.005  Max weight: 30.0  Average value: 39.2
Mutation rate: 0.01   Max weight: 30.0  Average value: 39.0
Mutation rate: 0.05   Max weight: 30.0  Average value: 38.7
Mutation rate: 0.1    Max weight: 29.9  Average value: 40.5
Mutation rate: 0.5    Max weight: 29.8  Average value: 48.1
Mutation rate: 1      Max weight: 29.9  Average value: 44.2
Mutation rate: 5      Max weight: 616.8  Average value: 290.8
Mutation rate: 10     Max weight: 662.1  Average value: 309.3
Mutation rate: 50     Max weight: 617.8  Average value: 315.6
Mutation rate: 100    Max weight: 645.8  Average value: 316.9


- Somewhere between the mutation rates of 1 and 5, every member of the population becomes overweight.
- ~0.5 looks optimal.

### Actual Usage

In [66]:
chosen_items, total_value, total_weight = genetic_algorithm(df_20)
print(f"Total weight: {total_weight:.1f}. Total value: {total_value}")
print()
print(chosen_items)

Total weight: 28.8. Total value: 29

   Name  Weight  Value       address_line_1                          city  postcode   latitude  longitude
1     1     4.6      4           Park Place  Bath and North East Somerset  BS39 4EQ  51.362625  -2.575737
2     2     4.4      4  128 Pershore Avenue                      Bradford   BD7 3JE  53.785587  -1.779740
8     8     2.9      3     5 Stryd Llewelyn                     Guildford  GU12 5EH  51.261701  -0.721138
9     9     5.5      4       4 Queens Grove                     Maidstone  ME15 9DU  51.244976   0.551896
11   11     2.4      1          4 Viewlands                        Sefton   PR8 3TF  53.595472  -3.044394
16   16     2.4      4        203 Grangeway                    Nottingham   NG5 2JR  52.980847  -1.147772
17   17     4.2      5        49 Kingscroft                  East Suffolk  IP17 3DL  52.272931   1.622348
19   19     2.4      4          2 Lych Gate                     Sheffield   S35 2EP  53.466005  -1.475101
