# Genetski algoritam

Definisanje jedinke i metoda unutar klase jedinke

Glavni metod je merge(): on spaja niske redom, uzimajuci u obzir preklapanja, i vraca dobijenu aproksimativnu najkracu nadnisku (SCS) zajedno sa njenom duzinom, koja se koristi kao fitnes jedinke

In [1]:
import random

class Individual:
    """One candidate solution: an ordering of the input strings.

    Attributes:
        arr: the input strings in this individual's order (a shuffled copy).
        scs: superstring produced by merging ``arr`` from left to right.
        fitness: length of ``scs``; a smaller value is a better individual.
    """

    def __init__(self, arr):
        # random.sample returns a new shuffled list, so the caller's list
        # is never modified.
        self.arr = random.sample(arr, len(arr))
        self.scs, self.fitness = self.merge()

    def __lt__(self, other):
        """Compare by fitness so that min() and sort() prefer shorter results."""
        return self.fitness < other.fitness

    def merge(self):
        """Glue the strings of ``arr`` together in order, reusing overlaps.

        Each string is appended to the running result after dropping its
        longest prefix that is already a suffix of the result.
        Example: ["bla", "lad", "dub"] gives ("bladub", 6).

        Returns:
            A ``(superstring, length)`` tuple; ``("", 0)`` when ``arr`` is
            empty.
        """
        if not self.arr:
            # Nothing to merge; an empty superstring lets callers report
            # "not found" instead of crashing on arr[0].
            return "", 0

        solution = self.arr[0]
        for string in self.arr[1:]:
            overlap = 0
            # Probe from the longest possible overlap downwards so the first
            # hit is the maximum and the scan can stop early.
            for j in range(min(len(solution), len(string)), 0, -1):
                if solution.endswith(string[:j]):
                    overlap = j
                    break
            solution += string[overlap:]
        return solution, len(solution)

    def get_fitness(self):
        """Return the cached fitness (length of ``scs``)."""
        return self.fitness

## Pomocne funkcije genetskog algoritma: selection, crossover i mutation

Turnirska selekcija

In [2]:
def selection(population, tournament_size):
    """Tournament selection.

    Draws ``tournament_size`` distinct members of ``population`` at random
    and returns the smallest of them under ``<``.
    """
    return min(random.sample(population, tournament_size))

Ukrstanje prvog reda

In [3]:
def _order_fill(child, keeper, donor, start, end):
    """Build one order-crossover child in place.

    Copies ``keeper[start:end]`` into the same positions of ``child``, then
    fills the remaining positions, starting at ``end`` and wrapping around,
    with the elements of ``donor`` taken in order from ``end`` (also
    wrapping) that are not already accounted for by the copied segment.
    """
    from collections import Counter

    n = len(donor)
    segment = keeper[start:end]
    child[start:end] = segment

    # Count occurrences rather than testing plain membership: when the input
    # contains equal strings, only as many donor copies as the segment holds
    # may be skipped, otherwise the child loses elements.
    pending = Counter(segment)

    slot = end % n if n else 0
    for offset in range(n):
        gene = donor[(end + offset) % n]
        if pending[gene] > 0:
            pending[gene] -= 1
            continue
        child[slot] = gene
        slot = (slot + 1) % n


def crossover(parent1, parent2, child1, child2):
    """Order crossover: overwrite the children's ``arr`` from two parents.

    A segment of ``len(arr) // 2`` consecutive positions, starting at a
    random index in the first half, is copied from ``parent1`` into
    ``child1`` and from ``parent2`` into ``child2``. The remaining positions
    of each child are filled with the other parent's elements in that
    parent's order. The parents are not modified.

    Args:
        parent1, parent2: objects whose ``arr`` lists are read.
        child1, child2: objects whose ``arr`` lists are overwritten in place.
    """
    n = len(parent1.arr)
    segment_size = int(n / 2)
    start_index = int(random.random() * n / 2)
    end_index = start_index + segment_size

    _order_fill(child1.arr, parent1.arr, parent2.arr, start_index, end_index)
    _order_fill(child2.arr, parent2.arr, parent1.arr, start_index, end_index)

Mutacija bazirana na swap-u

In [4]:
def mutation(child, chance):
    """Swap-based mutation applied in place to ``child.arr``.

    Every position is visited once; with probability ``chance`` its element
    is exchanged with the element at a randomly drawn position (which may be
    the same position).
    """
    genes = child.arr
    size = len(genes)
    for pos in range(size):
        if random.random() >= chance:
            continue
        partner = int(random.random() * size)
        genes[pos], genes[partner] = genes[partner], genes[pos]

In [5]:
def contains(arr1, member):
    """Return 1 if some element of ``arr1`` equals ``member``, otherwise 0."""
    return int(any(element == member for element in arr1))

## Glavni genetski algoritam

In [6]:
from copy import deepcopy
import time

def _report_elapsed(start):
    """Print the time elapsed since ``start`` (a time.perf_counter() value)."""
    delta = time.perf_counter() - start
    if delta < 0.0001:
        print("Execution time: <0.0001 seconds.")
    else:
        print("Execution time: " + str(round(delta, 3)) + " seconds.")


def gp(arr, pop_size, num_iter, elite_size, mut_chance, tour_size):
    """Run the genetic algorithm and return the best superstring found.

    Args:
        arr: the strings that the superstring has to cover.
        pop_size: number of individuals in every generation.
        num_iter: number of generations to run.
        elite_size: number of best individuals carried over unchanged. It is
            adjusted by one when needed so that the remaining slots can be
            filled two children at a time.
        mut_chance: per-position probability handed to mutation().
        tour_size: tournament size handed to selection().

    Returns:
        A ``(superstring, length)`` tuple for the best individual of the final
        population, or ``("", 0)`` when ``arr`` is empty.

    Side effects:
        Prints the elapsed execution time.
    """
    # perf_counter is monotonic and intended for measuring short intervals.
    start = time.perf_counter()

    if len(arr) == 0:
        # No strings to combine: report an empty result instead of failing
        # while building the population.
        _report_elapsed(start)
        return "", 0

    # Children are produced in pairs, so the non-elite part must be even.
    # Shrink the elite when possible; grow it from 0 to 1 rather than letting
    # it become -1, which would corrupt the slice and pair indices below.
    if (pop_size - elite_size) % 2 == 1:
        if elite_size > 0:
            elite_size -= 1
        else:
            elite_size += 1

    population = [Individual(arr=arr) for _ in range(pop_size)]
    # Second buffer whose non-elite entries are overwritten with children.
    new_population = [Individual(arr=arr) for _ in range(pop_size)]

    for _ in range(num_iter):
        population.sort()

        new_population[:elite_size] = population[:elite_size]

        for i in range(elite_size, pop_size, 2):
            parent1 = selection(population, tour_size)
            parent2 = selection(population, tour_size)

            child1, child2 = new_population[i], new_population[i + 1]
            crossover(parent1, parent2, child1, child2)

            for child in (child1, child2):
                mutation(child, mut_chance)
                # arr changed in place, so the cached result must be rebuilt.
                child.scs, child.fitness = child.merge()

        # Deep copy so the next generation's parents do not share objects
        # with the buffer that is about to be overwritten.
        population = deepcopy(new_population)

    best = min(population)
    _report_elapsed(start)
    return best.scs, len(best.scs)

Pomocne funkcije za stampanje rezultata

In [7]:
def print_solution(sol):
    """Print the superstring ``sol``, or a "not found" message if it is empty."""
    if len(sol) != 0:
        print("Found shortest superstring is: " + sol)
        return
    print("Shortest superstring not found")
        
def print_size(size):
    """Print the superstring length ``size``, or a "not found" message if it is 0."""
    if size != 0:
        print("The size of the found shortest superstring is: " + str(size))
        return
    print("Shortest superstring not found")

Parametri za genetski algoritam, odabrani eksperimentalno

In [8]:
# Alternative parameter set (currently disabled):
# POPULATION_SIZE = 12
# NO_OF_ITERATIONS = 2000
# ELITISM_SIZE = 8
# MUTATION_CHANCE = 0.005
# TOURNAMENT_SIZE = 4

# Active parameters; the example cells pass them positionally to gp().
POPULATION_SIZE = 30
NO_OF_ITERATIONS = 500
ELITISM_SIZE = 15
MUTATION_CHANCE = 0.005
TOURNAMENT_SIZE = 12

## Primeri

### Primer 1

Dat je niz koji sadrzi 4 niske od po 3 karaktera

In [9]:
# Example input: four 3-character strings.
arr = ["AAB", "BAA", "ABA", "BAB"]

# Run the genetic algorithm with the module-level parameters.
solution, size = gp(arr, POPULATION_SIZE, NO_OF_ITERATIONS, ELITISM_SIZE, MUTATION_CHANCE, TOURNAMENT_SIZE)

# Report both the superstring and its length.
print_solution(solution)
print_size(size)

Execution time: 0.168 seconds.
Found shortest superstring is: BAABAB
The size of the found shortest superstring is: 6


### Primer 2

Dat je niz koji sadrzi 5 niski od po 4 karaktera

In [10]:
# Example input: five 4-character strings.
arr = ["bloa", "bubl", "gabl", "abpo", "ublm"]

# Run the genetic algorithm with the module-level parameters.
solution, size = gp(arr, POPULATION_SIZE, NO_OF_ITERATIONS, ELITISM_SIZE, MUTATION_CHANCE, TOURNAMENT_SIZE)

# Report both the superstring and its length.
print_solution(solution)
print_size(size)

Execution time: 0.177 seconds.
Found shortest superstring is: gabloabpobublm
The size of the found shortest superstring is: 14


### Primer 3

Dat je niz koji sadrzi 20 niski od po 4 karaktera

In [11]:
# Example input: twenty 4-character strings.
arr = ["wobj" , "bfqp", "pzlb", "rfcs", "atha", 
       "npjp", "tfgu", "izjx", "dven", "tksn", 
       "fqws", "cusc", "qlpy", "fepk", "cbzj", 
       "ecrx", "cpsp", "zqdp", "liqu", "rdyu"]

# Run the genetic algorithm with the module-level parameters.
solution, size = gp(arr, POPULATION_SIZE, NO_OF_ITERATIONS, ELITISM_SIZE, MUTATION_CHANCE, TOURNAMENT_SIZE)

# Only the length is reported; the call printing the superstring is disabled.
print_size(size)
# print_solution(solution)

Execution time: 0.366 seconds.
The size of the found shortest superstring is: 76


Kod ovog primera se moze primetiti da genetski algoritam u vecini slucajeva (ali ne uvek) dobija optimalno resenje

### Slozeniji primeri

Pomocna funkcija za ucitavanje niski iz .txt fajla u niz

In [12]:
def LoadStringsFromTxt(filename):
    """Read a text file and return its lines as a list of strings.

    Args:
        filename: path of the file to read; one string per line is expected.

    Returns:
        The lines of the file without their newline characters. An empty
        file yields an empty list.
    """
    with open(filename) as f:
        data = f.read().split("\n")
    # Splitting a newline-terminated file leaves one empty trailing entry.
    # Drop only that entry, so a file without a final newline keeps its last
    # string.
    if data and data[-1] == "":
        data.pop()
    return data

### Primer 4
U fajlu "test1.txt" se nalazi 200 slucajno generisanih stringova duzine 8

Ovi stringovi se sastoje iskljucivo od malih slova bez brojeva, npr. "dasdfbnm"

In [13]:
# Load the input strings, one per line, from the data file.
arr = LoadStringsFromTxt("data/test1.txt")

# Run the genetic algorithm with the module-level parameters.
solution, size = gp(arr, POPULATION_SIZE, NO_OF_ITERATIONS, ELITISM_SIZE, MUTATION_CHANCE, TOURNAMENT_SIZE)

# Only the length is reported; the call printing the superstring is disabled.
print_size(size)
# print_solution(solution)

Execution time: 5.208 seconds.
The size of the found shortest superstring is: 1485


### Primer 5
U fajlu "test2.txt" se nalazi 300 slucajno generisanih stringova duzine 32

Kao i do sad, ovi stringovi se iskljucivo sastoje od malih slova bez brojeva

In [14]:
# Load the input strings, one per line, from the data file.
arr = LoadStringsFromTxt("data/test2.txt")

# Run the genetic algorithm with the module-level parameters.
solution, size = gp(arr, POPULATION_SIZE, NO_OF_ITERATIONS, ELITISM_SIZE, MUTATION_CHANCE, TOURNAMENT_SIZE)

# Only the length is reported; the call printing the superstring is disabled.
print_size(size)
#print_solution(solution)

Execution time: 14.371 seconds.
The size of the found shortest superstring is: 9441


### Primer 6

U datoteci "DNA_Sequence5.txt" je dato 500 nasumicno generisanih niski duzine 10 koje se sastoje od malih slova a, c, t, g, kao u sekvenci DNK

In [15]:
# Load the input strings, one per line, from the data file.
arr = LoadStringsFromTxt("data/DNA_Sequence5.txt")

# Run the genetic algorithm with the module-level parameters.
solution, size = gp(arr, POPULATION_SIZE, NO_OF_ITERATIONS, ELITISM_SIZE, MUTATION_CHANCE, TOURNAMENT_SIZE)

# Only the length is reported; the call printing the superstring is disabled.
print_size(size)
#print_solution(solution)

Execution time: 21.489 seconds.
The size of the found shortest superstring is: 4060
