In [1]:
"""
Dependencies: gym, ale-py

pip install gym[atari,accept-rom-license]

"""

import gym
import random
import numpy as np

from copy import copy, deepcopy

from functools import cmp_to_key

import torch
from torch import nn
from torchvision import transforms

import pickle
import time

  import distutils.spawn


In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [16]:
# Preprocessing applied to every raw observation before it reaches the network:
# convert to a tensor, collapse to one grayscale channel, resize to 84x84
# (84 * 84 = 7056 values) and normalise with mean 0.5 / std 0.5.
transform_obs = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Grayscale(),
        transforms.Resize(size=(84, 84)),
        transforms.Normalize([0.5], [0.5]),
    ]
)

def obs_to_tensor(obs):
    """Turn one observation into a flat, batched tensor on ``device``.

    The observation is converted to a NumPy array, passed through
    ``transform_obs``, flattened to one dimension and given a leading
    dimension of size 1.
    """
    frame = transform_obs(np.array(obs))
    batched = torch.flatten(frame).unsqueeze(0)
    return batched.to(device)


def init_weights(m):
    """Randomly initialise one sub-module in place.

    Only ``nn.Linear`` layers are touched: their weight matrix gets a Xavier
    normal initialisation and every bias entry is set to 0.01. Any other
    module is left unchanged.
    """
    if not isinstance(m, nn.Linear):
        return

    torch.nn.init.xavier_normal_(m.weight)
    m.bias.data.fill_(0.01)
  
class Agent_MLP(nn.Module):
    """Three-layer MLP policy mapping a flattened 7056-value frame to one of 7 actions.

    The network is never trained by gradient descent in this notebook; its
    weight matrices are read and overwritten through ``getWeights`` /
    ``setWeights``. Biases are not exposed by those accessors.
    """

    # Dictionary key -> index of the corresponding nn.Linear inside self.model.
    _WEIGHT_LAYERS = {"l1": 0, "l2": 2, "l3": 4}

    def __init__(self):
        super().__init__()
        self.model = nn.Sequential(
            nn.Linear(7056, 1024),
            nn.ReLU(),
            nn.Linear(1024, 128),
            nn.ReLU(),
            nn.Linear(128, 7),
            # NOTE(review): a ReLU directly before the softmax clamps every
            # negative logit to 0, so ties resolve to the lowest action index.
            # Kept as-is to preserve the behaviour of existing agents.
            nn.ReLU(),
            # Explicit dim: the implicit choice is deprecated and emits a
            # UserWarning on every forward pass.
            nn.Softmax(dim=-1),
        )

        # Manually initialize random weights
        self.model.apply(init_weights)

    def forward(self, x):
        """Return the index of the highest-scoring action for input ``x`` as an int."""
        with torch.no_grad():
            scores = self.model(x)
        return int(np.argmax(scores.cpu().numpy()))

    def getWeights(self):
        """Return a dict ``{"l1", "l2", "l3"}`` of NumPy copies of the linear weight matrices.

        Copies are returned so that editing the arrays never silently changes
        the live parameters (``Tensor.numpy()`` shares memory on CPU).
        """
        return {
            name: self.model[index].weight.detach().cpu().numpy().copy()
            for name, index in self._WEIGHT_LAYERS.items()
        }

    def setWeights(self, weight_dict):
        """Replace the linear weight matrices with the arrays in ``weight_dict``.

        ``weight_dict`` must provide the keys ``"l1"``, ``"l2"`` and ``"l3"``.
        Each array is copied into a new float32 parameter with
        ``requires_grad=False`` placed on the device the layer already lives
        on, so the module never ends up split across devices.
        """
        for name, index in self._WEIGHT_LAYERS.items():
            layer = self.model[index]
            new_weight = torch.tensor(
                weight_dict[name], dtype=torch.float32, device=layer.weight.device
            )
            layer.weight = nn.Parameter(new_weight, requires_grad=False)


In [17]:
"""
Action space is of size 7:

0 NOOP
1 FIRE
2 UP
3 RIGHT
4 LEFT
5 RIGHTFIRE
6 LEFTFIRE
"""

"""
A class for each chromosome.
"""
class Chromosome:
    """One individual of the genetic algorithm: a policy network plus its fitness stats.

    Attributes:
        nn: the ``Agent_MLP`` whose weights are this chromosome's genes.
        env: the environment used for rollouts. ``env.reset()`` must return an
            observation and ``env.step(action)`` a 4-tuple
            ``(obs, reward, done, info)``.
        reward_sum: total (or mean, after ``update``) episode reward.
        time_alive: number (or mean number) of environment steps survived.
    """

    def __init__(self, *args):
        """Build a chromosome.

        ``*args`` is used since Python doesn't support multiple constructors:

        * ``Chromosome(env)`` creates a randomly initialised network and plays
          one episode to fill ``reward_sum`` / ``time_alive``. Used during
          population initialization.
        * ``Chromosome(weights, env)`` creates a network with the given weight
          dict (see ``Agent_MLP.setWeights``) and leaves both stats at 0.
          Used during crossover.

        Raises:
            TypeError: if called without arguments.
        """
        if not args:
            raise TypeError("Chromosome expects (env) or (weights, env)")

        self.nn = Agent_MLP()
        self.nn.to(device)
        self.nn.eval()

        self.reward_sum = 0
        self.time_alive = 0

        if len(args) > 1:
            self.nn.setWeights(args[0])
            self.env = args[1]
            return

        # Store the provided env and evaluate the random network once.
        self.env = args[0]
        self.reward_sum, self.time_alive = self._run_episode()

    def _run_episode(self):
        """Play one full episode with the current network; return (total_reward, steps)."""
        total_reward = 0
        steps = 0

        obs = self.env.reset()
        done = False

        while not done:
            action = self.nn(obs_to_tensor(obs))
            obs, reward, done, _ = self.env.step(action)

            steps += 1
            total_reward += reward

        return total_reward, steps

    def update(self, num=1):
        """Re-evaluate the chromosome over ``num`` episodes.

        ``reward_sum`` and ``time_alive`` are overwritten with the mean reward
        and mean step count across those episodes.

        Raises:
            ValueError: if ``num`` is smaller than 1 (a mean over zero
                episodes would be NaN).
        """
        if num < 1:
            raise ValueError("num must be at least 1")

        self.nn.eval()

        times = []
        scores = []

        for _ in range(num):
            episode_reward, episode_steps = self._run_episode()
            times.append(episode_steps)
            scores.append(episode_reward)

        self.reward_sum = np.mean(np.array(scores))
        self.time_alive = np.mean(np.array(times))

    def simulate(self):
        """Play one extra episode without touching the stored stats.

        Since the environment has a certain level of randomness, this can be
        used to gauge how representative the stored score is.

        Returns:
            ``(reward_sum, time_alive)`` of that single episode.
        """
        # Accumulate from zero for this episode only; the stored stats are
        # neither read nor modified.
        return self._run_episode()

    def mutate(self, mutation_rate, mutation_factor=0.01):
        """Randomly perturb the network weights.

        Every weight is selected independently with probability
        ``mutation_rate``. A selected weight ``w`` becomes
        ``w + mutation_factor * N(0, std)`` where ``std`` is the standard
        deviation of the weight matrix it belongs to, measured before mutation.
        """
        weight_dict = self.nn.getWeights()

        for key, val in weight_dict.items():
            std = np.std(val)

            # Vectorised replacement for a per-element Python loop.
            mask = np.random.random(val.shape) <= mutation_rate
            noise = np.random.normal(0, std, size=int(mask.sum()))

            mutated = val.copy()
            mutated[mask] = mutated[mask] + mutation_factor * noise
            weight_dict[key] = mutated

        self.nn.setWeights(weight_dict)





In [18]:
"""
GA and helper methods
"""

# Comparator for a Chromosome list
def chromosome_comparator(a, b):
    """Three-way comparison of two chromosomes by fitness.

    The higher ``reward_sum`` wins; on equal reward the larger ``time_alive``
    wins. Returns 1 if ``a`` ranks above ``b``, -1 if below, and 0 when both
    criteria are equal (a comparator must be symmetric on ties).
    """
    if a.reward_sum > b.reward_sum:
        return 1
    if a.reward_sum == b.reward_sum:
        if a.time_alive == b.time_alive:
            return 0
        return -1 if a.time_alive < b.time_alive else 1
    return -1

# Feed this to sorting function
compare_key = cmp_to_key(chromosome_comparator)

def create_initial_population(number, env):
    """Return a list of ``number`` freshly created chromosomes sharing ``env``."""
    return [Chromosome(env) for _ in range(0, number)]

def list_population(population):
    """Print score and time alive of every chromosome on a single line."""
    entries = [
        f"Chromosome {rank}: Score - {member.reward_sum}   Time: {member.time_alive} * "
        for rank, member in enumerate(population, start=1)
    ]

    print("".join(entries))


"""
Applies a fitness function and then ranks the population in descending order of best to worst.

In this case, the primary marker of fitness is the high score. In case of a tie, the secondary criterion is how long each agent has stayed alive.

Also returns the best chromosome found.
"""
def selection(population, eval_runs=5):
    """Evaluate, rank and select the parents of the next generation.

    Steps:
      1. Every chromosome is re-evaluated with ``chrom.update(eval_runs)``.
      2. The population is ranked best-first with ``compare_key``
         (score, then time alive).
      3. The top ``len(population) // 10`` chromosomes are kept unconditionally.
      4. Further chromosomes are drawn without replacement from the rest,
         with probability proportional to their score (scores <= 0 count
         as 1), until roughly half of the population is selected.

    The list passed in is not modified. The selected chromosomes are printed
    with ``list_population``.

    Args:
        population: non-empty list of chromosomes.
        eval_runs: number of episodes used to evaluate each chromosome.

    Returns:
        ``(selected_population, best_chromosome)`` where the selection is
        sorted best-first and ``best_chromosome`` is a deep copy of the
        top-ranked chromosome.

    Raises:
        ValueError: if ``population`` is empty.
    """
    if not population:
        raise ValueError("population must not be empty")

    # The first stage is the fitness function.
    for chrom in population:
        chrom.update(eval_runs)

    # Rank on a copy so the caller's list keeps its order and contents.
    ranked = sorted(population, key=compare_key, reverse=True)

    # Select best chromosome.
    best_chromosome = deepcopy(ranked[0])

    # Elites: the first tenth of the ranking, taken in rank order.
    elite_count = len(ranked) // 10
    selected_population = ranked[:elite_count]
    candidates = ranked[elite_count:]

    scores = np.array(
        [1 if chrom.reward_sum <= 0 else chrom.reward_sum for chrom in candidates],
        dtype=float,
    )

    # Select chromosomes probabilistically.
    num_sampled = len(candidates) // 2 - len(selected_population) // 2
    for _ in range(num_sampled):
        probs = scores / np.sum(scores)

        ind = np.random.choice(len(candidates), p=probs)
        selected_population.append(candidates.pop(ind))

        scores = np.delete(scores, ind)

    selected_population.sort(key=compare_key, reverse=True)
    list_population(selected_population)

    return selected_population, best_chromosome

"""
Performs crossover on a population and returns the new population.

Crossover is done by combining the weights of the parental nns.
"""
def crossover(population):
    """Breed a new population of ``2 * len(population)`` chromosomes.

    The first entry of ``population`` is carried over unchanged (same
    object). Every other child is built from two distinct, randomly chosen
    parents: for each row of each weight matrix a random crossover point is
    drawn; entries before it come from the father, entries from it onwards
    come from the mother. Children are created with the father's ``env``.

    Args:
        population: list of at least two chromosomes.

    Returns:
        The list of children, with the carried-over chromosome first.

    Raises:
        ValueError: if fewer than two parents are supplied (two distinct
            parents cannot be drawn).
    """
    if len(population) < 2:
        raise ValueError("crossover needs at least two parents")

    crossed_population = [population[0]]

    for _ in range(len(population) * 2 - 1):
        # Two distinct parent indices, uniformly at random.
        father_index, mother_index = random.sample(range(len(population)), 2)
        father = population[father_index]
        mother = population[mother_index]

        motherWeights = mother.nn.getWeights()
        fatherWeights = father.nn.getWeights()

        # Work on a copy so the father's own weights are never modified.
        newWeights = deepcopy(fatherWeights)

        for key, rows in newWeights.items():
            mother_rows = motherWeights[key]
            for k in range(len(rows)):
                # randint is inclusive: a point equal to the row length keeps
                # the whole row from the father.
                crossover_point = random.randint(0, len(rows[k]))
                rows[k][crossover_point:] = mother_rows[k][crossover_point:]

        crossed_population.append(Chromosome(newWeights, father.env))

    return crossed_population

"""
Mutate the population. Rate represents chance of mutation.
"""
def mutate_population(population, rate=0.001):
    """Mutate every chromosome except the first one with per-weight chance ``rate``."""
    members = iter(population)

    # Don't mutate the best chromosome retained from previous population
    next(members, None)

    for member in members:
        member.mutate(rate)

"""
Rerun the environment for all chromosomes and produce updated runs.
"""
def update_population(population, num=1):
    """Re-evaluate every chromosome except the first one.

    Args:
        population: iterable of chromosomes; the first entry is skipped.
        num: number of episodes passed to each ``chrom.update`` call
            (``update`` requires this argument).
    """
    for i, chrom in enumerate(population):

        # Don't update best chromosome, as there is no need
        if i == 0:
            continue

        chrom.update(num)

In [20]:
"""
GA loop
"""

env = gym.make('ALE/Assault-v5', full_action_space=False)

# Hyper-parameters of the run.
n_iter = 100
population_size = 20
mutation_rate = 0.002

# Best result seen across all generations.
best_alive = 0
best_score = 0
best_chromo = None

population = create_initial_population(population_size, env)

# Also used to give this run's checkpoint file a unique name.
start_time = time.time()

for i in range(0, n_iter):

    # Run the fitness function and determine if the best chromosome is better than the best one we've stored for all runs.
    selected_population, best_chromo_candidate = selection(population)
    longest_alive = best_chromo_candidate.time_alive
    max_score = best_chromo_candidate.reward_sum

    # The first generation always becomes the initial best; afterwards a
    # candidate must beat the stored score, or tie it while surviving longer.
    best = (
        i == 0
        or max_score > best_score
        or (max_score == best_score and longest_alive > best_alive)
    )

    if best:
        best_chromo = deepcopy(best_chromo_candidate)
        best_alive = best_chromo_candidate.time_alive
        best_score = best_chromo_candidate.reward_sum

        # Dump the new best chromosome; the context manager closes the file
        # even if pickling fails.
        with open(f'best_chromo_{start_time}.pickle', 'wb') as file:
            pickle.dump(best_chromo, file)

    # Print stats.
    print(f"Best Chromosome for Run {i + 1} -> Longest Alive: {longest_alive}    High Score: {max_score}")

    # Cross, Mutate, and prepare the new population for the next run.
    crossed_population = crossover(selected_population)
    mutate_population(crossed_population, mutation_rate)

    population = crossed_population

print(f"Best Chromosome -> Longest Alive: {best_alive}    High Score: {best_score}")

Chromosome 1: Score - 554.4   Time: 2323.8 * Chromosome 2: Score - 46.2   Time: 338.4 * Chromosome 3: Score - 21.0   Time: 347.2 * Chromosome 4: Score - 0.0   Time: 4027.6 * Chromosome 5: Score - 0.0   Time: 3656.4 * Chromosome 6: Score - 0.0   Time: 3406.8 * Chromosome 7: Score - 0.0   Time: 2888.4 * Chromosome 8: Score - 0.0   Time: 1742.8 * Chromosome 9: Score - 0.0   Time: 357.6 * Chromosome 10: Score - 0.0   Time: 336.2 * 
Best Chromosome for Run 1 -> Longest Alive: 2323.8    High Score: 554.4
Chromosome 1: Score - 525.0   Time: 2245.4 * Chromosome 2: Score - 432.6   Time: 605.8 * Chromosome 3: Score - 415.8   Time: 611.4 * Chromosome 4: Score - 407.4   Time: 593.4 * Chromosome 5: Score - 399.0   Time: 651.0 * Chromosome 6: Score - 0.0   Time: 3438.8 * Chromosome 7: Score - 0.0   Time: 3061.2 * Chromosome 8: Score - 0.0   Time: 2894.8 * Chromosome 9: Score - 0.0   Time: 519.8 * Chromosome 10: Score - 0.0   Time: 300.8 * 
Best Chromosome for Run 2 -> Longest Alive: 2245.4    High S

In [21]:
"""
This displays the game to evaluate our best chromosome.
"""
test_env = gym.make('ALE/Assault-v5', render_mode="human", full_action_space=False)

# NOTE: pickle.load can execute arbitrary code; only load checkpoint files
# produced by this notebook or another trusted source.
# The file is closed as soon as the chromosome is loaded, even on error.
with open("mlp_s684.pickle", 'rb') as file:
    best_chromo = pickle.load(file)

done = False
obs = test_env.reset()

time_alive = 0
reward_sum = 0

# Play one rendered episode with the loaded network.
while not done:

    action = best_chromo.nn(obs_to_tensor(obs))

    obs, reward, done, info = test_env.step(action)

    reward_sum = reward_sum + reward
    time_alive = time_alive + 1

print(f"Longest Alive: {time_alive}    High Score: {reward_sum}")

  input = module(input)


Longest Alive: 1735    High Score: 630.0
