In [8]:
"""
Dependencies: gym, ale-py, numpy, torch, torchvision

pip install gym[atari,accept-rom-license]

"""

import gym
import random
import numpy as np

from copy import copy, deepcopy

from functools import cmp_to_key

import torch
from torch import nn
from torchvision import transforms

import pickle
import time

In [9]:
# Run on the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [10]:
# Preprocessing applied to every raw frame, in order: convert to a tensor,
# reduce to one grey channel, resize to 84x84, then normalise with
# mean 0.5 / std 0.5.
# NOTE(review): presumably the input is an HxWx3 uint8 RGB frame - confirm
# against the environment's observation space.
transform_obs = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Grayscale(num_output_channels=1),
        transforms.Resize(size=(84, 84)),
        transforms.Normalize(mean=[0.5], std=[0.5]),
    ]
)

def obs_to_tensor(obs):
    """Turn one raw environment observation into a preprocessed tensor.

    The observation is converted to a NumPy array, run through
    `transform_obs`, moved to `device`, and finally has dimension 0 squeezed
    out (which only has an effect when that dimension has size 1).

    Args:
        obs: Raw observation as returned by the environment.

    Returns:
        The preprocessed frame as a tensor on `device`.
    """
    frame = np.array(obs)
    processed = transform_obs(frame)
    on_device = processed.to(device)
    return torch.squeeze(on_device, 0)

# Initialize weights to something random.
def init_weights_cnn(m):
    """Initialise a single layer; meant to be passed to `nn.Module.apply`.

    `nn.Linear` and `nn.Conv2d` layers get Xavier-uniform weights and a
    constant bias of 0.01. Any other module type is left untouched.

    Args:
        m: The module currently visited by `apply`.
    """
    if not isinstance(m, (nn.Linear, nn.Conv2d)):
        return

    nn.init.xavier_uniform_(m.weight)

    # Layers created with bias=False expose `bias` as None; skip them instead
    # of failing with an AttributeError.
    if m.bias is not None:
        nn.init.constant_(m.bias, 0.01)

# CNN used for determining actions.
class Agent_CNN(nn.Module):
  """Convolutional policy network that turns four stacked frames into an action.

  The layer sizes expect an input of shape (batch, 4, 84, 84): three
  convolutions reduce it to 64 x 7 x 7 = 3136 features, followed by two linear
  layers that end in 7 outputs (one per action).

  The network is never trained by gradient descent in this file; its weights
  are read and written as NumPy arrays through `getWeights` / `setWeights`.
  Only the weight tensors of the five weighted layers are exchanged that way;
  biases keep whatever value they were initialised with.
  """

  # Genome key -> position of the corresponding weighted layer in `self.model`.
  _WEIGHT_LAYERS = {"l1": 0, "l2": 2, "l3": 4, "l4": 7, "l5": 9}

  def __init__(self):
    super().__init__()
    self.model = nn.Sequential(
        # Block 1
        nn.Conv2d(in_channels=4, out_channels=32, kernel_size=(8,8), stride=(4), bias=True),
        nn.ReLU(),
        nn.Conv2d(in_channels=32, out_channels=64, kernel_size=(4,4), stride=(2), bias=True),
        nn.ReLU(),
        nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3,3), stride=(1), bias=True),
        nn.ReLU(),
        nn.Flatten(),
        nn.Linear(in_features=3136, out_features=512, bias=True),
        nn.ReLU(),
        nn.Linear(in_features=512, out_features=7, bias=True),
        # Negative scores are clipped to zero before the softmax. If no score
        # is positive every action ties and `forward` returns index 0.
        nn.ReLU(),
        # The input here is 2-D (batch, actions). Naming the dimension keeps the
        # previous behaviour and silences the implicit-dim deprecation warning.
        nn.Softmax(dim=1),
    )

    self.model.apply(init_weights_cnn)

  def forward(self, x):
    """Return the index of the highest-scoring action for `x`.

    The argmax is taken over the flattened network output, so the result is
    only a plain action index when `x` holds a single sample.

    Args:
        x: Input tensor of stacked frames, shape (1, 4, 84, 84).

    Returns:
        The selected action index as a NumPy integer.
    """
    with torch.no_grad():
      arr = self.model(x)
      arr = arr.cpu().detach().numpy()
      return np.argmax(arr)

  def getWeights(self):
    """Export the weights of the five weighted layers.

    Returns:
        Dict with keys "l1".."l5" mapping to NumPy arrays. The arrays are
        independent copies, so editing them never changes the network until
        `setWeights` is called.
    """
    return {
        # Without `.copy()` a CPU tensor and its NumPy view share memory.
        key: self.model[idx].weight.detach().cpu().numpy().copy()
        for key, idx in self._WEIGHT_LAYERS.items()
    }

  def setWeights(self, weight_dict):
    """Load weights previously produced by `getWeights`.

    Args:
        weight_dict: Dict with keys "l1".."l5" holding array-like weights of
            the matching layer shapes.

    Raises:
        KeyError: If one of the five keys is missing.
    """
    for key, idx in self._WEIGHT_LAYERS.items():
      # torch.tensor always copies, so the new parameter does not alias the
      # caller's array.
      self.model[idx].weight = nn.Parameter(
          torch.tensor(weight_dict[key], dtype=torch.float32, device=device),
          requires_grad=False,
      )

In [11]:
# Syntactic sugar for restacking frames
def restack(t2, t3, t4, obs):
    """Return the frame window shifted by one step.

    The three most recent frames move up one slot and `obs` becomes the newest
    frame. The arguments are returned unchanged, in the order given.

    Returns:
        The tuple (t2, t3, t4, obs).
    """
    window = (t2, t3, t4, obs)
    return window

"""
Action space is of size 7:

0 NOOP
1 FIRE
2 UP
3 RIGHT
4 LEFT
5 RIGHTFIRE
6 LEFTFIRE
"""

"""
A class for each chromosome.
"""
class Chromosome:
    """One individual of the genetic algorithm: a policy network plus its fitness.

    Python has no constructor overloading, so the number of positional
    arguments selects the construction mode:

    * ``Chromosome(env)`` creates a freshly initialised network and immediately
      plays one episode in ``env`` to obtain first statistics. Used when the
      initial population is built.
    * ``Chromosome(weights, env)`` creates a network, loads ``weights`` (a dict
      in the format of ``Agent_CNN.getWeights``) and plays no episode. Used by
      crossover.

    Attributes:
        nn: The `Agent_CNN` that chooses actions.
        env: The environment episodes are played in.
        reward_sum: Total reward of the last evaluation (mean over episodes
            after `update`).
        time_alive: Number of environment steps of the last evaluation (mean
            over episodes after `update`).
        updated: True once `update` has evaluated the current weights.
    """

    def __init__(self, *args):
        self.nn = Agent_CNN()
        self.nn.to(device)
        self.nn.eval()

        self.updated = False
        self.reward_sum = 0
        self.time_alive = 0

        if len(args) > 1:
            # (weights, env): adopt the supplied weights, evaluate later.
            self.nn.setWeights(args[0])
            self.env = args[1]
            return

        # (env,): evaluate the random network once right away.
        self.env = args[0]
        self.reward_sum, self.time_alive = self._run_episode()

    def _run_episode(self):
        """Play one episode with the current network.

        Written for the Gym API in which ``reset()`` returns only the
        observation and ``step()`` returns ``(obs, reward, done, info)``.

        Returns:
            Tuple ``(total_reward, steps)`` for the episode.
        """
        # The window starts with the first frame in all four slots. Frames are
        # never modified in place and torch.stack copies, so sharing is safe.
        frame1 = obs_to_tensor(self.env.reset())
        frame2 = frame3 = frame4 = frame1

        total_reward = 0
        steps = 0
        done = False

        while not done:
            stacked_obs = torch.unsqueeze(torch.stack([frame1, frame2, frame3, frame4]), 0)
            action = self.nn(stacked_obs)

            obs, reward, done, info = self.env.step(action)
            frame1, frame2, frame3, frame4 = restack(frame2, frame3, frame4, obs_to_tensor(obs))

            steps += 1
            total_reward += reward

        return total_reward, steps

    def update(self, num=5):
        """Re-evaluate the network and store the averaged statistics.

        Plays `num` episodes and stores the mean reward and mean step count.
        Does nothing when the chromosome is already marked as updated, so
        unchanged elites are not evaluated again.

        Args:
            num: Number of episodes to average over; must be at least 1.

        Raises:
            ValueError: If `num` is smaller than 1 (the mean of zero episodes
                would be NaN).
        """
        if self.updated:
            return

        if num < 1:
            raise ValueError("num must be at least 1")

        self.nn.eval()

        episodes = [self._run_episode() for _ in range(num)]

        self.reward_sum = np.mean([score for score, _ in episodes])
        self.time_alive = np.mean([steps for _, steps in episodes])

        self.updated = True

    def mutate(self, mutation_rate, mutation_factor=0.01):
        """Randomly perturb the network weights.

        For every exported weight array, each position ``[i, j]`` along the
        first two axes is mutated with probability `mutation_rate`. A mutation
        adds one sample from a normal distribution whose standard deviation is
        the standard deviation of that whole array. For convolution weights
        the same offset is added to every element of kernel ``[i, j]``.

        The chromosome is marked as not updated afterwards so the next
        `update` call re-evaluates the changed network.

        Args:
            mutation_rate: Probability of mutating each ``[i, j]`` position.
            mutation_factor: Currently unused; kept so existing callers that
                pass it keep working.
        """
        mutated = {}

        for key, val in self.nn.getWeights().items():
            val = np.asarray(val)
            gene_shape = val.shape[:2]
            std = np.std(val)

            hit = np.random.random(gene_shape) <= mutation_rate
            offsets = np.where(hit, np.random.normal(0, std, size=gene_shape), 0.0)

            # Add trailing singleton axes so one offset broadcasts over a kernel.
            offsets = offsets.reshape(gene_shape + (1,) * (val.ndim - len(gene_shape)))

            mutated[key] = (val + offsets).astype(val.dtype)

        self.nn.setWeights(mutated)

        # The stored fitness no longer describes these weights.
        self.updated = False





In [15]:
"""
GA and helper methods
"""

#Comparator for a Chromosome list
def chromosome_comparator(a, b):
    """Three-way comparison of two chromosomes by fitness.

    A higher `reward_sum` ranks higher. When the scores are equal, the
    chromosome with the smaller `time_alive` ranks higher. Chromosomes that
    agree on both values compare as equal, which keeps the comparison
    consistent in both directions and lets sorting stay stable for them.

    Args:
        a: First chromosome.
        b: Second chromosome.

    Returns:
        1 if `a` ranks higher than `b`, -1 if it ranks lower, 0 if they tie.
    """
    if a.reward_sum != b.reward_sum:
        return 1 if a.reward_sum > b.reward_sum else -1

    if a.time_alive != b.time_alive:
        return 1 if a.time_alive < b.time_alive else -1

    return 0

# Feed this to sorting function
compare_key = cmp_to_key(chromosome_comparator)

# Generates initial population
def create_initial_population(number, env):
    """Build the starting population.

    Args:
        number: How many chromosomes to create.
        env: Environment handed to every ``Chromosome(env)`` call; all
            chromosomes receive the same object.

    Returns:
        A list with `number` new chromosomes (empty when `number` is not
        positive).
    """
    return [Chromosome(env) for _ in range(number)]

def list_population(population):
    """Print the score and time of every chromosome on a single line.

    Entries are numbered from 1 in list order and each one ends with " * ".

    Args:
        population: Iterable of objects with `reward_sum` and `time_alive`.
    """
    entries = [
        f"Chromosome {rank}: Score - {chrom.reward_sum}   Time: {chrom.time_alive} * "
        for rank, chrom in enumerate(population, start=1)
    ]
    print("".join(entries))


def selection(population, num_evals=5):
    """Evaluate the population and pick the parents of the next generation.

    Steps:

    1. Every chromosome is evaluated with ``update(num_evals)``.
    2. The population is ranked best-first with `compare_key`: higher score
       first, and on equal scores the shorter `time_alive` first.
    3. The top tenth is kept unconditionally (elitism).
    4. Further chromosomes are drawn without replacement, with probability
       proportional to their score (scores <= 0 count as 1), until about half
       of the original population has been selected.

    `population` is modified in place: it is sorted and every selected
    chromosome is removed from it.

    Args:
        population: Non-empty list of chromosomes.
        num_evals: Episodes each chromosome is averaged over when evaluated.

    Returns:
        Tuple ``(selected_population, best_chromosome)``. The selection is
        sorted best-first; `best_chromosome` is a deep copy of the top-ranked
        chromosome.

    Raises:
        IndexError: If `population` is empty.
    """
    for chrom in population:
        chrom.update(num_evals)

    population.sort(key=compare_key, reverse=True)

    # Take the best from the ranking so it uses the same tie-break as the sort.
    best_chromosome = deepcopy(population[0])

    # Slice off the elites in one go. Popping index 0, 1, 2, ... from a list
    # that shrinks on every pop would take every second chromosome instead.
    elite_count = int(len(population) / 10)
    selected_population = population[:elite_count]
    del population[:elite_count]

    scores = np.array([1 if chrom.reward_sum <= 0 else chrom.reward_sum for chrom in population])

    # Select chromosomes probabilistically.
    draws = int(len(population) / 2) - int(len(selected_population) / 2)
    for _ in range(draws):
        probs = scores / np.sum(scores)

        ind = np.random.choice(np.arange(0, len(population)), size=None, replace=True, p=probs)
        selected_population.append(population.pop(ind))

        # Keep the score array aligned with the shrinking candidate list.
        scores = np.delete(scores, ind)

    selected_population.sort(key=compare_key, reverse=True)
    list_population(selected_population)

    return selected_population, best_chromosome

def crossover(population):
    """Breed a new population twice the size of the given parent list.

    The first tenth of the new population is filled with the leading
    chromosomes of `population`, unchanged (the same objects, not copies).
    The rest are children produced pairwise:

    * Father and mother are drawn with probability proportional to their score
      (scores <= 0 count as 1). A chromosome that has been used as a parent is
      excluded until fewer than two candidates remain, then all become
      eligible again.
    * For every weight array and every index ``k`` along its first axis a
      crossover point is drawn. The first child takes the father's values
      before that point and the mother's from it onwards; the second child is
      the mirror image.

    Args:
        population: List of parent chromosomes, expected best-first.

    Returns:
        List of ``2 * len(population)`` chromosomes; empty if `population` is
        empty.

    Raises:
        ValueError: If `population` holds exactly one chromosome, because two
            distinct parents are required.
    """
    n = len(population) * 2
    if n == 0:
        return []

    if len(population) < 2:
        raise ValueError("crossover needs at least two parents")

    scores = np.array([1 if chrom.reward_sum <= 0 else chrom.reward_sum for chrom in population])

    # Elites are carried over as they are.
    elite_count = int(n / 10)
    crossed_population = list(population[:elite_count])

    indices = np.arange(0, len(population))
    scores_new = scores.copy()

    while len(crossed_population) < n:

        # Fewer than two unused parents left: make everyone eligible again.
        if np.count_nonzero(scores_new) < 2:
            scores_new = scores.copy()

        father_index = np.random.choice(indices, size=None, replace=True, p=scores_new / np.sum(scores_new))
        scores_new[father_index] = 0

        # The father now has weight 0, so the mother is always a different one.
        mother_index = np.random.choice(indices, size=None, replace=True, p=scores_new / np.sum(scores_new))
        scores_new[mother_index] = 0

        father = population[father_index]
        mother = population[mother_index]

        fatherWeights = father.nn.getWeights()
        motherWeights = mother.nn.getWeights()

        # Each child starts as a copy of one parent and receives the tail of
        # the other, so the two children complement each other.
        newWeights = deepcopy(fatherWeights)
        newWeights2 = deepcopy(motherWeights)

        for key in newWeights:
            for k in range(0, len(newWeights[key])):
                # randint is inclusive on both ends; a point equal to the
                # length leaves this row uncrossed.
                crossover_point = random.randint(0, len(fatherWeights[key][k]))

                newWeights[key][k][crossover_point:] = motherWeights[key][k][crossover_point:]
                newWeights2[key][k][crossover_point:] = fatherWeights[key][k][crossover_point:]

        crossed_population.append(Chromosome(newWeights, father.env))
        crossed_population.append(Chromosome(newWeights2, father.env))

    # Children are added in pairs, so trim a possible surplus.
    return crossed_population[0:n]

def mutate_population(population, rate=0.001):
    """Mutate every chromosome except the leading tenth of the list.

    The first ``int(len(population) / 10)`` entries are the best chromosomes
    retained from the previous population and are left untouched. Every other
    chromosome has ``mutate(rate)`` called on it.

    Args:
        population: List of chromosomes, retained best ones first.
        rate: Chance of mutation passed on to each chromosome.
    """
    protected = int(len(population) / 10)

    for chrom in population[protected:]:
        chrom.mutate(rate)

def update_population(population, num=5):
    """Re-run the environment for every chromosome to refresh its statistics.

    Args:
        population: Iterable of chromosomes.
        num: Number of episodes handed to each chromosome's ``update`` call.
            ``update`` needs this value, so it is always passed explicitly.
    """
    for chrom in population:
        chrom.update(num)

In [16]:
"""
GA loop
"""

env = gym.make('ALE/Assault-v5', full_action_space=False)

n_iter = 100
population_size = 200
mutation_rate = 0.002

# Best result seen over all generations. On equal scores the shorter run wins.
best_alive = 10000000
best_score = 0
best_chromo = None

population = create_initial_population(population_size, env)

# Also gives this run's checkpoint file a unique name.
start_time = time.time()

for i in range(0, n_iter):

    # Run the fitness function and determine if the best chromosome is better than the best one we've stored for all runs.
    selected_population, best_chromo_candidate = selection(population)
    longest_alive = best_chromo_candidate.time_alive
    max_score = best_chromo_candidate.reward_sum

    # The first generation always becomes the reference, whatever its score.
    best = (
        i == 0
        or max_score > best_score
        or (max_score == best_score and longest_alive < best_alive)
    )

    if best:
        # selection() already returns an independent deep copy, so it can be
        # stored directly.
        best_chromo = best_chromo_candidate
        best_alive = longest_alive
        best_score = max_score

        # Checkpoint the new best; the context manager closes the file even if
        # pickling fails.
        with open(f'best_chromo_{start_time}.pickle', 'wb') as file:
            pickle.dump(best_chromo, file)

    # Print stats.
    print(f"Best Chromosome for Run {i + 1} -> Time Alive: {longest_alive}    High Score: {max_score}")

    # Cross, Mutate, and prepare the new population for the next run.
    crossed_population = crossover(selected_population)
    mutate_population(crossed_population, mutation_rate)

    population = crossed_population

print(f"Best Chromosome -> Time Alive: {best_alive}    High Score: {best_score}")

  input = module(input)


Chromosome 1: Score - 483.0   Time: 673.6 * Chromosome 2: Score - 474.6   Time: 624.6 * Chromosome 3: Score - 453.6   Time: 619.0 * Chromosome 4: Score - 453.6   Time: 639.4 * Chromosome 5: Score - 449.4   Time: 858.2 * Chromosome 6: Score - 445.2   Time: 641.4 * Chromosome 7: Score - 445.2   Time: 1276.8 * Chromosome 8: Score - 428.4   Time: 592.4 * Chromosome 9: Score - 428.4   Time: 713.8 * Chromosome 10: Score - 424.2   Time: 635.0 * Chromosome 11: Score - 424.2   Time: 648.6 * Chromosome 12: Score - 420.0   Time: 593.4 * Chromosome 13: Score - 420.0   Time: 607.8 * Chromosome 14: Score - 399.0   Time: 570.4 * Chromosome 15: Score - 399.0   Time: 583.8 * Chromosome 16: Score - 399.0   Time: 2385.6 * Chromosome 17: Score - 378.0   Time: 586.8 * Chromosome 18: Score - 373.8   Time: 573.6 * Chromosome 19: Score - 361.2   Time: 582.2 * Chromosome 20: Score - 352.8   Time: 550.4 * Chromosome 21: Score - 340.2   Time: 481.2 * Chromosome 22: Score - 340.2   Time: 2105.0 * Chromosome 23: S

In [17]:
"""
This displays the game to evaluate our best chromosome.
"""
test_env = gym.make('ALE/Assault-v5', render_mode="human", full_action_space=False)

# NOTE(review): pickle.load can execute arbitrary code. Only load checkpoint
# files you created yourself. The classes of the pickled object must already
# be defined in this kernel.
with open("cnn_s722.pickle", 'rb') as file:
    best_chromo = pickle.load(file)

time_alive = 0
reward_sum = 0

try:
    done = False
    obs = test_env.reset()

    # The frame window starts with the first frame in all four slots.
    frame1 = obs_to_tensor(obs)
    frame2 = frame1.clone()
    frame3 = frame1.clone()
    frame4 = frame1.clone()

    while not done:

        stacked_obs = torch.unsqueeze(torch.stack([frame1, frame2, frame3, frame4]), 0)

        action = best_chromo.nn(stacked_obs)

        obs, reward, done, info = test_env.step(action)

        frame1, frame2, frame3, frame4 = restack(frame2, frame3, frame4, obs_to_tensor(obs))

        reward_sum = reward_sum + reward
        time_alive = time_alive + 1
finally:
    # Release the rendering window even when the episode is interrupted.
    test_env.close()

print(f"Longest Alive: {time_alive}    High Score: {reward_sum}")

Longest Alive: 1489    High Score: 714.0
