In [1]:
import gym
import numpy as np
import time

from fastprogress.fastprogress import master_bar, progress_bar

In [2]:
env = gym.make('FrozenLake8x8-v0')

In [3]:
seed = 0
env.seed(seed)
np.random.seed(seed)

In [4]:
state_dim = env.observation_space.n
action_dim = env.action_space.n
state_dim, action_dim

(64, 4)

In [5]:
env.reset()
env.render()


[41mS[0mFFFFFFF
FFFFFFFF
FFFHFFFF
FFFFFHFF
FFFHFFFF
FHHFFFHF
FHFFHFHF
FFFHFFFG


In [6]:
dna_size = 64
pop_size = 1500
mutation_rate = 0.4
cross_rate = 0.5
generations = 300

In [7]:
def evaluate_policy(env, policy, times=100):
    total_rewards = 0.0
    for _ in range(times):
        obs = env.reset()
        while True:
            action = policy[obs]
            obs, reward, done, _ = env.step(action)
            total_rewards += reward
            if done:
                break
    return total_rewards / times

In [8]:
def select(pop, ranks):
    return [pop[x] for x in ranks[:keeps]]


def crossover(policy1, policy2):
    new_policy = policy1.copy()
    rand = np.random.uniform(size=dna_size) < cross_rate
    new_policy[rand] = policy2[rand]
    return new_policy


def mutation(policy):
    new_policy = policy.copy()
    rand = np.random.uniform(size=dna_size) < mutation_rate
    new_policy[rand] = np.random.choice(4)
    return new_policy

In [9]:
keeps = 25

best_score = 0.0
policy_pop = np.random.choice(4, size=(pop_size, dna_size))

In [10]:
mb = master_bar(range(generations))
mb.write(('generation', 'score', 'time'), table=True)
for generation in mb:
    progress_bar(np.array([0]), total=1, parent=mb)
    start_time = time.time()

    # step decay mutation rate
    if generation % 10 == 0:
        if mutation_rate > 0.001:
            mutation_rate /= 2
            mutation_rate = max(mutation_rate, 0.005)

    # get_fitness
    trn_tqdm = progress_bar(policy_pop, total=int(len(policy_pop)), parent=mb)
    policy_scores = [evaluate_policy(env, policy) for policy in trn_tqdm]
    policy_ranks = np.argsort(policy_scores)[::-1]
    score = np.mean(np.array(policy_scores)[policy_ranks[:50]])

    # checkpoint
    if score > best_score:
        print(
            f'generation: {generation+1}, '
            f'improve score from {best_score:.4f} to {score:.4f}, '
            'save as checkpoint3.npy'
        )
        np.save(f'checkpoint3', policy_pop)
        best_score = score

    # keeps
    elite_set = select(policy_pop, policy_ranks)

    # select
    select_probs = np.array(policy_scores) / np.sum(policy_scores)

    # crossover
    child_set = [
        crossover(policy_pop[np.random.choice(pop_size, p=select_probs)],
                  policy_pop[np.random.choice(pop_size, p=select_probs)])
        for _ in range(pop_size - keeps)
    ]
    keeps += 1

    # mutation
    mutated_list = [mutation(c) for c in child_set]

    # sum
    policy_pop = np.array(elite_set + mutated_list)

    # calculate time
    end_time = time.time()
    m, s = divmod(end_time - start_time, 60)

    # print
    mb.write((f'{generation+1}', f'{score:.4f}', f'{m:.0f}:{s:.0f}'),
             table=True)
env.close()

generation,score,time
1,0.0436,0:38
2,0.1676,0:45
3,0.3256,0:46
4,0.457,0:48
5,0.558,0:52
6,0.622,0:52
7,0.6636,0:54
8,0.7088,0:54
9,0.738,0:55
10,0.7666,0:57


generation: 1, improve score from 0.0000 to 0.0436, save as checkpoint3.npy
generation: 2, improve score from 0.0436 to 0.1676, save as checkpoint3.npy
generation: 3, improve score from 0.1676 to 0.3256, save as checkpoint3.npy
generation: 4, improve score from 0.3256 to 0.4570, save as checkpoint3.npy
generation: 5, improve score from 0.4570 to 0.5580, save as checkpoint3.npy
generation: 6, improve score from 0.5580 to 0.6220, save as checkpoint3.npy
generation: 7, improve score from 0.6220 to 0.6636, save as checkpoint3.npy
generation: 8, improve score from 0.6636 to 0.7088, save as checkpoint3.npy
generation: 9, improve score from 0.7088 to 0.7380, save as checkpoint3.npy
generation: 10, improve score from 0.7380 to 0.7666, save as checkpoint3.npy
generation: 11, improve score from 0.7666 to 0.7774, save as checkpoint3.npy
generation: 12, improve score from 0.7774 to 0.7978, save as checkpoint3.npy
generation: 13, improve score from 0.7978 to 0.8138, save as checkpoint3.npy
generati

In [11]:
policy_pop = np.load('checkpoint3.npy')

In [12]:
policy_pop = np.unique(policy_pop, axis=0);policy_pop.shape

(1500, 64)

In [13]:
test_tqdm = progress_bar(policy_pop, total=int(len(policy_pop)))
policy_score = [evaluate_policy(env, pp, times=100) for pp in test_tqdm]
index = np.argmax(policy_score)
best_policy = policy_pop[index]
print(f'index: {index}')
print('Best actions score =', (np.max(policy_score)), '\n'
      'best actions =\n', best_policy)

index: 289
Best actions score = 0.97 
best actions =
 [3 2 2 1 2 2 2 2 3 3 3 3 3 3 3 2 3 3 2 1 2 3 2 2 0 1 2 0 0 0 2 2 2 2 3 2 3
 2 3 2 3 0 1 2 2 2 3 2 1 0 1 2 3 1 1 2 2 1 3 2 1 0 2 1]


In [14]:
index = np.argsort(policy_score)[-200:][::-1]
for i in index[:10]:
    print(i, policy_score[i])

1065 0.97
289 0.97
1253 0.97
687 0.97
63 0.96
489 0.96
853 0.96
484 0.96
891 0.95
333 0.95


In [15]:
# Testing: Calculating the average reward of 1000 eposides
test_episodes = 1000  # DON'T CHANGE THIS VALUE
tests = 100
best_score = 0.0

rewards_list = []

mb = master_bar(range(tests))
for test in mb:
    
    index_tqdm = progress_bar(index, total=int(len(index)), parent=mb)
    for idx in index_tqdm:        
        steps = 0
        total_reward = 0
        
        for i in range(test_episodes):
            
            state = env.reset()
            done = False
            while not done:
                action = policy_pop[idx][state]
                next_state, reward, done, _ = env.step(action)
                state = next_state
                steps = steps + 1
                total_reward = total_reward + reward

        total_avg_reward = total_reward / test_episodes
        total_avg_step = steps / test_episodes
        rewards_list.append(total_avg_reward)
        if total_avg_reward > best_score:
            print("The average results of {} episodes are steps: {}, reward: {}".format(
                test_episodes, total_avg_step, total_avg_reward))
            best_score = total_avg_reward
    
total_avg_reward = max(rewards_list)
total_avg_reward

The average results of 1000 episodes are steps: 110.418, reward: 0.897
The average results of 1000 episodes are steps: 107.621, reward: 0.9
The average results of 1000 episodes are steps: 108.164, reward: 0.905
The average results of 1000 episodes are steps: 107.537, reward: 0.909
The average results of 1000 episodes are steps: 107.197, reward: 0.91
The average results of 1000 episodes are steps: 108.402, reward: 0.912
The average results of 1000 episodes are steps: 106.493, reward: 0.913
The average results of 1000 episodes are steps: 105.353, reward: 0.917
The average results of 1000 episodes are steps: 106.63, reward: 0.921
The average results of 1000 episodes are steps: 105.029, reward: 0.924


0.924

In [16]:
total_avg_reward = max(rewards_list)
total_avg_reward

0.924

In [17]:
# Print results in CSV format and upload to Kaggle
with open('rewards.csv', 'w') as f:
    f.write('Id,Predicted\n')
    f.write('FrozenLake8x8_public,{}\n'.format(total_avg_reward))
    f.write('FrozenLake8x8_private,{}\n'.format(total_avg_reward))