In [None]:
import csv
import os
import random
import time
from datetime import datetime

import numpy as np
from PIL import Image

from convert import images_to_video
from convert import state_list_to_image


In [None]:
# Path to the map image. It is opened and converted to a greyscale array to
# build the environment, and is also handed to the trajectory renderer.
map_filepath = '../image/map.png'

In [None]:
class MyAction:
    """Discrete action space made of ``n`` actions numbered ``0 .. n - 1``."""

    def __init__(self, n):
        # Number of distinct actions available to the agent.
        self.n = n

    def sample(self):
        """Return a random action index between 0 and ``n - 1`` (inclusive)."""
        highest_action = self.n - 1
        return random.randint(0, highest_action)


class MyObservation:
    """Observation space descriptor that only records its size ``n``."""

    def __init__(self, n):
        # Total number of distinct observations (states).
        self.n = n


class MyEnv:
    """Grid-world environment whose rewards are derived from a 2-D map array.

    The agent starts on ``START_CELL`` and moves one cell per step with the
    actions 0 = up, 1 = down, 2 = left, 3 = right. States are reported as a
    single integer, ``row * number_of_columns + column``.

    Parameters
    ----------
    map : 2-D numpy array
        Cell values of the map. A value of ``WALKABLE_VALUE`` marks a cell
        that yields a positive reward; every other value is penalised.
    """

    # (row, col) the agent is placed on by reset().
    START_CELL = (22, 36)
    # An episode is terminated once this many steps have been taken.
    MAX_STEPS = 35
    # Map value identifying a walkable cell.
    WALKABLE_VALUE = 255
    # Reward for stepping on a non-walkable cell.
    BLOCKED_REWARD = -64
    # Reward of walkable cells outside the rectangle rows 22..32, cols >= 26.
    # Walkable cells inside that rectangle keep their raw map value (255).
    OUTER_REWARD = 128
    # (row delta, col delta) for each action: up, down, left, right.
    _MOVES = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

    def __init__(self, map):
        # Up, down, left, right.
        self.action_space = MyAction(4)
        self.map = map
        # One state per cell, numbered row by row:
        # first row  -> 0 .. cols-1, second row -> cols .. 2*cols-1, ...
        self.observation_space = MyObservation(map.shape[0]*map.shape[1])
        self.reset()

    def _encode_state(self):
        """Return the integer index of the agent's current cell.

        The row stride is the width of this environment's own map, so the
        index always stays below ``observation_space.n``.
        NOTE(review): for a 50-column map this equals the former
        ``row * 50 + col``; any external decoder that assumes a stride of 50
        still requires a 50-column map - confirm against the helpers.
        """
        return self.current_state[0] * self.map.shape[1] + self.current_state[1]

    def reset(self):
        """Put the agent back on the start cell and rebuild the reward table.

        The table must be rebuilt because step() halves positive rewards as
        cells are visited. Returns nothing; callers read ``init_state``.
        """
        self.current_step = 0
        self.current_state = list(self.START_CELL)
        self.init_state = self._encode_state()

        # Work on this environment's own map (not a notebook global), as a
        # float copy so that rewards can later be halved.
        n_rows, n_cols = self.map.shape
        table = np.array(self.map, dtype=np.float64)
        walkable = table == self.WALKABLE_VALUE
        row_idx = np.arange(n_rows)[:, None]
        col_idx = np.arange(n_cols)[None, :]
        outer_zone = (row_idx < 22) | (row_idx > 32) | (col_idx < 26)
        table[~walkable] = self.BLOCKED_REWARD
        table[walkable & outer_zone] = self.OUTER_REWARD
        # Standing on the start cell is never rewarded.
        table[self.current_state[0], self.current_state[1]] = 0
        self.reward_table = table

        self.current_reward = 0
        self.done = False

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward, done, info)``.

        Moving off the map keeps the agent on the border cell and ends the
        episode. A reward greater than 1 is halved in the table after being
        collected. The episode also ends after ``MAX_STEPS`` steps. ``info``
        is always ``None``. An unknown action leaves the position unchanged.
        """
        self.current_step += 1
        d_row, d_col = self._MOVES.get(action, (0, 0))
        row = self.current_state[0] + d_row
        col = self.current_state[1] + d_col

        # Clamp to the map; leaving the map terminates the episode.
        n_rows, n_cols = self.map.shape
        clamped_row = min(max(row, 0), n_rows - 1)
        clamped_col = min(max(col, 0), n_cols - 1)
        if (clamped_row, clamped_col) != (row, col):
            self.done = True
        self.current_state[0] = clamped_row
        self.current_state[1] = clamped_col

        reward = self.reward_table[clamped_row, clamped_col]
        self.current_reward = reward
        # Diminishing returns: revisiting a rewarding cell pays half each time.
        if reward > 1:
            self.reward_table[clamped_row, clamped_col] = reward / 2

        if self.current_step >= self.MAX_STEPS:
            self.done = True
        return self._encode_state(), reward, self.done, None


# Load the map as 8-bit greyscale and invert it (255 - value), so that pixels
# that are black in the image become 255 in the array.
greyscale_map = Image.open(map_filepath).convert('L')
map = 255 - np.array(greyscale_map)

# Build the environment on the inverted map.
env = MyEnv(map)

# Spot check of a single cell of the inverted map.
print(map[13, 26])
env.reset()

In [None]:
# Sizes of the action and state spaces, taken from the environment.
actions = env.action_space.n
states = env.observation_space.n

# Training schedule.
eposides = 5000000      # number of training episodes
save_interval = 10000   # save the Q-table and run a greedy test every N episodes

# Q-learning hyper-parameters.
epsilon = 0.2   # probability of taking a random (exploratory) action
gamma = 0.9     # discount factor
alpha = 0.01    # learning rate

# Output locations. The result file name records the hyper-parameters used.
filename = 'rewards_%s_%s_%s_%s.csv' %(eposides, epsilon, gamma, alpha)
outputDir = '../output/'
# Checkpoints and test trajectories are written here from the very first
# training episode, so the directory has to exist before training starts.
os.makedirs(outputDir, exist_ok=True)

# Create Q table with all rewards = 0 (one row per state, one column per action).
q_table = np.zeros((states, actions))
# To resume from a checkpoint instead, load a table written by save_q_table,
# e.g. q_table = np.load(outputDir + 'q_table_10000.npy')



In [None]:
def save_q_table(epoch):
    """Write the global ``q_table`` to ``outputDir`` as ``q_table_<epoch>``.

    ``np.save`` appends the ``.npy`` extension to the given file name.
    """
    checkpoint_path = outputDir + 'q_table_{}'.format(epoch)
    np.save(checkpoint_path, q_table)

def run_test(epoch):
    """Run one greedy episode with the current Q-table and export the path.

    The environment is reset, the agent always takes the highest-valued
    action, and the visited state indices are written as a single CSV row to
    ``outputDir``. The file name carries ``epoch``, the total reward and the
    number of steps. The trajectory is then rendered to images and a video
    by the ``convert`` helpers, and a one-line summary is printed.

    Parameters
    ----------
    epoch : int
        Training episode number used to tag the output files.
    """
    env.reset()
    done = False
    test_state = env.init_state
    state_list = []
    steps = 0
    total_reward = 0
    while not done:
        state_list.append(test_state)
        action = np.argmax(q_table[test_state,:])
        test_state, reward, done, _ = env.step(action)
        steps += 1
        total_reward += reward

    csv_path = outputDir+'state_list_{}_{:.0f}_{}.csv'.format(epoch, total_reward, steps)
    # newline='' is required by the csv module; without it every row ends in
    # '\r\r\n' on platforms that translate '\n' on write.
    with open(csv_path, 'w', newline='') as output_state_list:
        output_csv = csv.writer(output_state_list)
        output_csv.writerow(state_list)

    # NOTE(review): the helpers receive '../output' rather than outputDir;
    # presumably they locate the CSV written above from epoch/reward/steps.
    # Keep both locations in sync - confirm against the convert package.
    state_list_to_image.state_list_to_image(epoch, total_reward, steps, map_filepath, '../output')
    images_to_video.images_to_video(epoch, total_reward, steps, 10, (640, 640), '../output')
    print('Test Epoch {}, Total reward {:.0f}, steps {}'.format(epoch, total_reward, steps))

In [None]:
# Training: tabular Q-learning with an epsilon-greedy behaviour policy.
# Statistics are reported 100 times over the run; max(1, ...) keeps the
# interval valid (and avoids a division by zero) for fewer than 100 episodes.
print_interval = max(1, eposides // 100)
avg_s = 0
avg_tot_r = 0
# Start below any possible return so that all-negative intervals are reported
# correctly instead of showing 0.
max_to_r = float('-inf')
# Wall-clock start of the current reporting interval.
start_timestamp = time.time()
for epoch in range(1, eposides+1):
    env.reset()
    done = False
    state = env.init_state
    steps = 0
    total_reward = 0
    while not done:
        # epsilon-greedy
        if random.random() < epsilon:
            action = env.action_space.sample() # Explore
        else:
            action = np.argmax(q_table[state,:]) # Exploit

        # Move one step
        next_state, reward, done, _ = env.step(action)

        # Update Q table: move Q(s, a) towards the one-step TD target
        # reward + gamma * max_a' Q(s', a') by a fraction alpha.
        td_target = reward + gamma*np.max(q_table[next_state, :])
        q_table[state, action] += alpha*(td_target - q_table[state, action])
        state = next_state

        # Update statistics
        steps += 1
        total_reward += reward

    # Running averages / maximum over the current reporting interval.
    avg_s += steps / print_interval
    avg_tot_r += total_reward / print_interval
    max_to_r = max(max_to_r, total_reward)

    if epoch % print_interval == 0:
        end_timestamp = time.time()
        print("Episode {}, avg_s {:.3f}, avg_tot_r {:.3f}, max_tot_r {:.3f}, elapsed {:.3f} s".format(epoch, avg_s, avg_tot_r, max_to_r, end_timestamp-start_timestamp))
        avg_s = 0
        avg_tot_r = 0
        max_to_r = float('-inf')
        # Restart the clock so "elapsed" covers the whole next interval.
        start_timestamp = time.time()

    # Checkpoint and run a greedy test after the first episode and then
    # every save_interval episodes.
    if epoch == 1 or epoch % save_interval == 0:
        save_q_table(epoch)
        run_test(epoch)


In [None]:
q_table # Display the Q table after learning (one row per state, one column per action)

In [None]:
# Testing: calculate the average steps and reward of 1000 greedy episodes
test_episodes = 1000 # DON'T CHANGE THIS VALUE
test_steps = 0
test_total_reward = 0
for i in range(test_episodes):
    env.reset()
    done = False
    test_state = env.init_state
    while not done:
        # Always follow the highest-valued action (no exploration).
        action = np.argmax(q_table[test_state,:])
        test_state, reward, done, _ = env.step(action)
        test_steps += 1
        test_total_reward += reward


# Report the totals accumulated in THIS cell; `steps` and `total_reward`
# belong to the last training episode and must not be used here.
print("The average results of {} episodes are steps {}, reward {}".format(test_episodes, test_steps/test_episodes, test_total_reward/test_episodes))

In [None]:
# Average reward per test episode. This uses the total accumulated by the
# evaluation loop, not `total_reward`, which is the return of the last
# training episode.
total_avg_reward = test_total_reward/test_episodes
# Print results in CSV format and upload to Kaggle
os.makedirs(outputDir, exist_ok=True)
with open(outputDir + filename, 'w') as f:
    f.write('Id,Predicted\n')
    f.write('FrozenLake8x8_public,{}\n'.format(total_avg_reward))
    f.write('FrozenLake8x8_private,{}\n'.format(total_avg_reward))