### Mountain Car

1. 문제: 골짜기(위치 -0.6 ~ -0.4)에서 출발한 자동차가 가속을 조절해 오른쪽 언덕 꼭대기(위치 0.5)에 도달하도록 학습시키는 문제
2. 조건
    - state: 자동차의 위치($x_t$)와 속도($\dot{x}_t$)
    - action: 앞으로 가속(+1), 뒤로 가속(-1), 가속 없음(0)
    - reward: 모든 time step(-1), 꼭대기에 도달 시(0)
    - terminal: 꼭대기
    - 고려 사항: 
        - 왼쪽 끝에 도달 시 속도가 0으로 reset 됨. 
        - 속도는 $[-0.07, 0.07]$ 범위로, 위치는 $[-1.2, 0.5]$ 범위로 제한됨.
3. 학습 목표
    - Episodic Semi-gradient Sarsa

In [1]:
import numpy as np
from math import cos
from random import uniform
from TileCoding import *

In [2]:
# Throttle commands: accelerate backward, coast, accelerate forward.
ACTION_REVERSE, ACTION_ZERO, ACTION_FORWARD = -1, 0, 1
ACTIONS = [ACTION_REVERSE, ACTION_ZERO, ACTION_FORWARD]

# Inclusive limits that position and velocity are clamped to.
POSITION_MIN, POSITION_MAX = -1.2, 0.5
VELOCITY_MIN, VELOCITY_MAX = -0.07, 0.07

In [56]:
class MountainCar:
    """Mountain Car environment observed as a (position, velocity) pair.

    Each step is rewarded with -1, except the step that lands on
    ``POSITION_MAX``, which is rewarded with 0 and ends the episode.
    """

    def __init__(self):
        # Start an episode right away so the instance is immediately usable.
        self.reset()

    def reset(self):
        """Begin a new episode and return the initial (position, velocity).

        The position is drawn uniformly between -0.6 and -0.4; the velocity
        starts at 0.
        """
        self.velocity = 0
        self.state = uniform(-0.6, -0.4)
        return (self.state, self.velocity)

    def step(self, action):
        """Apply ``action`` for one time step.

        Args:
            action: acceleration command multiplied into the velocity update
                (the notebook uses -1, 0 and 1).

        Returns:
            ``((position, velocity), reward, done)`` after the move.
        """
        # The velocity update uses the position *before* the move, then is
        # clamped to the allowed velocity range.
        velocity = self.velocity + 0.001 * action - 0.0025 * cos(3 * self.state)
        velocity = max(min(velocity, VELOCITY_MAX), VELOCITY_MIN)

        # Move, then clamp the position to the track limits.
        position = self.state + velocity
        position = max(min(position, POSITION_MAX), POSITION_MIN)

        reward, done = -1, False
        if position == POSITION_MIN:
            # Hitting the left wall stops the car.
            velocity = 0
        elif position == POSITION_MAX:
            # Reaching the right edge is the goal.
            reward, done = 0, True

        self.state, self.velocity = position, velocity
        return (position, velocity), reward, done

In [87]:
# Build the environment; the constructor calls reset(), so an episode is already started.
env = MountainCar()

In [88]:
# Start a new episode; returns the initial (position, velocity) pair.
env.reset()

(-0.4711940734531411, 0)

In [130]:
# Take one step with action 1 (forward) and keep only the (position, velocity) observation.
state, velo = env.step(1)[0]
# Rescale both coordinates before tile coding; position_scale and velocity_scale
# are assigned in a separate cell of this notebook, which must have been run first.
state, velo = state * position_scale, velo * velocity_scale

In [90]:
# Index hash table from the TileCoding module, created with size 2048.
hash_table = IHT(2048)

In [131]:
# Ask for tile indices from 8 tilings for the scaled (position, velocity),
# with action 1 passed as the integer coordinate.
tiles(hash_table, 8, [state, velo], [1])

[8, 9, 10, 11, 12, 13, 14, 15]

In [132]:
# Inspect the coordinate-tuple -> index mapping held by the hash table
# (presumably filled in by the tiles() calls made so far — verify in TileCoding).
hash_table.dictionary

{(0, -3, -1, -1): 0,
 (0, -3, -1, 0): 16,
 (0, -3, -1, 1): 8,
 (0, -3, 0, 0): 25,
 (0, -3, 0, 1): 24,
 (1, -3, 0, -1): 1,
 (1, -3, 0, 0): 17,
 (1, -3, 0, 1): 9,
 (2, -2, 0, -1): 2,
 (2, -2, 0, 0): 18,
 (2, -2, 0, 1): 10,
 (3, -2, 1, -1): 3,
 (3, -2, 1, 0): 19,
 (3, -2, 1, 1): 11,
 (4, -2, 1, -1): 4,
 (4, -2, 1, 0): 20,
 (4, -2, 1, 1): 12,
 (5, -2, 1, -1): 5,
 (5, -2, 1, 0): 21,
 (5, -2, 1, 1): 13,
 (6, -2, 2, -1): 6,
 (6, -2, 2, 0): 22,
 (6, -2, 2, 1): 14,
 (7, -2, 2, -1): 7,
 (7, -2, 2, 0): 23,
 (7, -2, 2, 1): 15}

In [43]:
# Scale factors that stretch the full position / velocity range to a width of 8
# (8 is also the number of tilings passed to tiles() in this notebook).
position_scale = 8 / (POSITION_MAX - POSITION_MIN)
velocity_scale = 8 / (VELOCITY_MAX - VELOCITY_MIN)

In [None]:
# NOTE(review): unexecuted scratch call with a single float coordinate and no
# action argument — looks unfinished; confirm intent or remove.
tiles(hash_table, 8, [0, ])

In [None]:
class ValueFunction:
    """Linear action-value estimate over tile-coded (position, velocity, action).

    The tile-coding software (``IHT`` / ``tiles``) is used instead of a
    hand-written tiling; see
    http://incompleteideas.net/sutton/tiles/tiles3.html for details.
    Tiling is only a map from (state, action) to a series of indices. It does
    not matter whether the indices have meaning, only that the map satisfies
    the required properties.

    Author's note (translated): ``max_size`` = 2048 means there are 2048
    weights in total over all tiles, and with 8 tilings the space is split
    into 16 pieces per dimension.
    NOTE(review): the tile resolution actually follows from
    ``position_scale`` / ``velocity_scale`` set in ``__init__`` — confirm the
    "16 per dimension" estimate against the tiles3 documentation.
    """

    def __init__(self, step_size, num_tilings=8, max_size=2048):
        """Set up the hash table, the weights and the input scaling.

        Args:
            step_size: overall learning rate; it is divided by
                ``num_tilings`` before being stored.
            num_tilings: number of tilings requested from ``tiles``.
            max_size: the maximum number of indices, which is also the
                length of the weight vector.
        """
        self.max_size = max_size
        self.num_tilings = num_tilings

        # Divide the step size equally between the tilings.
        self.step_size = step_size / num_tilings

        self.hash_table = IHT(max_size)

        # One weight per tile index.
        self.weights = np.zeros(max_size)

        # Position and velocity need scaling to satisfy the tile software.
        self.position_scale = self.num_tilings / (POSITION_MAX - POSITION_MIN)
        self.velocity_scale = self.num_tilings / (VELOCITY_MAX - VELOCITY_MIN)

    def get_active_tiles(self, position, velocity, action):
        """Return the indices of the active tiles for a state and action."""
        # position_scale * (position - POSITION_MIN) would be a cleaner
        # normalization, but position_scale * POSITION_MIN is a constant
        # offset, so it is fine to leave it out.
        active_tiles = tiles(self.hash_table, self.num_tilings,
                             [self.position_scale * position,
                              self.velocity_scale * velocity],
                             [action])
        return active_tiles

    def value(self, position, velocity, action):
        """Estimate the value of a state and action.

        Returns 0.0 when ``position`` equals ``POSITION_MAX`` (the terminal
        position); otherwise the sum of the active tiles' weights.
        """
        if position == POSITION_MAX:
            return 0.0
        active_tiles = self.get_active_tiles(position, velocity, action)
        return np.sum(self.weights[active_tiles])

    def learn(self, position, velocity, action, target):
        """Move the estimate for a state and action toward ``target``.

        Every active tile's weight is shifted by
        ``step_size * (target - current estimate)``.
        """
        active_tiles = self.get_active_tiles(position, velocity, action)
        estimation = np.sum(self.weights[active_tiles])
        # The attribute assigned in __init__ is ``step_size``; the former
        # ``self.stepSize`` lookup raised AttributeError on every call.
        delta = self.step_size * (target - estimation)
        # An explicit loop (not ``weights[active_tiles] += delta``) so that an
        # index appearing more than once is updated once per occurrence.
        for active_tile in active_tiles:
            self.weights[active_tile] += delta

    def cost_to_go(self, position, velocity):
        """Return the negated maximum action value at the given state.

        With -1 reward per step this is the estimated number of steps to
        reach the goal under the current value function.
        """
        costs = [self.value(position, velocity, action) for action in ACTIONS]
        return -np.max(costs)

In [6]:
from TileCoding import *

In [8]:
# Separate index hash table (size 1000) for experimenting with tiles().
hashtable = IHT(1000)

In [11]:
# Two float coordinates (0.5, 0.5) and two integer coordinates (1, 1), with 8 tilings.
tiles(hashtable, 8, [0.5, 0.5], [1, 1])

[0, 1, 2, 3, 4, 5, 6, 7]