In [79]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import random
from mountain_car import MountainCar
from tilecoding import TileCoder
%matplotlib inline

In [2]:
env = gym.make('MountainCar-v0')

In [3]:
tiles_per_dim = [8, 8]
pos_lims = (env.observation_space.low[0], env.observation_space.high[0])
spd_lims = (env.observation_space.low[1], env.observation_space.high[1])

In [4]:
lims = [pos_lims, spd_lims]
n_tilings = 4

In [5]:
T = TileCoder(tiles_per_dim, lims, n_tilings)

In [6]:
T[env.reset()]

array([ 39, 120, 201, 283])

In [19]:
qtest= np.zeros((T.n_tiles, env.action_space.n))

In [37]:
np.array([qtest[T[env.reset()]]])

array([[[0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.],
        [0., 0., 0.]]])

In [36]:
qtest[T[env.reset()]].shape

(4, 3)

In [31]:
qtest[T[env.reset()], 1]

array([0., 0., 0., 0.])

In [26]:
np.sum(qtest[T[env.reset()]])

0.0

In [29]:
a = [1,2,3]
b = [4,5,6]
c = [7,8,9]

abc = np.array([a,b,c])

In [66]:
a = T[env.reset()]

In [67]:
qtest[a, 1] = 10

In [68]:
qtest[a]

array([[ 0., 10.,  0.],
       [ 0., 10.,  0.],
       [ 0., 10.,  0.],
       [ 0., 10.,  0.]])

In [87]:
np.max(sum(qtest[a])/4)

10.0

In [84]:
sum(qtest[a, 1])/4

10.0

In [85]:
class MountainCarTiles:
    def __init__(self, env, algo, alpha, gamma, epsilon, tiles_per_dim, n_tilings):
        self.env = env
        self.algo = algo
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.tiles_per_dim = tiles_per_dim
        self.n_tilings = n_tilings
        lims = [(self.env.observation_space.low[0], self.env.observation_space.high[0]), /
                (self.env.observation_space.low[1], self.env.observation_space.high[1])]
        self.T = TileCoder(self.tiles_per_dim, lims, self.n_tilings)
        self.Q = np.zeros((self.T.n_tiles, self.env.action_space.n))
        self.ep_reward = []
        self.reward_list = []
        self.episodes = 0
    
    def _discretize_state(self, state):
        return T[state]
    
    def _get_mean_tile_Q(self, discrete_state):
        return sum(self.Q[discrete_state])/self.n_tilings
        
    def _epsilon_greed(self, discrete_state):
        if random.random() < self.epsilon:
            return random.randint(0, self.env.action_space.n-1)
        else:
            # use mean of states' rewards as reward
            return np.argmax(self._get_mean_tile_Q(discrete_state))
    
    def _get_expected_reward(self, discrete_state):
        actions = [i for i in range(self.env.action_space.n)]
        # Find greedy action, (1-ep)*greedy reward
        greedy_action = np.argmax(self._get_mean_tile_Q(discrete_state))
        greedy_reward = (1-self.epsilon) * self._get_mean_tile_Q(discrete_state)[greedy_action]
        # other actions, ep/n_actions * reward
        other_rewards = 0
        for a in actions:
            other_rewards += (self.epsilon/len(actions)) * self._get_mean_tile_Q(discrete_state)[a]
        return greedy_reward + other_rewards
    
    def _calc_avg_reward(self):
        avg_ep_reward = np.mean(self.ep_reward)
        self.reward_list.append(avg_ep_reward)
        self.ep_reward = []
        return(avg_ep_reward)
    
    def run_episode(self, decay, render = False):
        if self.episodes % 500 == 0 or render:
            print(f'Running episode {self.episodes} using {self.algo}, epsilon={round(self.epsilon,5)}, alpha={self.alpha}, discount={self.gamma}')
        end = False
        total_reward, reward = 0,0
        # Initial State
        S = self.env.reset()
        # Discreize State
        S_dis = self._discretize_state(S)
        while not end:
            if render:
                env.render()
                
            action = self._epsilon_greed(S_dis)
            S_next, reward, end, _ = self.env.step(action)
            S_dis_next = self._discretize_state(S_next)
            
            # If end of episode
            if end and S_next[0] >= 0.5:
                self.Q[S_dis, action] = reward
                
            # otherwise update according to chosen algorithm
            else:
                if self.algo == 'q':
                    next_reward = np.max(self._get_mean_tile_Q(S_dis_next))
                elif self.algo == 'expected_sarsa':
                    next_reward = self._get_expected_reward(S_dis_next)
                elif self.algo == 'sarsa':
                    next_reward = self._get_mean_tile_Q(S_dis_next)[self._epsilon_greed(S_dis_next)]
                    
                delta = self.alpha * (reward + (self.gamma * next_reward) - self._get_mean_tile_Q(S_dis)[action])
                self._get_mean_tile_Q(S_dis_next)[action] += delta
                
            S_dis = S_dis_next
            total_reward += reward
            
        if render:
            if (S_next[0] >= 0.5):
                print('Success :)')
            else: 
                print('Failure :(')
            time.sleep(1)
            
        self.ep_reward.append(total_reward)
        self.episodes += 1
        self.epsilon -= decay
        if self.episodes % 100 == 0:
            avg_reward = self._calc_avg_reward()
            if self.episodes % 500 == 0:
                print(f'Avg Reward Over Last 100 Episodes = {avg_reward}...')

SyntaxError: invalid syntax (<ipython-input-85-d3715ab97b8c>, line 10)