In [1]:
import gym
from gym import wrappers
import random
import math
import matplotlib.pyplot as plt
from itertools import count
import numpy as np
from collections import namedtuple
import pandas as pd

In [2]:
env = gym.make('CartPole-v0')

In [3]:
POS = 2.4
VEL = 3.7
ANG = 0.27
AGV = 3.5

class QTable:
    def __init__(self):
        # state/action/value : Q
        # state => (act_0_value, act_1_value)
        # Qsa => [Q(s, a0), Q(s, a1)]
        # Qsa0 => Qsa[0] => Q(s, a0)
        # Qsa1 => Qsa[1] => Q(s, a1)
        self.Q = {}
        
        # state => (act_0_cnt, act_1_cnt)
        # Nsa => [N(s, a0), N(s, a1)]
        self.N = {}

    # set v(s, a)
    def set_v(self, state, action, value):
        state_k = self._get_state_key(state)
        if state_k not in self.Q:
            self.Q[state_k] = (0, 0)

        Qsa0, Qsa1 = self.Q[state_k]
        Qsa = (value if action == 0 else Qsa0, value if action == 1 else Qsa1)
        self.Q[state_k] = Qsa
        return (Qsa[0], Qsa[1], state_k)

    # get (Q(s, a0), Q(s, a1))
    def get_v(self, state):
        state_k = self._get_state_key(state)
        if state_k not in self.Q:
            return (0, 0, state_k)
        
        Qsa = self.Q[state_k]
        return (Qsa[0], Qsa[1], state_k)
    
    def get_cnt(self, state):
        state_k = self._get_state_key(state)
        if state_k not in self.N:
            return (0, 0, state_k)
        
        Nsa = self.N[state_k]
        return (Nsa[0], Nsa[1], state_k)

    def inc_cnt(self, state, action):
        state_k = self._get_state_key(state)
        if state_k not in self.N:
            self.N[state_k] = (0, 0)

        Nsa0, Nsa1 = self.N[state_k]
        Nsa = (Nsa0+1 if action == 0 else Nsa0, Nsa1+1 if action == 1 else Nsa1)
        self.N[state_k] = Nsa
        return (Nsa[0], Nsa[1], state_k)
        
    # return maxarg Q(s|a') -> (action, value)
    def get_max_action_value(self, state):
        state_k = self._get_state_key(state)
        a0v, a1v, _ = self.get_v(state)
        return (0, a0v) if a0v > a1v else (1, a1v)

    def sample(self, size):
        state_ks = random.sample(list(self.Q), size) 
        return [self.Q[k] for k in state_ks]

    def _get_state_key(self, state):
        pos = state[0] / POS * 10
        vel = state[1] / VEL * 10
        ang = state[2] / ANG * 10
        agv = state[3] / AGV * 10
        return '{:.0f},{:.0f},{:.0f},{:.0f}'.format(pos, state[1], state[2], state[3])
        
    def __len__(self):
        return len(self.Q)
    
    def get_dataframe(self):
        Qsa = np.array([(k, s[0], s[1]) for k, s in self.Q.items()])
        Nsa = np.array([(k, s[0], s[1]) for k, s in self.N.items()])
        dfQ = pd.DataFrame(Qsa, columns=['state', 'Q a0', 'Q a1'])
        dfQ = dfQ.set_index('state')
        
        dfN = pd.DataFrame(Nsa, columns=['state', 'N a0', 'N a1'])
        dfN = dfN.set_index('state')
        
        df = dfQ+dfN
        return df
    
q_table = QTable()

In [4]:
class Agent:
    EPS_START = 0.9  # e-greedy threshold start value
    EPS_END = 0.1  # e-greedy threshold end value
    EPS_DECAY = 200  # e-greedy threshold decay
    AVE = 100        
    ALPHA = 0.05
    GAMMA = 0.95
    MC_GAMMA = 0.3
        
    def __init__(self):
        self.episode_durations = []
        self.sar_list = []
        self.steps_done = 0
        self.cur_state = None
        self.plot_kernel = np.ones(self.AVE) / self.AVE
        
    def select_action(self, state):
        self.cur_state = state
        self.steps_done += 1
        act = q_table.get_max_action_value(state)
        return self._e_greedy(act[0])

    def _e_greedy(self, act):
        sample = random.random()
        eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * math.exp(-1. * self.steps_done / self.EPS_DECAY)
        if (act is not None) and (sample > eps_threshold):
            return act
        return random.randrange(2)
    
    # Temporal Difference
    def TD_learn_state(self, next_state, action, reward):

        
        max_next_value = q_table.get_max_action_value(next_state)[1]
        sav = q_table.get_v(self.cur_state)[action]
        sav = sav + self.ALPHA * (reward + self.GAMMA * max_next_value - sav)
        q_table.set_v(self.cur_state, action, sav)
        self.cur_state = next_state
    
    # Monte carlo
    def MC_keep_reward(self, state, action, reward):
        self.sar_list.append((state, action, reward))
        q_table.inc_cnt(state, action)

    def MC_learn_episode(self):
        states, actions, rewards = zip(*self.sar_list)
        n_rewards = len(rewards)
        
        for t in range(n_rewards):
            s = states[t]
            a = actions[t]
            Gt = np.array([rewards[p] * self.MC_GAMMA**i for i, p  in enumerate(range(t, n_rewards))]).sum()
            sav = q_table.get_v(s)[a]
            cnt = q_table.get_cnt(s)[a]
            sav += (Gt - sav)/cnt
            q_table.set_v(s, a, sav)
        
        self.sar_list = []

    def plot_durations(self, t, rt_plot):
        self.episode_durations.append(t)
        means = None
        
        if rt_plot:
            plt.figure(2)
            plt.clf()
            plt.title('Training...')
            plt.xlabel('Episode')
            plt.ylabel('Duration')
            plt.plot(self.episode_durations)
            
            if len(self.episode_durations) >= self.AVE:
                means = np.convolve(self.episode_durations, self.plot_kernel, 'valid')
                plt.plot(means)

            plt.pause(0.001)  # pause a bit so that plots are updated
        
        return 0 if means is None else means[-1]
        
agent = Agent()

In [5]:
EPISODES = 100  # number of episodes
MAX_STEP = 200

def run_episode(rt_plot):
    plt.ion()
    
    for e in range(EPISODES):
        state = env.reset()
        mean = 0
        for t in count(): 
            #env.render()
            action = agent.select_action(state)
            next_state, reward, done, _ = env.step(action)
           
            if done and t < MAX_STEP - 1:
                reward = -MAX_STEP
            
            agent.MC_keep_reward(state, action, reward)
            #agent.TD_learn_state(state, action, reward)
            
            state = next_state
            
            if done:
                agent.MC_learn_episode()
                mean = agent.plot_durations(t, rt_plot)
                break
        
        if (e+1) % 1000 == 0:
            print("{2} Episode {0} finished after ave.{1} steps".format(e, mean, '\033[92m' if mean>= 195 else '\033[99m'))
  
    env.close()
    plt.ioff()
    plt.show()

In [8]:
EPISODES = 1000
%matplotlib
run_episode(True)
%matplotlib inline

agent.plot_durations(0, True)

Using matplotlib backend: TkAgg


KeyboardInterrupt: 

In [8]:
from itertools import count
rewards = []
states = []

for trails in range(100):
    state = env.reset()
    reward = 0
    for t in count():
        #env.render()
        action = agent.select_action(state)
        state, r, done, _ = env.step(action)
        states.append(state)
        reward += r
        if done:
            rewards.append(reward)
            #print(reward)
            break

env.close()
print("rewards:", np.average(rewards))
pos, vel, ang, avel = zip(*states) 
print("pos", np.array(pos).min(), np.array(pos).max())
print("vel", np.array(vel).min(), np.array(vel).max())
print("ang", np.array(ang).min(), np.array(ang).max())
print("avel", np.array(avel).min(), np.array(avel).max())

rewards: 133.31
pos -1.4620462119926267 1.5939351126502084
vel -2.0472172504582313 2.668858053450312
ang -0.25152135311153184 0.257933930763401
avel -3.2706476417484476 2.7884078001497468
