In [1]:
#!pip install numpy
#!pip install gym
#!pip install matplotlib
import os
import pickle

import gym
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Bin edges used by toDiscreteStates to turn the continuous observation into
# integer indices: 20 evenly spaced edges per dimension.
# NOTE(review): the ranges presumably mirror MountainCar's position/velocity
# bounds -- confirm against the environment's observation space.
pos_buckets = np.linspace(-1.2, 0.6, num=20)
vel_buckets = np.linspace(-0.07, 0.07, num=20)

In [3]:
def toDiscreteStates(obs):
    """Map a continuous (position, velocity) observation to a pair of bin indices.

    Args:
        obs: a two-element sequence unpacked as (position, velocity).

    Returns:
        A tuple (pos_index, vel_index), each the result of np.digitize
        against pos_buckets / vel_buckets respectively.
    """
    position, velocity = obs
    return (
        np.digitize(position, pos_buckets),
        np.digitize(velocity, vel_buckets),
    )

In [4]:
def max_action(Q, obs, actions=(0, 1, 2)):
    """Return the greedy action for an observation under the table Q.

    Args:
        Q: mapping keyed by (state, action) whose values are action-value
            estimates.
        obs: continuous observation; discretised with toDiscreteStates.
        actions: sequence of candidate actions. The default is an immutable
            tuple so the default cannot be mutated between calls.

    Returns:
        The element of `actions` with the highest Q-value in this state.
        Ties go to the earliest candidate (np.argmax behaviour).

    Raises:
        KeyError: if Q has no entry for (state, action) for some candidate.
    """
    state = toDiscreteStates(obs)
    values = [Q[state, a] for a in actions]

    # np.argmax yields a position inside `values`; translate it back to the
    # action at that position so non-default `actions` give the right answer.
    return actions[int(np.argmax(values))]

In [5]:
# Environment under study.
env = gym.make('MountainCar-v0')
# NOTE(review): writes a private attribute; presumably this raises the
# per-episode step cap to 1000 -- confirm against the installed gym version.
env._max_episode_steps = 1000

# Number of training episodes.
n_games = 50000
# alpha: step size and gamma: discount, both read by updateQ.
# eps: exploration probability, lowered by the training loop after each episode.
alpha, gamma, eps = 0.1, 0.99, 1.0

In [6]:
# Every (position_bin, velocity_bin) pair, in row-major order.
# 21 indices per dimension: np.digitize over 20 edges can return 0..20.
states = [(pos, vel) for pos in range(21) for vel in range(21)]

In [7]:
# Tabular action-value function keyed by (state, action), where state is a
# (position_bin, velocity_bin) tuple; every entry starts at 0.
Q = {(state, action): 0 for state in states for action in [0, 1, 2]}

In [8]:
def saveQ():
    """Pickle the module-level Q table to 'obj/Qtable.pkl'.

    The 'obj' directory is created if it does not exist yet, so the call
    works on a fresh checkout. An existing file is overwritten.
    """
    # open(..., 'wb') does not create missing parent directories.
    os.makedirs('obj', exist_ok=True)
    with open('obj/Qtable.pkl', 'wb') as f:
        pickle.dump(Q, f, pickle.HIGHEST_PROTOCOL)

In [9]:
def loadQ(obs=None, action=None, reward=None, new_obs_new_action=None):
    """Load and return the Q-table pickled at 'obj/Qtable.pkl'.

    Args:
        obs, action, reward, new_obs_new_action: never read. They are kept,
            now with None defaults, purely so existing positional calls keep
            working while a plain loadQ() also succeeds.

    Returns:
        The unpickled object stored in the file.

    Raises:
        Whatever open() or pickle.load() raise, e.g. when the file is missing.
    """
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file; only load tables that this notebook wrote itself.
    with open('obj/Qtable.pkl', 'rb') as f:
        return pickle.load(f)

In [10]:
def updateQ(obs, action, reward, new_obs, new_action):
    """Apply one temporal-difference update to the module-level Q table.

    Args:
        obs: observation before the step (discretised via toDiscreteStates).
        action: action taken from `obs`.
        reward: reward received for the step.
        new_obs: observation after the step (discretised the same way).
        new_action: action whose value at `new_obs` is bootstrapped from.

    The entry for (state, action) moves by alpha times the difference between
    reward + gamma * Q[new_state, new_action] and its current value.
    Returns None; Q is modified in place.
    """
    key = (toDiscreteStates(obs), action)
    next_key = (toDiscreteStates(new_obs), new_action)

    current = Q[key]
    td_error = reward + gamma * Q[next_key] - current
    Q[key] = current + alpha * td_error

In [11]:
def show_results(n_episodes=10, new_eps=0.01):
    """Render episodes played (almost) greedily from the saved Q-table.

    Args:
        n_episodes: number of episodes to play and render.
        new_eps: probability of taking a uniformly random action instead of
            the greedy one at each step.

    The table is obtained from loadQ; if loadQ hands nothing back, the
    module-level Q is used instead. This is playback only: no Q-table is
    modified.
    """
    # loadQ never reads its positional parameters, so None is passed as a
    # placeholder for each of them.
    q_table = loadQ(None, None, None, None)
    if q_table is None:
        q_table = Q

    for _ in range(n_episodes):
        done = False
        obs = env.reset()
        while not done:
            if np.random.random() < new_eps:
                action = np.random.choice([0, 1, 2])
            else:
                action = max_action(q_table, obs)

            # Reward and info are not needed when only replaying the policy.
            obs, _, done, _ = env.step(action)
            env.render()

In [None]:
# Train for n_games episodes with an epsilon-greedy policy, recording each
# episode's total reward in total_rewards.
score = 0
total_rewards = np.zeros(n_games)

for i in range(n_games):
    obs = env.reset()
    done = False
    # Progress line every 100 episodes; `score` still holds the previous
    # episode's total here because it is only reset on the next line.
    if i > 0 and i % 100 == 0:
        print('episode ', i, 'score ', score, 'eps ', eps)
    score = 0
    while not done:
        # Explore with probability eps, otherwise act greedily w.r.t. Q.
        explore = np.random.random() < eps
        action = np.random.choice([0,1,2]) if explore else max_action(Q, obs)

        new_obs, reward, done, info = env.step(action)
        score = score + reward
        # Bootstrap from the greedy action in the successor state.
        new_action = max_action(Q, new_obs)
        updateQ(obs, action, reward, new_obs, new_action)
        obs = new_obs
    total_rewards[i] = score
    # Linear decay of eps by 2/n_games per episode, pinned to 0.01 once it
    # is no longer above that value.
    if eps <= 0.01:
        eps = 0.01
    else:
        eps -= 2/n_games

episode  100 score  -1000.0 eps  0.995999999999996
episode  200 score  -1000.0 eps  0.991999999999992
episode  300 score  -1000.0 eps  0.987999999999988
episode  400 score  -948.0 eps  0.983999999999984
episode  500 score  -1000.0 eps  0.97999999999998
episode  600 score  -1000.0 eps  0.975999999999976
episode  700 score  -1000.0 eps  0.971999999999972
episode  800 score  -1000.0 eps  0.967999999999968
episode  900 score  -1000.0 eps  0.963999999999964
episode  1000 score  -1000.0 eps  0.95999999999996
episode  1100 score  -1000.0 eps  0.955999999999956
episode  1200 score  -1000.0 eps  0.951999999999952
episode  1300 score  -1000.0 eps  0.947999999999948
episode  1400 score  -1000.0 eps  0.943999999999944
episode  1500 score  -1000.0 eps  0.93999999999994
episode  1600 score  -1000.0 eps  0.935999999999936
episode  1700 score  -1000.0 eps  0.931999999999932
episode  1800 score  -1000.0 eps  0.927999999999928
episode  1900 score  -724.0 eps  0.923999999999924
episode  2000 score  -1000

In [None]:
# Persist the trained table, replay a few rendered episodes, then plot the
# learning curve.
saveQ()
show_results()

# Trailing mean of the episode scores: episode r averages episodes
# max(0, r-50)..r inclusive, i.e. up to 51 values.
mean_rewards = np.zeros(n_games)
for r in range(n_games):
    mean_rewards[r] = np.mean(total_rewards[max(0, r-50):(r+1)])

plt.plot(mean_rewards)
plt.xlabel('episode')
plt.ylabel('mean score (trailing 51 episodes)')
plt.title('MountainCar-v0 Q-learning')
plt.savefig('mountaincar.png')