In [1]:
import gym
import matplotlib.pyplot as plt
import numpy as np

# Create the Mountain Car environment and reset it once so that it starts
# from an initial state.
env = gym.make('MountainCar-v0')
env.reset()

# Raise the per-episode step limit to 5000 (the default is 200) by overwriting
# the private `_max_episode_steps` attribute on `env`.
# NOTE(review): this depends on a private attribute of the object returned by
# gym.make -- confirm the installed gym version honors it.
env._max_episode_steps = 5000

# Q-learning helpers and training function
def _discretize(state, obs_low, scale):
    """Map a continuous observation to integer Q-table indices.

    The observation is shifted so that the lower bound of the observation
    space becomes zero, scaled per dimension and rounded to the nearest
    integer.
    """
    return np.round((state - obs_low) * scale, 0).astype(int)


def _reset_env(env):
    """Reset ``env`` and return only the initial observation.

    Legacy gym returns the observation directly, while newer gym/gymnasium
    releases return an ``(observation, info)`` tuple; both are accepted.
    """
    result = env.reset()
    return result[0] if isinstance(result, tuple) else result


def _step_env(env, action):
    """Advance ``env`` by one step and return ``(observation, reward, done)``.

    Accepts both the legacy 4-tuple ``(obs, reward, done, info)`` and the
    newer 5-tuple ``(obs, reward, terminated, truncated, info)``; in the
    latter case the episode is over when either flag is set.
    """
    result = env.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _ = result
        return observation, reward, bool(terminated or truncated)
    observation, reward, done, _ = result
    return observation, reward, done


def QLearning(env, learning, discount, epsilon, min_eps, episodes):
    """Train a tabular Q-learning agent with an epsilon-greedy policy.

    Args:
        env: Environment exposing ``observation_space.low`` / ``.high``
            (two-dimensional), ``action_space.n``, ``reset()``,
            ``step(action)`` and ``close()``.
        learning: Learning rate applied to every temporal-difference update.
        discount: Discount factor applied to the best next-state Q-value.
        epsilon: Initial probability of choosing a random action.
        min_eps: Value at which epsilon stops decaying.
        episodes: Number of episodes to run. Zero (or a negative number)
            runs no episode and returns the untouched initial table.

    Returns:
        A tuple ``(ave_reward_list, Q)``. ``ave_reward_list`` holds the mean
        total reward of each completed block of 500 episodes (episodes after
        the last full block are not included). ``Q`` is the learned table
        indexed as ``Q[dim0_index, dim1_index, action]``.

    Side effects:
        Sets ``env._max_episode_steps`` to 5000, prints one progress line
        per 500 episodes and closes ``env`` before returning.
    """
    # Scale factors that define the resolution of the discretized state space.
    scale = np.array([100, 1000])
    obs_low = env.observation_space.low
    obs_high = env.observation_space.high
    n_actions = env.action_space.n

    # Number of discrete bins along each observation dimension.
    num_states = np.round((obs_high - obs_low) * scale, 0).astype(int) + 1

    # Initialize Q table. low == high, so every entry is exactly 1.0
    # (use low = -1 for a random initialization instead).
    Q = np.random.uniform(low=1, high=1,
                          size=(num_states[0], num_states[1], n_actions))

    # Initialize variables to track rewards
    reward_list = []
    ave_reward_list = []

    # Per-episode linear reduction of epsilon. Guard against episodes == 0,
    # which previously raised ZeroDivisionError.
    reduction = (epsilon - min_eps) / episodes if episodes > 0 else 0.0

    # Default is 200. Setting it once is enough; it does not change between
    # episodes.
    env._max_episode_steps = 5000

    # Run Q learning algorithm
    for i in range(episodes):
        tot_reward = 0
        done = False
        state_adj = _discretize(_reset_env(env), obs_low, scale)

        while not done:
            # Epsilon-greedy: exploit with probability 1 - epsilon,
            # otherwise pick a uniformly random action.
            if np.random.random() < 1 - epsilon:
                action = np.argmax(Q[state_adj[0], state_adj[1]])
            else:
                action = np.random.randint(0, n_actions)

            # Get next state and reward
            state2, reward, done = _step_env(env, action)
            state2_adj = _discretize(state2, obs_low, scale)

            if done and state2[0] >= 0.5:
                # Episode ended at position >= 0.5, which is treated as the
                # terminal state: store the reward without bootstrapping.
                Q[state_adj[0], state_adj[1], action] = reward
            else:
                # Standard temporal-difference update towards
                # reward + discount * max_a' Q(next_state, a').
                best_next = np.max(Q[state2_adj[0], state2_adj[1]])
                current = Q[state_adj[0], state_adj[1], action]
                delta = learning * (reward + discount * best_next - current)
                Q[state_adj[0], state_adj[1], action] += delta

            # Update variables
            tot_reward += reward
            state_adj = state2_adj

        # Decay epsilon, clamping so that floating-point accumulation can
        # never push it below min_eps.
        if epsilon > min_eps:
            epsilon = max(min_eps, epsilon - reduction)

        # Track rewards
        reward_list.append(tot_reward)

        if (i + 1) % 500 == 0:
            ave_reward = np.mean(reward_list)
            ave_reward_list.append(ave_reward)
            reward_list = []
            print('Episode {} Average Reward: {}'.format(i+1, ave_reward))
    env.close()

    return ave_reward_list, Q

# Run Q-learning algorithm
# env, learning, discount, epsilon, min_eps, episodes
# Optimal: env, 0.2, 0.98, 0.95, 0, 100000
#rewards, Q = QLearning(env, 0.2, 0.98, 0.95, 0, 100000)


In [74]:
import plotly.graph_objects as go

# Train the agent.
# QLearning arguments: env, learning, discount, epsilon, min_eps, episodes
rewards, Q = QLearning(env, 0.2, 0.98, 0.95, 0, 100000)


def plot_state_heatmap(values, title):
    """Show a titled heatmap of a per-state 2-D array and return the figure.

    Rows of ``values`` are drawn along the y axis (labelled 'Position') and
    columns along the x axis (labelled 'Velocity'). Both axes are in
    discretized bin indices, not physical units.
    """
    heatmap_fig = go.Figure(data=go.Heatmap(z=values))
    heatmap_fig.update_layout(title_text=title)
    heatmap_fig.update_xaxes(title_text='Velocity')
    heatmap_fig.update_yaxes(title_text='Position')
    heatmap_fig.show()
    return heatmap_fig


# NOTE: the two names below are kept for compatibility with any later cells,
# but they are swapped relative to their contents: `Q_opt` holds the index of
# the greedy action for every state, and `pi_opt` holds the largest Q-value
# for every state.
Q_opt = np.argmax(Q, axis= 2)
fig = plot_state_heatmap(Q_opt, 'Greedy action per state (argmax of Q)')

pi_opt = np.max(Q, axis= 2)
fig = plot_state_heatmap(pi_opt, 'Best Q-value per state (max of Q)')

In [2]:
import math