In [None]:
import gymnasium as gym
import numpy as np
import matplotlib.pyplot as plt
# Build the CartPole-v1 environment, requesting on-screen ("human") rendering.
env = gym.make("CartPole-v1", render_mode="human")
# Record the installed gymnasium version next to the results.
print(gym.__version__)

In [None]:
# Observation layout (4 values): cart position, cart velocity, pole angle, pole angular velocity.
# Print the observation shape, its lower/upper bounds, and the action space, in that order.
obs_space = env.observation_space
for summary in (obs_space.shape, obs_space.low, obs_space.high, env.action_space):
    print(summary)

In [None]:
# The episode terminates if the pole angle is not in the range (-.2095, .2095)

# random action
# Roll out 10 episodes (at most 100 steps each) with uniformly random actions.
for episode in range(10):
    state, info = env.reset()
    score = 0
    for t in range(100):
        action = env.action_space.sample()
        # No explicit env.render(): env is created with render_mode="human",
        # in which case the frame is already drawn as part of step().
        state, reward, done, truncated, info = env.step(action)
        score += reward
        # reward*(not done) displays 0 for the step on which the episode terminated
        print('step {}, action {}, reward {}, state {}'.format(t,action,reward*(not done), state))
        # The environment must be reset once an episode has terminated or been
        # truncated, so stop stepping instead of continuing to 100 steps.
        if done or truncated:
            break
    # Report the episode score once, after the episode has finished.
    print(f"Episode {episode} score {score}")

In [22]:
# agent class that will be used in Q-learning

class Agent:
    """Tabular Q-learning agent over a coarsely discretized 4-component state.

    Each observation component is binned with the edges in ``_BINS``; the
    Q-values are stored in ``theta`` and indexed as
    ``theta[bin_0, bin_1, bin_2, bin_3, action]`` with two actions (0 and 1).

    Parameters
    ----------
    policy : str
        ``'random'`` (uniform over the two actions) or ``'eps_greedy'``.
    """

    # Bin edges for each observation component, in observation order.
    # The single 1e20 edge places every value below 1e20 in bin 0, so the second
    # component never distinguishes states (theta has size 1 along that axis).
    _BINS = (
        np.array([0]),
        np.array([1e20]),
        np.array([-0.2, 0, 0.2]),
        np.array([-0.3, 0.3]),
    )

    def __init__(self, policy='random'):
        self.total_reward = 0
        self.policy = policy
        self.alpha = 0.1  # learning rate
        # Exploration probability. np.random.rand() is in [0, 1), so with a value
        # of 1 the 'eps_greedy' policy never takes the greedy branch.
        self.epsilon = 1
        self.gamma = 1 # discount factor, DON'T CHANGE THIS VALUE FOR HOMEWORK problem 2.
        # Q-table: one axis per discretized observation component, last axis = action.
        self.theta = np.zeros([2,1,4,3,2])

    def obs_index(self, state):
        """Map a continuous state to a tuple of bin indices (one per component)."""
        ind = np.zeros(len(self._BINS)).astype(int)
        for i, value in enumerate(state):
            ind[i] = np.digitize(value, self._BINS[i])
        return tuple(ind)

    def q(self, state):
        """Return the array of Q-values (one entry per action) for `state`."""
        return self.theta[self.obs_index(state)]

    def q_update(self, last_state, action, reward, state, terminated=False):
        """Apply one Q-learning update for the transition (last_state, action) -> state.

        Parameters
        ----------
        last_state, state : sequence of 4 floats
            State before and after taking `action`.
        action : int
            Action taken in `last_state` (0 or 1).
        reward : float
            Reward received for the transition.
        terminated : bool, optional
            Pass True when `state` is terminal so that the target is `reward`
            alone. The default (False) always bootstraps from `state`, which is
            the behaviour of the four-argument call.
        """
        ind_last = self.obs_index(last_state) + (action,)
        # A terminal state has no future return, so it must not be bootstrapped from.
        future = 0.0 if terminated else np.max(self.theta[self.obs_index(state)])
        delta = reward + self.gamma * future - self.theta[ind_last]
        self.theta[ind_last] += self.alpha * delta

    def choose_action(self, state, episode):
        """Pick an action (0 or 1) for `state` according to `self.policy`.

        `episode` is accepted for interface compatibility but is not used.

        Raises
        ------
        ValueError
            If `self.policy` is neither 'random' nor 'eps_greedy'.
        """
        if self.policy == 'random':
            return int(np.round(np.random.random()))
        if self.policy == 'eps_greedy':
            if np.random.rand() > self.epsilon:
                q_values = self.q(state)
                # Ties go to action 1.
                return 0 if q_values[0] > q_values[1] else 1
            return int(np.round(np.random.random()))
        raise ValueError(
            "unknown policy {!r}; expected 'random' or 'eps_greedy'".format(self.policy)
        )

    def gather_reward(self, reward, t):
        """Add `reward`, discounted by gamma**t, to the running episode return."""
        self.total_reward += (self.gamma**t)*reward

    def get_total_reward(self):
        """Return the accumulated (discounted) reward."""
        return self.total_reward

    def set_total_reward(self, new_total):
        """Overwrite the accumulated reward (e.g. reset to 0 at episode start)."""
        self.total_reward = new_total
            
        
    

In [None]:
# Show the Q-values (one per action) stored for discretized state index (1, 0, 0, 0).
# NOTE(review): `agent` is first assigned in the training cell further down, so on a
# fresh kernel this cell raises NameError unless it is run after training.
agent.theta[1,0, 0,0,]

In [None]:
# Train tabular Q-learning with an epsilon-greedy behaviour policy.
policy = 'eps_greedy'
agent = Agent(policy)
ep_rewards = []  # total reward collected in each training episode

epi_length = 500 # number of episodes in training
maxT = 51        # maximal number of steps in each episode
for episode in range(epi_length):
    last_state, info = env.reset()
    agent.set_total_reward(0)
    for t in range(maxT):
        action = agent.choose_action(last_state, episode)
        state, reward, terminated, truncated, info = env.step(action)
        # The environment has to be reset after either flag, so both end the episode.
        done = terminated or truncated
        agent.gather_reward(reward, t)
        agent.q_update(last_state, action, reward, state)
        last_state = state
        # Stop on episode end or when the per-episode step budget is used up.
        if done or t == maxT - 1:
            ep_rewards.append(agent.get_total_reward())
            print(episode, t, agent.get_total_reward())
            break

In [None]:
# Q-function estimate produced by Q-learning.
# Take a copy rather than an alias so that any further q_update calls on `agent`
# cannot silently change the table used for evaluation.
opt_theta = agent.theta.copy()
opt_theta

In [34]:
# optimal policy learned from Q-learning
def opt_action(theta, state):
    """Return the greedy action (0 or 1) for `state` under the Q-table `theta`.

    The state is discretized with the module-level `agent.obs_index`, so an
    `agent` object must exist before this function is called. Action 0 is
    chosen only when its Q-value is strictly larger; ties go to action 1.
    """
    action_values = theta[agent.obs_index(state)]
    return 0 if action_values[0] > action_values[1] else 1

In [None]:
# Run the estimated optimal policy on CartPole: 50 episodes, at most 201 steps each.
n_eval_episodes = 50
max_eval_steps = 201

ep_rewards = []
for episode in range(n_eval_episodes):
    last_state, info = env.reset()
    agent.set_total_reward(0)
    for t in range(max_eval_steps):
        action = opt_action(opt_theta, last_state)
        # No explicit env.render(): env is created with render_mode="human",
        # in which case the frame is already drawn as part of step().
        state, reward, terminated, truncated, info = env.step(action)
        # The environment has to be reset after either flag, so both end the episode.
        done = terminated or truncated
        agent.gather_reward(reward, t)
        last_state = state
        if done or t == max_eval_steps - 1:
            ep_rewards.append(agent.get_total_reward())
            print(episode, t, agent.get_total_reward())
            break

# Summarize and plot the per-episode returns.
avg_reward = np.round(np.mean(ep_rewards),1)
sd_reward = np.round(np.std(ep_rewards),1)
plt.plot(ep_rewards)
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('Performance of Estimated Optimal Policy on CartPole')
# The caption reports the number of episodes actually recorded, so it stays in
# sync with n_eval_episodes.
plt.figtext(0.5, -0.1, f"Total return per episode (mean ± sd over {len(ep_rewards)} episodes): {avg_reward} ± {sd_reward}", wrap=True, horizontalalignment='center', fontsize=12)
plt.show()