In [None]:
# Visualise

In [None]:
import gym
# Roll out one episode (at most 100 steps) of a uniformly random agent,
# rendering every frame and printing the per-step transition.
env = gym.make('CartPole-v0')
try:
    # The observation-space upper bounds are a property of the environment,
    # so they are reported once here rather than on every step.
    print(env.observation_space.high)
    for i_episode in range(1):
        observation = env.reset()
        for t in range(100):
            env.render()
            action = env.action_space.sample()
            observation, reward, done, info = env.step(action)
            print(reward,done,observation)
            if done:
                print("Episode finished after {} timesteps".format(t+1))
                break
finally:
    # Always release the environment, even if rendering or stepping raised
    # part-way through the episode.
    env.close()

In [None]:
### Policy Gradient ###

In [None]:
import gym
import numpy as np
import matplotlib.pyplot as plt
import copy 

# --- Hyperparameters ---
NUM_EPISODES = 10000      # number of training episodes
LEARNING_RATE = 2.5e-5    # step size applied to each gradient update
GAMMA = 0.99              # discount factor

# Environment, plus the size of its discrete action set
env = gym.make('CartPole-v0')
nA = env.action_space.n

# Seed NumPy's global RNG first so the initial weights are reproducible,
# then draw the 4x2 weight matrix uniformly from [0, 1).
np.random.seed(1)
w = np.random.rand(4, 2)

# Per-episode scores, appended to by the training loop
episode_rewards = list()

# Our policy
def policy(state,w):
    """Return softmax action probabilities for a linear policy.

    Args:
        state: array whose last axis matches ``w``'s first axis; the training
            loop passes a single observation as a ``(1, n_features)`` row.
        w: weight matrix of shape ``(n_features, n_actions)``.

    Returns:
        Array shaped like ``state.dot(w)`` whose entries along the last axis
        are non-negative and sum to 1.
    """
    z = state.dot(w)
    # Softmax is invariant to adding a constant to the logits, so shift by the
    # row maximum: the largest exponent becomes 0 and np.exp can no longer
    # overflow to inf (which previously produced NaN probabilities).
    z = z - np.max(z, axis=-1, keepdims=True)
    exp = np.exp(z)
    # Normalise per row so a batch of states yields one distribution per state.
    return exp/np.sum(exp, axis=-1, keepdims=True)

# Vectorized softmax Jacobian
def softmax_grad(softmax):
    """Jacobian of the softmax output with respect to its logits.

    Args:
        softmax: array of probabilities; it is flattened, so any shape holding
            ``n`` values is accepted.

    Returns:
        ``(n, n)`` matrix ``diag(p) - p p^T``.
    """
    probs_flat = softmax.reshape(-1)
    column = probs_flat.reshape(-1, 1)
    outer_product = column.dot(column.T)
    return np.diag(probs_flat) - outer_product

# Main loop 
# REINFORCE: play one full episode with the current policy, remember the
# log-policy gradient and reward of every step, then update the weights once
# the episode is over.
for e in range(NUM_EPISODES):

    # Observations are kept as (1, n_features) row vectors.
    state = env.reset()[None,:]

    grads = []
    rewards = []

    # single episode score
    score = 0

    while True:

        # Uncomment to visualise
        #env.render()

        # Sample from policy and take action in environment
        probs = policy(state,w)
        action = np.random.choice(nA,p=probs[0])
        next_state,reward,done,_ = env.step(action)
        next_state = next_state[None,:]

        # Gradient of log pi(action | state) w.r.t. the weights: the softmax
        # Jacobian row for the chosen action, divided by that action's
        # probability, lifted to weight space through the state.
        dsoftmax = softmax_grad(probs)[action,:]
        dlog = dsoftmax / probs[0,action]
        grad = state.T.dot(dlog[None,:])

        grads.append(grad)
        rewards.append(reward)

        score+=reward

        # update
        state = next_state

        if done:
            break

    # Discounted future return for every step, G_i = sum_t GAMMA**t * r_{i+t}.
    # The discount exponent is the time offset t, not the reward value.
    # Accumulating backwards gives all returns in a single pass.
    returns = [0.0] * len(rewards)
    running_return = 0.0
    for i in reversed(range(len(rewards))):
        running_return = rewards[i] + GAMMA * running_return
        returns[i] = running_return

    # Weight update: move towards each step's log-policy gradient, scaled by
    # the discounted **FUTURE** reward that followed that step.
    for step_grad, step_return in zip(grads, returns):
        w += LEARNING_RATE * step_grad * step_return

    # Append for logging and print
    episode_rewards.append(score) 
    print("EP: " + str(e) + " Score: " + str(score) + "         ",end="\r", flush=False) 
# Plot the score of every training episode, then release the environment.
# This cell only imports `matplotlib.pyplot as plt`, which does not bind the
# name `matplotlib`, so the figure size is configured through `plt.rc`.
plt.rc('figure', figsize=(10, 5))
plt.plot(np.arange(NUM_EPISODES),episode_rewards)
plt.xlabel("Episode")
plt.ylabel("Score")
plt.show()
env.close()

In [None]:
import matplotlib
# Re-plot the per-episode scores on a very wide (100 x 5) canvas so that
# individual episodes can be told apart.
matplotlib.rcParams['figure.figsize'] = (100, 5)
plt.plot(np.arange(NUM_EPISODES), episode_rewards)

In [None]:
### Q-Learning ###

In [None]:
from sklearn.preprocessing import KBinsDiscretizer
import numpy as np 
import time, math, random
from typing import Tuple 
import gym

env = gym.make('CartPole-v1')

# Only two features are discretised: the pole angle (6 bins) and the pole
# angular velocity (12 bins). The angle range is taken from the observation
# space (index 2); the velocity range is fixed at +/- radians(50).
n_bins = (6, 12)
lower_bounds = [env.observation_space.low[2], math.radians(-50)]
upper_bounds = [env.observation_space.high[2], math.radians(50)]

# Fit the uniform-width binner once. Its bin edges depend only on the fixed
# bounds and bin counts, so re-creating and re-fitting it on every call (once
# per environment step) is wasted work.
_state_binner = KBinsDiscretizer(n_bins=n_bins, encode='ordinal', strategy='uniform')
_state_binner.fit([lower_bounds, upper_bounds])

def discretizer( _ , __ , angle, pole_velocity ) -> Tuple[int,...]:
    """Convert a continuous state into a discrete state.

    The first two observation components are ignored; only ``angle`` and
    ``pole_velocity`` are binned.

    Returns:
        Tuple ``(angle_bin, velocity_bin)`` of ints, usable as an index into
        ``Q_table``.
    """
    return tuple(map(int,_state_binner.transform([[angle, pole_velocity]])[0]))

# Action-value table, zero-initialised: one axis per discretised feature
# followed by one axis over actions.
Q_table = np.zeros((*n_bins, env.action_space.n))


In [None]:
n_episodes = 10000 

for e in range(n_episodes):

    # Exploration rate and learning rate follow the same log-decay schedule and
    # depend only on the episode index, so compute them once per episode
    # instead of once per step.
    decay = 1.0 - math.log10((e + 1) / 25)
    exploration_rate = max(0.1, min(1.0, decay))
    lr = max(0.01, min(1.0, decay))

    # Discretize state into buckets
    current_state, done = discretizer(*env.reset()), False
    score=0
    while not done:

        # Epsilon-greedy action selection
        if np.random.random() < exploration_rate:
            action = env.action_space.sample() # explore
        else:
            action = np.argmax(Q_table[current_state]) # exploit

        # increment environment
        obs, reward, done, _ = env.step(action)
        new_state = discretizer(*obs)
        score+=reward

        # Update Q-Table (discount factor of 1).
        # NOTE(review): the target bootstraps from new_state even when `done`
        # is True; confirm whether terminal transitions should use `reward`
        # alone.
        future_optimal_value = np.max(Q_table[new_state])
        learned_value = reward + 1 * future_optimal_value
        old_value = Q_table[current_state][action]
        Q_table[current_state][action] = (1-lr)*old_value + lr*learned_value

        current_state = new_state

        # uncomment to render the cartpole environment
        #env.render()
    print("EP: " + str(e) + " Score: " + str(score) + "         ",end="\r", flush=False)