In [1]:
import gym
import numpy as np

# Create the CartPole-v0 environment that the prediction runs in the cells below share.
env = gym.make("CartPole-v0")

def random_policy():
    """Choose one of the two actions (0 or 1) at random.

    The policy ignores the environment state; each call draws a fresh
    integer from NumPy's global random generator.
    """
    n_actions = 2
    return np.random.randint(0, n_actions)

def v_hat(s, w):
    """Linear value estimate: the dot product of features ``s`` and weights ``w``.

    ``np.dot`` is used on purpose, so a scalar ``w`` simply scales ``s``
    instead of raising.
    """
    estimate = np.dot(s, w)
    return estimate

def _reset_env(env):
    """Reset ``env`` and return only the initial observation.

    Newer Gym/Gymnasium releases return ``(observation, info)`` from
    ``reset()`` while older releases return the observation alone; both
    forms are accepted.
    """
    result = env.reset()
    if isinstance(result, tuple) and len(result) == 2 and isinstance(result[1], dict):
        return result[0]
    return result


def _step_env(env, action):
    """Advance ``env`` by one step and return ``(observation, reward, done)``.

    Accepts both the 4-tuple step result (``obs, reward, done, info``) and
    the 5-tuple one (``obs, reward, terminated, truncated, info``).
    """
    result = env.step(action)
    if len(result) == 5:
        observation, reward, terminated, truncated, _info = result
        return observation, reward, terminated or truncated
    observation, reward, done, _info = result
    return observation, reward, done


def gradient_MC_prediction(env, policy, n_episodes, alpha, gamma, max_steps=200):
    """Gradient Monte Carlo prediction with a linear value function.

    For each episode the policy is rolled out, then the discounted return
    ``G`` of every visited state is accumulated backwards from the end of
    the episode and the weights are moved along
    ``alpha * (G - v_hat(state, w)) * state``.

    Parameters
    ----------
    env : Gym-style environment
        Must expose ``reset()``, ``step(action)`` and ``observation_space``.
    policy : callable
        Called with no arguments; returns the action to take.
    n_episodes : int
        Number of episodes to generate.
    alpha : float
        Step size of the weight update.
    gamma : float
        Discount factor applied to future rewards.
    max_steps : int, optional
        Maximum number of time steps per episode (default 200, the value
        that was previously hard-coded).

    Returns
    -------
    numpy.ndarray
        Weight vector with the same shape as one observation. It is a zero
        vector when ``n_episodes`` is 0.
    """
    # Start from an explicit zero vector (one weight per observation
    # component) rather than the scalar 0, so v_hat always yields a scalar
    # estimate and the return value is always an array.
    w = np.zeros(env.observation_space.shape, dtype=float)

    for _ in range(n_episodes):
        observation = _reset_env(env)
        state_reward_pairs = []

        # Generate one episode of at most `max_steps` time steps.
        for _ in range(max_steps):
            action = policy()
            old_observation = observation
            observation, reward, done = _step_env(env, action)
            # Pair each reward with the state it was received from.
            state_reward_pairs.append((old_observation, reward))
            if done:
                break

        # Walk the episode backwards so the return can be built
        # incrementally: G_t = r_t + gamma * G_{t+1}.
        G = 0
        for state, reward in reversed(state_reward_pairs):
            G = reward + gamma * G
            # v_hat is linear in w, so its gradient w.r.t. w is the state.
            gradient = state
            w = w + alpha * (G - v_hat(state, w)) * gradient
    return w

In [2]:
# Estimate the value-function weights of the random policy.
w = gradient_MC_prediction(
    env=env,
    policy=random_policy,
    n_episodes=1000,
    alpha=0.2,
    gamma=0.9,
)

In [3]:
# Display the weight vector returned by the run in the previous cell.
w

array([11.6523881 ,  9.80457032, -4.43589506,  1.29644459])

In [4]:
# Repeat the prediction run for several step sizes and print the weights
# each one ends up with.
alphas = [0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.8, 0.9]
for alpha in alphas:
    w = gradient_MC_prediction(
        env=env,
        policy=random_policy,
        n_episodes=1000,
        alpha=alpha,
        gamma=0.9,
    )
    print("alpha:", alpha, "w:", w)

alpha: 0.01 w: [-1.51909273 -2.98042069 -0.53084231 -1.49700245]
alpha: 0.1 w: [-8.82191202 -2.68125932 -2.17090172  2.36366772]
alpha: 0.2 w: [15.71433505  6.84398962 13.34459815 -2.32309068]
alpha: 0.3 w: [-7.02615394  2.86014471  1.99652091 -3.52621992]
alpha: 0.4 w: [24.75237855 -1.95356415 18.42406236  6.64008834]
alpha: 0.5 w: [1609.21580031 -724.30903172 1643.23988688 -465.68414919]
alpha: 0.6 w: [-5178.57756877  3061.33069874 -7456.08744277  -913.5801661 ]
alpha: 0.8 w: [ 1.25582232e+41 -4.01048849e+40  1.44559618e+41 -2.81099432e+40]
alpha: 0.9 w: [ 1.65721048e+17 -5.01969466e+16  1.58987919e+17 -4.46896342e+16]


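When alpha is 0.5 or larger, the weights become very large: the step size is too big for the stochastic gradient updates to remain stable, so the estimates diverge instead of converging.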