In [1]:
import random
import gym
import numpy as np
import tensorflow as tf

In [2]:
env = gym.make('CartPole-v1')

In [3]:
print("number of actions = {}".format(env.action_space.n))
print("state dimention = {}".format(env.observation_space.shape[0]))
nActions = env.action_space.n
stateSize = env.observation_space.shape[0]

number of actions = 2
state dimention = 4


# Linear Model

In [4]:
def run_episode(env, parameters):
    done = False
    state = env.reset()
    state.reshape([1, stateSize])
    total_rewards = 0
    time_step = 0
    while not done and time_step < 200:
#         action = np.random.randint(0, nActions)
        action = 1
        if np.matmul(state, parameters) < 0:
            action = 0
#         env.render()
        time_step += 1
        next_state, reward, done, info = env.step(action)
        total_rewards += reward
        state = np.reshape(next_state, [1, stateSize])
    return total_rewards, time_step
#     print "episode {}".format(ep)
#     print "\t total reward = {}".format(total_rewards)
#     print "\t time step = {}".format(time_step)

In [6]:
best_params = None
best_reward = 0
for episode in range(1000):
    params = np.random.rand(stateSize, 1) * 2 - 1
    G, T = run_episode(env, params)
    print(episode, G)
    if G > best_reward:
        best_reward = G
        best_params = params
        if G >= 200:
            break

0 88.0
1 25.0
2 10.0
3 43.0
4 9.0
5 11.0
6 71.0
7 10.0
8 25.0
9 78.0
10 8.0
11 200.0


## Hill Climbing

In [8]:
noise_scaling = .1
params = np.random.rand(4, 1) * 2 - 1
best_reward = 0
for episode in range(1000):
    new_params = params + (np.random.rand(4, 1) * 2 - 1) * noise_scaling
    G, T = run_episode(env, new_params)
    print(episode, G)
    if G > best_reward:
        best_reward = G
        params = new_params
        if G >= 200:
            break

0 53.0
1 68.0
2 92.0
3 72.0
4 47.0
5 54.0
6 65.0
7 50.0
8 38.0
9 49.0
10 38.0
11 42.0
12 58.0
13 75.0
14 38.0
15 48.0
16 44.0
17 48.0
18 34.0
19 50.0
20 80.0
21 36.0
22 106.0
23 56.0
24 56.0
25 65.0
26 57.0
27 46.0
28 38.0
29 162.0
30 44.0
31 46.0
32 40.0
33 169.0
34 48.0
35 88.0
36 45.0
37 102.0
38 48.0
39 77.0
40 35.0
41 75.0
42 77.0
43 43.0
44 58.0
45 56.0
46 65.0
47 122.0
48 108.0
49 45.0
50 58.0
51 62.0
52 66.0
53 57.0
54 51.0
55 94.0
56 77.0
57 72.0
58 90.0
59 50.0
60 89.0
61 53.0
62 48.0
63 93.0
64 38.0
65 42.0
66 48.0
67 49.0
68 73.0
69 131.0
70 69.0
71 56.0
72 145.0
73 39.0
74 89.0
75 37.0
76 40.0
77 55.0
78 50.0
79 106.0
80 70.0
81 118.0
82 70.0
83 46.0
84 49.0
85 60.0
86 175.0
87 68.0
88 70.0
89 39.0
90 42.0
91 39.0
92 40.0
93 70.0
94 108.0
95 93.0
96 40.0
97 67.0
98 63.0
99 74.0
100 62.0
101 35.0
102 37.0
103 67.0
104 49.0
105 70.0
106 127.0
107 100.0
108 117.0
109 66.0
110 44.0
111 65.0
112 42.0
113 51.0
114 73.0
115 70.0
116 85.0
117 55.0
118 48.0
119 42.0
120 59.0
121 68

In [10]:
for episode in range(20):
    G, T = run_episode(env, params)
    print(episode, G)

0 51.0
1 47.0
2 75.0
3 67.0
4 66.0
5 99.0
6 85.0
7 61.0
8 91.0
9 48.0
10 112.0
11 50.0
12 49.0
13 124.0
14 73.0
15 60.0
16 87.0
17 100.0
18 112.0
19 49.0


# Policy Gradient

In [15]:
def policy_gradient():
    params = tf.get_variable('policy_params', [stateSize, nActions])
    state = tf.placeholder('float', [None, stateSize])
    linear = tf.matmul(state, params)
    probs = tf.nn.softmax(linear)
    actions = tf.placeholder('float', [None, nActions]) # one-hot encoding for actions
    probs = tf.multiply(probs, actions)
    probs = tf.reduce_sum(probs, reduction_indices=[1])
    log_probs = tf.log(probs)
    loss = -tf.reduce_sum(log_probs)
    optimizer = tf.train.AdamOptimizer(.01).minimize(loss)

In [16]:
def value_gradient():
    state = tf.placeholder('float', [None, stateSize])
    w1 = tf.get_variable('w1', [stateSize, 10])
    b1 = tf.get_variable('b1', [10])
    h1 = tf.nn.relu(tf.matmul(state, w1) + b1)
    w2 = tf.get_variable('w2', [10, 1])
    b2 = tf.get_variable('b2', [1])
    pred = tf.matmul(h1, w2) + b2
    vals = tf.placeholder('float', [None, 1])
    deltas = pred - vals
    loss = tf.nn.l2_loss(deltas)
    optimizer = tf.train.AdadeltaOptimizer(.1).minimize(loss)

In [19]:
pp, ps = policy_gradient()
state = env.reset()
actions = []
for t in range(200):
    state = np.expand_dims(state, axis=0)
    probs = sess.run(pp, feeddict={ps:state})
    action = 0 if random.rand() < probs[0][0] else 1
    next_state, reward, done, info = env.step(action)
    total_reward += reward
    if done:
        break

TypeError: 'NoneType' object is not iterable

In [20]:
def policy_gradient():  
    params = tf.get_variable("policy_parameters",[4,2])
    state = tf.placeholder("float",[None,4])
    actions = tf.placeholder("float",[None,2])
    advantages = tf.placeholder("float",[None,1])
    linear = tf.matmul(state,params)
    probabilities = tf.nn.softmax(linear)
    good_probabilities = tf.reduce_sum(tf.multiply(probabilities, actions),reduction_indices=[1])
    # maximize the log probability
    log_probabilities = tf.log(good_probabilities)
    # insert the elementwise multiplication by advantages
    eligibility = log_probabilities * advantages
    loss = -tf.reduce_sum(eligibility)
    optimizer = tf.train.AdamOptimizer(0.01).minimize(loss)

In [21]:
pl_probabilities, pl_state = policy_gradient()  


ValueError: Variable policy_parameters already exists, disallowed. Did you mean to set reuse=True or reuse=tf.AUTO_REUSE in VarScope? Originally defined at:

  File "<ipython-input-18-9498856638f2>", line 2, in policy_gradient
    params = tf.get_variable("policy_parameters",[4,2])
  File "<ipython-input-19-930d0e8d230a>", line 1, in <module>
    pp, ps = policy_gradient()
  File "/home/wangxing/.local/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3265, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
