In [None]:
import gym
import tensorflow as tf
from keras import Model,Input
from keras.layers import Dense


In [3]:
env=gym.make('CartPole-v1')

In [4]:
from keras.losses import MeanSquaredError
net_input=Input(shape=(4,))
x=Dense(128,activation='relu')(net_input)
x=Dense(128,activation='relu')(x)
output=Dense(2,activation='linear')(x)
q_net=Model(inputs=net_input,outputs=output)
q_net.compile(optimizer='adam')
loss_fun=MeanSquaredError()

In [5]:
from keras.models import clone_model
target_net=clone_model(q_net)

In [6]:
EPSILON=1.0
EPSILON_DECAY=1.005
GAMMA=0.99
NUM_EPISODES=100
MAX_TRANSITIONS=100000
REPLAY_BUFFER=[]
def insert_transition(transition):
  if len(REPLAY_BUFFER)>=MAX_TRANSITIONS:
    REPLAY_BUFFER.pop(0)
  REPLAY_BUFFER.append(transition)

def sample_transitions(batch_size=16):
    random_indices = tf.random.uniform(shape=(batch_size,), minval=0, maxval=len(REPLAY_BUFFER), dtype=tf.int32)

    sampled_current_states = []
    sampled_actions = []
    sampled_rewards = []
    sampled_next_states = []
    sampled_terminals = []

    for index in random_indices:
        # Squeeze the extra dimension from state tensors before appending
        sampled_current_states.append(tf.squeeze(REPLAY_BUFFER[index][0], axis=0))
        sampled_actions.append(REPLAY_BUFFER[index][1])
        sampled_rewards.append(REPLAY_BUFFER[index][2])
        # Squeeze the extra dimension from next_state tensors before appending
        sampled_next_states.append(tf.squeeze(REPLAY_BUFFER[index][3], axis=0))
        sampled_terminals.append(REPLAY_BUFFER[index][4])

    # Convert the lists of correctly shaped tensors to tensors
    return tf.convert_to_tensor(sampled_current_states), tf.convert_to_tensor(sampled_actions), tf.convert_to_tensor(sampled_rewards), tf.convert_to_tensor(sampled_next_states), tf.convert_to_tensor(sampled_terminals)


In [7]:
def policy(state, explore=0.0):
    action = tf.argmax(q_net(state)[0], output_type=tf.int32)
    if tf.random.uniform(shape=(), maxval=1) <= explore:
        # Change maxval to 2 so random actions are 0 or 1
        action = tf.random.uniform(shape=(), minval=0, maxval=2, dtype=tf.int32)
    return action

In [33]:
BATCH_SIZE=64
step_counter=0
TARGET_UPDATE_AFTER=4
def calculate_reward(state):
    reward = -1.0

    # The condition for reward is likely incorrect, as state[3] will likely never be exactly 0.525.
    # A small tolerance should be used instead.
    # Based on typical CartPole environments, a reward of 1.0 is usually given for each step the pole is upright.
    # Assuming the goal is to keep the pole upright, the reward should be 1.0 per step until the episode ends.
    # If the goal is to reach a specific state, the condition below might be relevant, but it's highly unlikely to be met exactly.
    # Let's assume for now that the intent was to check if the pole is "upright" (within a certain angle) and the cart is within bounds.
    # A more standard CartPole reward is +1 for every step. Let's use that for simplicity.
    # If you intended a sparse reward based on the state, you'll need to adjust the condition with tolerances.

    # Standard CartPole reward: +1 for each step
    reward = 1.0
    return reward # Add this line to return the calculated reward


for episode in range(NUM_EPISODES):
    done = False
    total_rewards=0
    episode_length=0
    # Initial state is already correctly shaped (1, 4)
    state = tf.convert_to_tensor([env.reset()])
    while not done:
      # policy expects state with batch dimension
      action=policy(state,EPSILON)
      next_state_numpy,_,done,_=env.step(action.numpy())
      # Pass the next_state numpy array to calculate_reward
      reward=calculate_reward(next_state_numpy)
      # Convert next_state numpy array to tensor and add batch dimension
      next_state_tensor = tf.convert_to_tensor([next_state_numpy], dtype=tf.float32) # Specify dtype for consistency

      # The following lines were causing the indentation error
      # They are now correctly indented within the while loop
      # insert_transition expects state, action, reward, next_state, done
      # Make sure to pass the correctly shaped tensors to insert_transition
      insert_transition([state,action,reward,next_state_tensor,done])

      # Update state with the new, correctly shaped tensor
      state=next_state_tensor
      step_counter += 1
      # Ensure replay buffer has enough transitions before sampling
      if len(REPLAY_BUFFER) >= BATCH_SIZE:
          # sample_transitions should return correctly shaped tensors
          current_states,actions,rewards,next_states,terminals=sample_transitions(BATCH_SIZE)
          next_action_values=tf.reduce_max(target_net(next_states),axis=1)
          targets=tf.where(terminals,rewards,rewards+GAMMA*next_action_values)
          with tf.GradientTape() as tape:
            preds = q_net(current_states)
            batch_nums = tf.range(0, limit=BATCH_SIZE)
            # actions here should be a tensor of shape (BATCH_SIZE,) for tf.stack
            indices = tf.stack((batch_nums, actions), axis=1)
            current_values = tf.gather_nd(preds, indices)
            # Use the loss function object defined earlier
            loss = loss_fun(targets, current_values)
          grads = tape.gradient(loss, q_net.trainable_weights)
          q_net.optimizer.apply_gradients(zip(grads, q_net.trainable_weights))

          if step_counter % TARGET_UPDATE_AFTER == 0:
              target_net.set_weights(q_net.get_weights())

      # These lines also needed correct indentation within the while loop
      total_rewards+=reward
      episode_length+=1
      print("episode",episode,"episode_length",episode_length,"total+rewards",total_rewards,"EPSILON",EPSILON)

    EPSILON /= EPSILON_DECAY # This was outside the loop before, moved inside.
env.close()



episode 0 episode_length 1 total+rewards 1.0 EPSILON 1.0
episode 0 episode_length 2 total+rewards 2.0 EPSILON 1.0
episode 0 episode_length 3 total+rewards 3.0 EPSILON 1.0
episode 0 episode_length 4 total+rewards 4.0 EPSILON 1.0
episode 0 episode_length 5 total+rewards 5.0 EPSILON 1.0
episode 0 episode_length 6 total+rewards 6.0 EPSILON 1.0
episode 0 episode_length 7 total+rewards 7.0 EPSILON 1.0
episode 0 episode_length 8 total+rewards 8.0 EPSILON 1.0
episode 0 episode_length 9 total+rewards 9.0 EPSILON 1.0
episode 0 episode_length 10 total+rewards 10.0 EPSILON 1.0
episode 0 episode_length 11 total+rewards 11.0 EPSILON 1.0
episode 0 episode_length 12 total+rewards 12.0 EPSILON 1.0
episode 1 episode_length 1 total+rewards 1.0 EPSILON 0.9950248756218907
episode 1 episode_length 2 total+rewards 2.0 EPSILON 0.9950248756218907
episode 1 episode_length 3 total+rewards 3.0 EPSILON 0.9950248756218907
episode 1 episode_length 4 total+rewards 4.0 EPSILON 0.9950248756218907
episode 1 episode_leng