In [1]:
import gym
import numpy as np
import tensorflow as tf
from collections import deque, namedtuple
from tensorflow import keras
import matplotlib.pyplot as plt


In [2]:
class Model(keras.Model):
    def __init__(self, action_size) -> None:
        super(Model, self).__init__()
        self.layer1 = keras.layers.Dense(64, activation = 'relu', input_shape = (8,))
        self.layer2 = keras.layers.Dense(64, activation = 'relu')
        
        self.actor = keras.layers.Dense(action_size, activation = 'softmax')
        self.critic = keras.layers.Dense(1)
        
    def call(self,input):
        
        x = self.layer1(input)
        x = self.layer2(x)
        
        #Action Distribution
        actor = self.actor(x)
        
        #Value of rollout of rewards
        critic = self.critic(x)
        
        return actor, critic

In [3]:
# model = Model(4)

In [4]:
# model_t = Model(4)

In [5]:
# model.compile(optimizer=tf.keras.optimizers.Adam())

In [6]:
# model_t.compile(optimizer=tf.keras.optimizers.Adam())

In [7]:
# model.build((100,8))

In [8]:
# model_t.build((100,8))

In [9]:
# model.summary()
# 

In [10]:
# model_t.weights

In [11]:
# model.weights

In [12]:
class create_models_and_update:
    def __init__(self):    
        self.model = Model(4)

        self.model_t = Model(4)

        self.model.compile(optimizer=tf.keras.optimizers.Adam())

        self.model_t.compile(optimizer=tf.keras.optimizers.Adam())

        self.model.build((100,8))

        self.model_t.build((100,8))

        print("before:", self.model_t.trainable_weights)
        self.update_network_params()

        print("after:", self.model_t.trainable_weights)
    def update_network_params(self, tau = 1):
    
        #Updating actor
        target_weights = self.model_t.weights
        weights = []

        for i, weight in enumerate(self.model.weights):
            weights.append(weight * tau + target_weights[i] * (1-tau))
        self.model_t.set_weights(weights)
    
    

In [13]:
_ = create_models_and_update()

before: [<tf.Variable 'dense_4/kernel:0' shape=(8, 64) dtype=float32, numpy=
array([[ 5.30423224e-02,  6.42126203e-02,  8.58258903e-02,
         5.72972298e-02,  2.25334167e-01,  1.63631141e-01,
        -1.90453678e-01,  1.25313312e-01, -7.63839483e-03,
        -2.11769372e-01, -2.57478803e-01,  9.00007784e-02,
         9.98694003e-02,  1.84102058e-02, -1.72571853e-01,
         3.80605459e-04, -1.15194842e-01, -4.79197651e-02,
         1.62074924e-01,  2.50873864e-02,  4.19529378e-02,
         1.18102580e-01, -5.53461760e-02, -3.31768692e-02,
        -1.55468509e-01,  1.24839514e-01, -2.01780587e-01,
        -1.33213162e-01,  2.13398814e-01, -1.08785063e-01,
        -2.30025887e-01,  1.55207664e-01,  5.29847741e-02,
        -2.24587977e-01,  6.19491935e-03,  6.17728829e-02,
         1.97817683e-01, -2.37289250e-01, -2.86281109e-01,
        -2.08775580e-01, -1.93808168e-01, -1.32295370e-01,
         1.70763463e-01, -1.19133249e-01,  2.35864103e-01,
         2.28222251e-01, -2.73875087e-

2022-06-10 17:59:42.120778: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-10 17:59:42.135278: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-10 17:59:42.136051: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-10 17:59:42.137237: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags

In [None]:
_.mode

In [27]:
update_network_params(model, model_t)

In [28]:
model_t.weights

[<tf.Variable 'dense_4/kernel:0' shape=(8, 64) dtype=float32, numpy=
 array([[-1.78409368e-01,  1.74620658e-01, -1.37297124e-01,
         -2.32565060e-01,  1.80435389e-01, -1.61415175e-01,
         -7.25007206e-02, -1.99660122e-01, -4.25228179e-02,
         -4.47920561e-02, -3.50306034e-02,  8.38837922e-02,
          4.50654924e-02,  1.93145663e-01, -2.77315080e-02,
         -1.02682844e-01,  1.55910164e-01,  3.64013314e-02,
          6.69248998e-02, -1.79341435e-02, -5.54833412e-02,
         -9.81032848e-03,  1.08532816e-01,  2.71242321e-01,
          3.26378047e-02, -1.41293958e-01, -2.45975778e-01,
         -1.08875558e-01,  3.26019526e-03,  1.29049152e-01,
          5.10337055e-02,  2.53123999e-01,  2.08393276e-01,
         -6.62306547e-02, -1.94245577e-02, -1.37450680e-01,
          2.82563329e-01, -2.52420604e-01,  2.67282307e-01,
         -9.44787115e-02,  2.69042671e-01, -3.51796746e-02,
         -9.28390771e-02,  1.24048293e-01,  2.56098211e-01,
          6.75598085e-02, -1.13

In [31]:
model.weights.all() == model_t.weights.all()

AttributeError: 'list' object has no attribute 'all'

In [3]:
env = gym.make("LunarLander-v2", continuous = True)

In [4]:
env.action_space.sample()

array([0.1568443 , 0.95904315], dtype=float32)

In [8]:
env.action_space.high[0]

1.0

In [9]:
env.action_space.low[0]

-1.0

In [6]:
env.observation_space

Box(-inf, inf, (8,), float32)

In [8]:
env.action_space.shape[0]

2

In [7]:
for i in range(10):
    state = env.reset()
    while True:
        action = env.action_space.sample()
        env.render()
        state_p, reward, done, _ = env.step(action)
        if done:
            break

In [33]:
env.close()

In [54]:
space = env.reset()

In [55]:
space

array([ 0.00311222,  1.3986368 ,  0.3152082 , -0.54591626, -0.0035994 ,
       -0.07139947,  0.        ,  0.        ], dtype=float32)

In [56]:
state = tf.convert_to_tensor(space)

In [57]:
state = tf.convert_to_tensor(state)
state = tf.reshape(state, (1, len(state)))

action_probs, critic_val = model(state)

In [58]:
action_probs

<tf.Tensor: shape=(1, 4), dtype=float32, numpy=array([[0.31890666, 0.17685933, 0.24910223, 0.25513178]], dtype=float32)>

In [59]:
critic_val

<tf.Tensor: shape=(1, 1), dtype=float32, numpy=array([[0.25513676]], dtype=float32)>

In [60]:
action = np.random.choice(env.action_space.n, p = np.squeeze(action_probs))

In [61]:
action

1

In [45]:
np.squeeze(action_probs)

array([0.24676606, 0.25266385, 0.25108433, 0.24948576], dtype=float32)

In [46]:
env.action_space.n

2

In [63]:
action_probs_episodes = []
critic_val_episodes = []


In [70]:
episode_reward = 0

In [67]:
rewards_history = []

In [64]:
action_probs_episodes.append(tf.math.log(action_probs[0, action]))

In [65]:
state, reward, done, _ = env.step(action)

In [66]:
state

array([ 0.0061492 ,  1.385767  ,  0.30536947, -0.57200074, -0.00524055,
       -0.03282565,  0.        ,  0.        ], dtype=float32)

In [71]:
rewards_history.append(reward)
episode_reward += reward