actor_critic.py
#This Python file implements the Advantage Actor-Critic (A2C) approach to solve the CartPole-v1 task of the OpenAI Gym environment
#Importing all the required frameworks
import numpy as np
import tensorflow as tf
import keras
from keras.layers import Input, Dense
from keras.models import Model
import keras.backend as k
from keras.optimizers import Adam
import matplotlib.pyplot as plt
import random
import gym
#A function to keep track of the running average of rewards
def running_reward_avg(rewards):
    output=[]
    for i in range(len(rewards)):
        output.append(sum(rewards[:i+1])/(i+1))
    return output
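#Note (an equivalent vectorized form, not in the original script):
#np.cumsum(rewards)/np.arange(1,len(rewards)+1) computes the same running averages
#without re-summing the prefix on every iteration.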
#Initializing the Agent class which controls the agent's behaviour/improvement in the environment
class Agent(object):
    def __init__(self,env,gamma=0.98,alpha=0.01):
        self.env=env #The OpenAI Gym environment
        self.state_size=self.env.observation_space.shape[0] #Number of features describing each state in the environment
        self.action_size=self.env.action_space.n #Number of discrete actions available in the environment
        self.gamma=gamma #Discount factor for future rewards
        self.alpha=alpha #Learning rate during training
        self.n_hl1=16 #Number of units in the first hidden layer of the network
        self.n_hl2=16 #Number of units in the second hidden layer of the network
        self.actor,self.critic,self.policy=self.build_network() #Building the networks that share hidden layers but expose different output layers
        self.reward_history=[] #Reward history to keep track of rewards per episode
        self.episode_lengths=[] #To keep track of the length of each episode
    #Defining a function that builds the actor, critic, and policy networks used to train/improve the model
    def build_network(self):
        inputs=Input(shape=[self.state_size])
        advantage=Input(shape=[1])
        X=Dense(self.n_hl1,activation='relu')(inputs)
        X=Dense(self.n_hl2,activation='relu')(X)
        actor_layer=Dense(self.action_size,activation='softmax')(X)
        critic_layer=Dense(1)(X)
        #Defining a custom loss function (in the Keras format, with signature loss(y_true,y_pred)) for the policy gradient loss
        def custom_loss(y_true,y_pred):
            probs=k.clip(y_pred,1e-10,1-1e-10)
            log_likelihood=y_true*k.log(probs) #y_true is the one-hot encoded action, so only the taken action's log-probability survives
            return k.sum(-log_likelihood*advantage)
        #The actor network takes the state and the advantage, and is trained to raise the probability of advantageous actions (stochastic policy)
        actor_model=Model(inputs=[inputs,advantage],outputs=actor_layer)
        actor_model.compile(optimizer=Adam(learning_rate=self.alpha),loss=custom_loss)
        #The critic network takes the state, and estimates the value of that state
        critic_model=Model(inputs=inputs,outputs=critic_layer)
        critic_model.compile(optimizer=Adam(learning_rate=self.alpha),loss='mean_squared_error')
        #The policy network is like the actor network, but doesn't take the advantage input, and just predicts the action probabilities
        #Training is not done on this network, but on the actor network instead (the two share weights)
        policy_model=Model(inputs=inputs,outputs=actor_layer)
        return actor_model,critic_model,policy_model
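    #For reference, the objectives implemented above (standard A2C, stated here for clarity rather than taken from the original script):
    #the actor minimizes   L_actor  = -log pi(a_t | s_t) * A(s_t, a_t)
    #the critic minimizes  L_critic = (r_t + gamma * V(s_{t+1}) - V(s_t))^2
    #where the advantage A(s_t, a_t) = r_t + gamma * V(s_{t+1}) - V(s_t) is computed in update_network below.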
    #Choosing a greedy action most of the time, and a random action occasionally (exploration can be tuned by changing epsilon)
    def choose_action(self,state,epsilon=0.2):
        policy=self.policy.predict(state.reshape([1,self.state_size]))
        if np.random.rand(1)<epsilon:
            return np.random.randint(self.action_size)
        else:
            return np.argmax(policy[0])
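    #Note (an alternative, not in the original script): since the actor outputs a stochastic policy,
    #a common choice is to sample the action from those probabilities instead of acting epsilon-greedily, e.g.
    #    np.random.choice(self.action_size, p=policy[0])
    #so that exploration comes from the policy's own distribution.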
    #Updating the actor-critic networks after every timestep in an episode
    def update_network(self,state_now,action,reward,state_next,done=False):
        state_now=state_now.reshape([1,self.state_size])
        state_next=state_next.reshape([1,self.state_size])
        action=(np.eye(self.action_size)[action]).reshape([1,self.action_size]) #One-hot encoding of the chosen action
        state_now_value=self.critic.predict(state_now)
        state_next_value=self.critic.predict(state_next)
        target_value_now=reward+self.gamma*state_next_value*(1-int(done)) #One-step TD target (no bootstrapping past a terminal state)
        advantage=target_value_now-state_now_value #Advantage estimate for the chosen action
        self.actor.fit([state_now,advantage],action,verbose=0)
        self.critic.fit(state_now,target_value_now,verbose=0)
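    #Note (an optimization suggestion, not in the original script): calling predict/fit once per environment
    #step keeps the code simple but is slow in Keras; collecting transitions and updating with train_on_batch
    #on the actor and critic performs the same updates with less per-call overhead.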
    #Training the agent over a number of episodes, so it learns a policy that claims maximum reward
    def train(self,num_episodes=5000):
        #Iterating over 'num_episodes'
        for i in range(num_episodes):
            #Maintaining a buffer to store the reward obtained throughout each episode
            reward_buffer=0
            #Maintaining the count of the length of the episode
            j=0
            #Resetting the starting state of each episode
            state_now=self.env.reset()
            while True:
                #Choosing an action as per the policy, going to the next state, and claiming the reward
                action=self.choose_action(state_now)
                state_next,reward,done,_=self.env.step(action)
                #Incrementing the reward buffer, and the step count of the episode
                reward_buffer+=reward
                j+=1
                #Updating the networks
                self.update_network(state_now,action,reward,state_next,done)
                #Checking if the episode has terminated
                if done:
                    self.reward_history.append(reward_buffer)
                    self.episode_lengths.append(j)
                    if (i+1)%100==0:
                        print("Length of Episode {} : {}".format(i+1,self.episode_lengths[-1]))
                        print("Total reward claimed by the agent in episode {} : {}".format(i+1,self.reward_history[-1]))
                    break
                else:
                    #Otherwise moving on to the next timestep's state
                    state_now=state_next
#Creating an environment, and an agent
env=gym.make("CartPole-v1")
agent=Agent(env)
#Training the agent using the Advantage Actor-Critic updates with the above mentioned parameters
agent.train()
#Plotting the results
fig, axs = plt.subplots(1,2)
axs[0].plot(running_reward_avg(agent.reward_history))
axs[0].set_title('Running Average Reward per Episode')
axs[1].plot(agent.episode_lengths, 'tab:orange')
axs[1].set_title('Episode Length')
plt.show()
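#Optional sanity check (a sketch, not part of the original script): roll out the trained policy greedily
#for a few episodes and print the total reward, assuming the same gym reset/step API used above
for _ in range(5):
    state=env.reset()
    total_reward=0
    while True:
        probs=agent.policy.predict(state.reshape([1,agent.state_size]))
        state,reward,done,_=env.step(np.argmax(probs[0]))
        total_reward+=reward
        if done:
            print("Evaluation episode reward:",total_reward)
            break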