In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.autograd
import gym
import numpy as np
import torch.optim as optim
from torch.autograd import Variable
import random
from collections import deque
import sys
import matplotlib.pyplot as plt
import copy

In [2]:
class Actor(nn.Module):
    
    def __init__(self, state_dim, action_dim, max_action):
        super(Actor, self).__init__()

        self.l1 = nn.Linear(state_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, action_dim)

        self.max_action = max_action


    def forward(self, x):
#         x = x.view(x.size(0), -1)
        x = F.relu(self.l1(x))
        x = F.relu(self.l2(x))
        x = self.max_action * torch.tanh(self.l3(x)) 
        return x
    
class Critic(nn.Module):
    def __init__(self, state_dim, action_dim):
        super(Critic, self).__init__()

        # Q1 architecture
        self.l1 = nn.Linear(state_dim + action_dim, 400)
        self.l2 = nn.Linear(400, 300)
        self.l3 = nn.Linear(300, 1)

        # Q2 architecture
        self.l4 = nn.Linear(state_dim + action_dim, 400)
        self.l5 = nn.Linear(400, 300)
        self.l6 = nn.Linear(300, 1)


    def forward(self, x, u):
        xu = torch.cat([x, u], 1)

        x1 = F.relu(self.l1(xu))
        x1 = F.relu(self.l2(x1))
        x1 = self.l3(x1)

        x2 = F.relu(self.l4(xu))
        x2 = F.relu(self.l5(x2))
        x2 = self.l6(x2)
        return x1, x2


    def Q1(self, x, u):
        xu = torch.cat([x, u], 1)
        xu = xu.view(xu.size(0), -1)
        x1 = F.relu(self.l1(xu))
        x1 = F.relu(self.l2(x1))
        x1 = self.l3(x1)
        return x1



In [3]:
class Buffer:
    def __init__(self):
        self.Memory = []
        
    def add(self, state, action, reward, next_state, done):
        step = (state, action, reward, next_state, done)
        self.Memory.append(step)
    
    def get_mini_batch(self, batch_size):
        states = []
        actions = []
        rewards = []
        next_states = []
        dones = []
        
        batch = random.sample(self.Memory, batch_size)
        for state, action, reward, next_state, done in batch:
            states.append(state)
            actions.append(action)
            rewards.append(reward)
            next_states.append(next_state)
            dones.append(done)
        
        return states, actions, rewards, next_states, dones
    
    def get_size(self):
        return len(self.Memory)

In [None]:
class Agent:
    def __init__(self, env, max_action, actor_alpha = 1e-3, critic_alpha = 1e-3 ,gamma = 0.99, tau = 0.005, noise_c = 1.0, d = 2, sigma = 1, batch_size = 100):
        self.env = env
        self.batch_size = batch_size
        self.noise_c = noise_c
        self.actor_alpha = actor_alpha
        self.critic_alpha = critic_alpha
        self.max_action = max_action
        self.d = d
        self.tau = tau
        self.gamma = gamma 
        self.sigma = sigma
        self.current_state = None
        
        #Actor Network
        self.actor = Actor(env.observation_space.shape[0], env.action_space.shape[0], self.max_action)
        self.actor_target = copy.deepcopy(self.actor)
        self.act_opt = optim.Adam(self.actor.parameters(), lr=actor_alpha)
        
        #Critic Network
        self.critic = Critic(env.observation_space.shape[0], env.action_space.shape[0])
        self.critic_target = copy.deepcopy(self.critic)
        self.crit_opt = optim.Adam(self.critic.parameters(), lr=critic_alpha)
        
        #Relay Buffer
        self.replay_buffer = Buffer()
        
    def step(self, time_step):
        action = self.actor.forward(torch.FloatTensor(self.current_state))
        action = (action.detach().numpy().flatten() + np.random.normal(0, self.sigma, self.env.action_space.shape[0]))
        action.clip(self.env.action_space.low, self.env.action_space.high)
        next_state, reward, done, _ = env.step(action)
        print(reward)
        self.replay_buffer.add(self.current_state, action, reward, next_state, done)
        if self.replay_buffer.get_size() > self.batch_size:
            self.update(time_step)
        self.current_state = next_state
        if done is True:
            done, state = False, self.env.reset()
            return reward, True
        return reward, done
   

    def update(self, time_step):
        states, actions, rewards, next_states, dones = self.replay_buffer.get_mini_batch(self.batch_size) 
        
        states = torch.FloatTensor(states)
        actions = torch.FloatTensor(actions)
        next_states = torch.FloatTensor(next_states)
        rewards = torch.FloatTensor(rewards)
        dones = torch.FloatTensor(1-np.array(dones))
        next_actions = self.actor_target.forward(next_states)
        #print(next_actions.detach().numpy().shape)
        
        noise = torch.FloatTensor(actions).data.normal_(0, 1) #tensor of size BATCH_SIZE x ACTION_SIZE
        noise = noise.clamp(-self.noise_c, self.noise_c)
#         print(type(noise), noise.shape)
        next_actions = (next_actions + noise).clamp(-self.max_action, self.max_action)
        #print(next_states.shape, next_actions.shape)
        #update critic
        q_1, q_2 = self.critic_target.forward(next_states, next_actions)
        y = rewards + (self.gamma * torch.min(q_1, q_2)).detach()
        
        current_q1, current_q2 = self.critic.forward(states, actions)
        loss_c = F.mse_loss(y, current_q1) + F.mse_loss(y, current_q2)
        
        self.crit_opt.zero_grad()
        loss_c.backward()
        self.crit_opt.step()
        
        if time_step % self.d == 0:
            #update actor
            action = self.actor.forward(states)
            loss_a = -self.critic.Q1(states, action).mean() #compute loss
            self.act_opt.zero_grad()
            loss_a.backward()
            
            self.act_opt.step()
            self.target_update(self.critic, self.critic_target)
            self.target_update(self.actor, self.actor_target)

    def target_update(self, model, model_target):
        for theta, theta_ in zip(model.parameters(), model_target.parameters()):
            update = self.tau * theta + (1 - self.tau)*theta_
            theta_.data.copy_(update)
            
    def run_episode(self):
        self.current_state = env.reset()
        episode_done = False
        rewards = []
        timesteps = 0
        for timesteps in range(10000):
            reward, episode_done = self.step(timesteps)
            self.env.render() #start the episode
            rewards.append(reward)
            if episode_done is True:
                break

        return rewards
        
        

In [5]:
env = gym.make('BipedalWalker-v2')
current_state = env.reset()
env.observation_space.shape[0] + env.action_space.shape[0]

28

In [6]:
rewards = []
avg_rewards = []
max_action = float(env.action_space.high[0])
BATCH_SIZE = 100
GAMMA = 0.99
TAU = 0.005
NOISE = 0.2
NOISE_CLIP = 0.5
EXPLORE_NOISE = 0.1
rewards = []
env.observation_space.shape[0]
agent = Agent(env, max_action)
for episode in range(50):
    
    episode_rewards = agent.run_episode()
    
    print('Episode '+str(episode) + " avg reward "+ str(np.sum(episode_rewards)))

plt.plot(rewards)
plt.plot(avg_rewards)
plt.plot()
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.show()

-0.2297622591540184
-0.2740298466331524
-0.20103112808073167
-0.18026637118298885
-0.25303377640591096
-0.1374244294896588
-0.05732720965938798
0.0266958860616943
-0.10061432305354355
-0.060447835267726804
0.034651671660803275
-0.19901980878408349
-0.15298361850232217
-0.2888054114813127
-0.2693366413117559
-0.1663716454864888
-0.364226587600416
0.08733632766086749
0.11088830858228507
0.02783165714032662
0.16254986893687287
-0.02823024888445984
0.0647227451479658
0.06982192514754088
0.17047246922130382
0.1288603573139789
0.07261433144987531
-0.2058313108672682
-0.21309257149962268
-0.21685234404237638
-0.2246659401388693
-0.245511359790501
-0.3107298872033085
-0.31279580873040636
-0.2610969136390081
-0.2903402178552307
-0.19772248837819995
-0.1313121311181995
-0.12869555438458216
-0.1177618449568942
-0.177045486826978
-0.2037727174163357
-0.13840896608860082
-0.1411371065308806
-0.08720699058698558
-0.03078910019529176
-0.05273963251347825
-0.03273653781921409
-0.07678804356625601
-0.2



-0.0853865484382535
-0.2039717715505403
-0.17701176114794911
-0.12075855483262338
-0.04342754107815362
-0.010502061648142096
-0.08804079042117877
-0.2568275584893148
-0.142024689560522
-0.10326982083024049
-0.0712046378191995
-0.12968547133715952
-0.129992999759863
-0.14505752344503847
-0.05062956284540988
-0.13584504743695142
-0.08357403295818366
-0.07066429192325382
-0.13466123604911212
-0.0954577992529255
0.04256387839344347
-0.046066403212364514
0.0033144281385271354
-0.0056535096358014715
0.07222551645337202
0.0014538204660341468
0.09065544268054343
0.19070885263198487
0.1297851979036688
0.06369407245249924
0.0959495645874816
0.0650649424688772
0.10288456283120351
0.09639248597621919
0.10801254570331825
0.18092900847446236
0.10420167024778465
-0.1471258197290638
0.042614648917953274
0.07675394486682229
-0.06886937471942209
0.034704770184037476
0.08363922906119016
-0.05064506691437247
-0.0012859377825423027
-0.06693160480143237
-0.32204131125479535
-0.05529429582372083
-0.108446861

0.12798271537274822
0.040070862592996116
-0.08281194967080462
-0.2502412185490188
-0.2650647986316705
-0.25423758028188215
-0.23199729654356233
Episode 4 avg reward -6.5633545015553185
-0.13152942754728317
-0.1536866308934753
-0.10716831341051358
-0.1506156375408475
-0.24377919735594514
-0.29220991077296116
-0.3629989539782219
-0.33507793275244124
-0.31811491656450885
-0.2267650565258425
-0.28071989676694054
-0.2580209117450131
-0.3214756082405587
-0.1090334377992828
-0.09396737312647085
-0.08863828423123896
-0.12433386993408202
-0.07290952454089987
-0.05997757082687066
-0.03560641856327785
0.07752116839538219
0.16250892720942522
0.13600951639103145
0.17862412182489792
0.22283518955273132
0.24606893889880466
0.27832148867123424
0.1370352745944083
-0.3360225407629967
-0.4161930218656578
-0.4183003992040977
-0.3314633536175324
-0.18694860883998418
-0.20232739651828924
-0.3093846687404416
-0.29293573355086105
-0.282817272240683
-0.47188396406173716
-0.4416681528526361
-0.4571973370979417


-0.10459906782016842
-0.21858963092266212
-0.2365423255624059
-0.25224915912394363
-0.15948412681328306
-0.26241794193026563
-0.25491213949521263
-0.08969879406509082
-0.11151492380657459
-100
Episode 9 avg reward -99.90438471663836
-0.15002271647479296
-0.158302053207706
-0.1253735021299172
-0.16554008990416674
-0.15540752462546148
-0.11186853756870747
-0.09286808134447017


KeyboardInterrupt: 

In [None]:
env.close()