<a href="https://colab.research.google.com/github/Varunsaistark/reinforcment_learning/blob/main/Td3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch 
import torch.nn as nn
import gym
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
from collections import namedtuple
import gym
import random
import matplotlib.pyplot as plt

In [2]:
# Compute device shared by every network and tensor in this notebook:
# CUDA when PyTorch reports it as available, otherwise the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

In [3]:
# One environment transition. Note the field order: next_state comes
# before action, so positional construction must follow this order.
Experience = namedtuple(
  'Experience',
  ('state','next_state','action','reward','done'),
)

In [4]:
class replaybuffer():
  """Fixed-capacity ring buffer of transitions with uniform random sampling.

  Items are appended until `capacity` is reached; after that each push
  overwrites the slot at `count % capacity`.

  Attributes:
    memory: list holding the stored items (at most `capacity` of them).
    capacity: maximum number of items kept.
    count: total number of pushes so far, including overwritten items.
  """
  def __init__(self,capacity):
    """Create an empty buffer.

    Raises:
      ValueError: if `capacity` is not positive (push would otherwise fail
        later with a ZeroDivisionError on the modulo).
    """
    if capacity <= 0:
      raise ValueError('capacity must be a positive integer')
    self.memory = []
    self.capacity = capacity
    self.count = 0

  def __len__(self):
    """Number of items currently stored (never more than `capacity`)."""
    return len(self.memory)

  def push(self,x):
    """Store `x`, overwriting the oldest slot once the buffer is full."""
    if self.capacity>self.count:
      self.memory.append(x)
    else:
      self.memory[self.count%self.capacity] = x
    self.count+=1

  def sample(self,batch_size):
    """Draw `batch_size` distinct stored items uniformly at random.

    Each stored item must expose `state`, `action`, `next_state`, `reward`
    and `done` attributes.

    Returns:
      (states, actions, next_states, rewards, dones), all moved to `device`.
      states/actions/next_states are cast to float32 and dones to int32;
      rewards keep whatever dtype torch infers from the stored values.

    Raises:
      ValueError: from `random.sample` if fewer than `batch_size` items are
        stored; guard the call with `is_available`.
    """
    batch = random.sample(self.memory,batch_size)
    # Stack through NumPy first: building a tensor directly from a Python
    # list of ndarrays goes through PyTorch's slow element-wise path.
    states = torch.as_tensor(np.array([t.state for t in batch])).to(device)
    actions = torch.as_tensor(np.array([t.action for t in batch])).to(device)
    next_states = torch.as_tensor(np.array([t.next_state for t in batch])).to(device)
    rewards = torch.tensor([t.reward for t in batch]).to(device)
    dones = torch.tensor([t.done for t in batch]).to(device)

    return states.float(),actions.float(),next_states.float(),rewards,dones.int()

  def is_available(self,batch_size):
    """Return True when at least `batch_size` items are actually stored.

    Compares against the number of stored items rather than the lifetime
    push counter, so a request larger than the buffer is never reported as
    available after the ring has wrapped around.
    """
    return batch_size<=len(self.memory)

In [5]:
class Critic(nn.Module):
  """Q-network: maps a (state, action) batch to one scalar value per row.

  Architecture: Linear(input_dims + num_actions, fc1) -> ReLU ->
  Linear(fc1, fc2) -> ReLU -> Linear(fc2, 1).

  Args:
    lr1: learning rate of the Adam optimiser owned by this network.
    input_dims: number of state features.
    fc1, fc2: widths of the two hidden layers.
    num_actions: number of action features.
    chkpt_dir: file path used by save_checkpoints / load_ck.
  """
  def __init__(self,lr1,input_dims,fc1,fc2,num_actions,chkpt_dir = ''):
    super(Critic,self).__init__()
    self.input_dims = input_dims
    self.ck = chkpt_dir
    self.num_actions = num_actions
    # State and action are concatenated on the feature axis in forward().
    self.fc1 = nn.Linear(self.input_dims+self.num_actions,fc1)
    self.fc2 = nn.Linear(fc1,fc2)
    self.fc3 = nn.Linear(fc2,1)
    self.optimiser = optim.Adam(self.parameters(),lr= lr1)
    self.to(device)

  def forward(self,state,action):
    """Return Q(state, action); inputs are 2-D and joined along dim 1."""
    out = F.relu(self.fc1(torch.cat([state,action], dim=1)))
    out = F.relu(self.fc2(out))
    return self.fc3(out)

  def save_checkpoints(self):
    """Write this network's state_dict to the path given at construction."""
    print('saving checkpoints ....')
    torch.save(self.state_dict(),self.ck)
    print('checkpoint saved .. ')

  def load_ck(self):
    """Load the state_dict from the checkpoint path into this network.

    Tensors are mapped onto the device this module currently lives on, so a
    checkpoint written on a GPU runtime can be restored on a CPU-only one.
    """
    target_device = next(self.parameters()).device
    self.load_state_dict(torch.load(self.ck,map_location=target_device))
    print('ck loaded..')

In [6]:
class Actor(nn.Module):
  """Deterministic policy network: maps a state to a tanh-squashed action.

  Architecture: Linear(input_dims, fc1) -> ReLU -> Linear(fc1, fc2) -> ReLU
  -> Linear(fc2, num_actions) -> tanh, so every output lies in [-1, 1].

  Args:
    fc1, fc2: widths of the two hidden layers.
    lr2: learning rate of the Adam optimiser owned by this network.
    ck_dir: file path used by save_checkpoints / load_ck.
    num_actions: number of action features produced.
    input_dims: number of state features.
  """
  def __init__(self,fc1,lr2,fc2,ck_dir,num_actions,input_dims):
    super(Actor,self).__init__()
    self.i = input_dims
    self.dir = ck_dir
    self.num_actions = num_actions
    self.fc1 = nn.Linear(self.i,fc1)
    self.fc2 = nn.Linear(fc1,fc2)
    self.fc3 = nn.Linear(fc2,self.num_actions)
    self.optimiser = optim.Adam(self.parameters(),lr = lr2)
    self.to(device)

  def forward(self,state):
    """Return the policy output for `state`, squashed into [-1, 1] by tanh."""
    out = F.relu(self.fc1(state))
    out = F.relu(self.fc2(out))
    return torch.tanh(self.fc3(out))

  def save_checkpoints(self):
    """Write this network's state_dict to the path given at construction."""
    print('saving checkpoints ....')
    torch.save(self.state_dict(),self.dir)
    print('checkpoint saved .. ')

  def load_ck(self):
    """Load the state_dict from the checkpoint path into this network.

    Tensors are mapped onto the device this module currently lives on, so a
    checkpoint written on a GPU runtime can be restored on a CPU-only one.
    """
    target_device = next(self.parameters()).device
    self.load_state_dict(torch.load(self.dir,map_location=target_device))
    print('ck loaded..')


In [31]:
class Agent():
  """TD3-style agent: one actor, twin critics, and a target copy of each.

  Args:
    lr1: learning rate for both critics (and their target copies).
    lr2: learning rate for the actor (and its target copy).
    input_dims: number of state features.
    tau: soft-update coefficient used after each actor update.
    env: environment; only `env.action_space.high` / `.low` are read.
    gamma: discount factor.
    update_actor_interval: the actor and the target networks are updated
      once every this many critic updates.
    warmup: number of initial `choose_action` calls that ignore the actor.
    n_actions: number of action features.
    max_size: replay buffer capacity.
    fc1, fc2: hidden layer widths for all networks.
    batch_size: number of transitions per learning step.
    noise: standard deviation of the Gaussian exploration noise.
    chkpt_dir: directory prefix for the six checkpoint files.
  """
  def __init__(self,lr1,lr2,input_dims,tau,env,
               gamma = 0.99,update_actor_interval =2,
               warmup = 1000,n_actions =1,max_size = 10000,
               fc1 =400,fc2 =300,batch_size = 64,noise =0.1,
               chkpt_dir = '/content/drive/MyDrive/chrome_dino'):
    self.gamma = torch.tensor(gamma).to(device)
    self.tau =tau
    self.max_action = env.action_space.high
    self.min_action = env.action_space.low
    self.memory = replaybuffer(max_size)
    self.batch_size = batch_size
    self.learn_step = 0
    self.time_step = 0
    self.warmup = warmup
    self.num_actions = n_actions
    self.updateinterval = update_actor_interval
    self.file0 = chkpt_dir + '/lander_actortd3'
    self.file1 = chkpt_dir + '/lander_critic1td3'
    self.file2 = chkpt_dir + '/lander_critic2td3'
    self.file3 = chkpt_dir + '/targetlander_actortd3'
    self.file4 = chkpt_dir + '/targetlander_critic1td3'
    self.file5 = chkpt_dir + '/targetlander_critic2td3'

    self.actor = Actor(fc1,lr2,fc2,self.file0,self.num_actions,input_dims)
    self.critic1 = Critic(lr1,input_dims,fc1,fc2,self.num_actions,self.file1)
    self.critic2 = Critic(lr1,input_dims,fc1,fc2,self.num_actions,self.file2)
    self.targetactor = Actor(fc1,lr2,fc2,self.file3,self.num_actions,input_dims)
    self.targetcritic1 = Critic(lr1,input_dims,fc1,fc2,self.num_actions,self.file4)
    self.targetcritic2 = Critic(lr1,input_dims,fc1,fc2,self.num_actions,self.file5)
    self.noise  =noise
    # Start every target network as an exact copy of its online network so
    # the first TD targets do not come from unrelated random weights.
    self.update_network_parameters(tau=1)

  def update_network_parameters(self,tau=1):
    """Blend online weights into the target networks.

    Each target parameter becomes tau*online + (1-tau)*target; the default
    tau=1 is therefore a hard copy.
    """
    pairs = ((self.targetcritic1,self.critic1),
             (self.targetcritic2,self.critic2),
             (self.targetactor,self.actor))
    with torch.no_grad():
      for target_net,online_net in pairs:
        for target_param,online_param in zip(target_net.parameters(),online_net.parameters()):
          target_param.copy_(tau*online_param+(1-tau)*target_param)

  def choose_action(self,observation):
    """Return a noisy, clamped action for `observation` as a NumPy array.

    During the first `warmup` calls the base action is Gaussian noise;
    afterwards it is the actor's output. Independent Gaussian exploration
    noise is then added per action dimension and the result is clamped to
    the first entry of the env's low/high action bounds.
    """
    # Build every tensor on the actor's device with one dtype, so the
    # warm-up and policy branches behave the same on CPU and GPU.
    net_device = next(self.actor.parameters()).device
    if self.time_step < self.warmup:
      action = torch.tensor(np.random.normal(scale=self.noise,
                                              size = (self.num_actions,)),
                            dtype = torch.float,device = net_device)
    else:
      state = torch.tensor(observation,dtype = torch.float,device = net_device)
      with torch.no_grad():
        action = self.actor.forward(state)
    # NOTE(review): Actor.forward ends in tanh, so the raw policy output is
    # limited to [-1, 1]; the env bounds are only used for clamping here.
    # If the env's bounds are wider, consider scaling the actor output.
    exploration = torch.tensor(np.random.normal(scale =self.noise,
                                                size = (self.num_actions,)),
                               dtype = torch.float,device = net_device)
    action_noise = torch.clamp(action+exploration,
                               float(self.min_action[0]),float(self.max_action[0]))
    self.time_step +=1

    return action_noise.cpu().numpy()

  def store(self,e):
    """Add one transition to the replay buffer."""
    self.memory.push(e)

  def learn(self):
    """Run one TD3 update if the buffer can supply a full batch.

    Both critics are updated on every call. The actor and the target
    networks are updated only when `learn_step` is a multiple of the
    configured update interval.
    """
    if not self.memory.is_available(self.batch_size):
      return
    state,action,next_state,reward,done = self.memory.sample(self.batch_size)
    # Column vectors, so they line up with the critics' (batch, 1) output
    # for any batch size.
    reward = reward.float().view(-1,1)
    not_done = 1.0-done.float().view(-1,1)
    low,high = float(self.min_action[0]),float(self.max_action[0])

    # The TD target is a constant for the critic loss: no graph needed.
    with torch.no_grad():
      target_action = self.targetactor.forward(next_state)
      # Target policy smoothing: independent clipped noise per element.
      smoothing = torch.clamp(torch.randn_like(target_action)*0.2,-0.5,0.5)
      target_action = torch.clamp(target_action+smoothing,low,high)
      q1_ = self.targetcritic1.forward(next_state,target_action)
      q2_ = self.targetcritic2.forward(next_state,target_action)
      # Clipped double-Q: bootstrap from the smaller target estimate and
      # drop the bootstrap term on terminal transitions.
      target = reward+self.gamma*not_done*torch.min(q1_,q2_)

    q1 = self.critic1.forward(state,action)
    q2 = self.critic2.forward(state,action)
    self.critic1.optimiser.zero_grad()
    self.critic2.optimiser.zero_grad()
    loss = F.mse_loss(q1,target)+F.mse_loss(q2,target)
    loss.backward()
    self.critic1.optimiser.step()
    self.critic2.optimiser.step()
    self.learn_step+=1

    # Delayed policy update.
    if self.learn_step%self.updateinterval!=0:
      return
    self.actor.optimiser.zero_grad()
    actor_loss = -torch.mean(self.critic1(state,self.actor.forward(state)))
    actor_loss.backward()
    self.actor.optimiser.step()

    self.update_network_parameters(self.tau)

  def save_model(self):
    """Write all six networks to their checkpoint paths."""
    self.actor.save_checkpoints()
    self.targetactor.save_checkpoints()
    self.critic1.save_checkpoints()
    self.critic2.save_checkpoints()
    self.targetcritic1.save_checkpoints()
    self.targetcritic2.save_checkpoints()

  def load_model(self):
    """Restore all six networks from their checkpoint paths."""
    self.actor.load_ck()
    self.targetactor.load_ck()
    self.critic1.load_ck()
    self.critic2.load_ck()
    self.targetcritic1.load_ck()
    self.targetcritic2.load_ck()







      




In [32]:
# Train the agent on Pendulum, saving a checkpoint whenever a later episode
# beats the best episode score seen so far.
env = gym.make('Pendulum-v0')
# Read the network sizes from the env instead of hard-coding them.
agent = Agent(0.001,0.001,env.observation_space.shape[0],0.005,env,
              n_actions = env.action_space.shape[0])
agent.update_network_parameters()  # hard copy (tau=1) of online nets into targets
num_episodes = 10000
rewards=[]
best_score =0
for i in range(num_episodes):
  observation = env.reset()
  done=False
  score= 0

  while not done:
    action = agent.choose_action(observation)
    next_state,reward,done,info = env.step(action)
    # An episode cut off by the time limit has not reached a terminal state,
    # so it must still bootstrap from next_state. Only a genuine termination
    # is stored as done; `done` itself still ends the episode loop.
    terminal = done and not info.get('TimeLimit.truncated',False)
    e1 = Experience(observation,next_state,action,reward,terminal)
    agent.store(e1)
    agent.learn()
    score+=reward
    observation = next_state
  rewards.append(score)
  if i==0:
    # The first episode only sets the baseline; it does not trigger a save.
    best_score = score
  avg_score = np.mean(rewards[-100:])
  if best_score<score:
    best_score = score
    agent.save_model()
  print('Episode {} best score till now {} average score for the past 100ep {}'.format(i,best_score,avg_score))



Episode 0 best score till now -861.904121418747 average score for the past 100ep -861.904121418747
Episode 1 best score till now -861.904121418747 average score for the past 100ep -1362.4873401005636
Episode 2 best score till now -861.904121418747 average score for the past 100ep -1465.9846600906776
Episode 3 best score till now -861.904121418747 average score for the past 100ep -1535.780397293922
Episode 4 best score till now -861.904121418747 average score for the past 100ep -1511.3878514101477
Episode 5 best score till now -861.904121418747 average score for the past 100ep -1502.9660485729858
Episode 6 best score till now -861.904121418747 average score for the past 100ep -1529.2738829290777
Episode 7 best score till now -861.904121418747 average score for the past 100ep -1534.6745484081534
Episode 8 best score till now -861.904121418747 average score for the past 100ep -1541.4753472015325
Episode 9 best score till now -861.904121418747 average score for the past 100ep -1560.5204538

KeyboardInterrupt: ignored

In [9]:
# Sanity check: taking the last 100 items of a shorter array just returns the
# whole array, so the mean is computed over all five values.
re = np.arange(1,6)
re[-100:].mean()

3.0

In [None]:
# Environment setup: install system build tools, SWIG and pygame through apt,
# then the box2d-py Python package through pip.
# NOTE(review): presumably needed for Box2D-based gym environments (the
# checkpoint file names mention "lander") - confirm; nothing in this notebook
# imports Box2D directly. Versions are not pinned.
!sudo apt-get install build-essential python-dev swig python-pygame
!pip install box2d-py