In [1]:
import gym
# gym.logger.set_level(40) # suppress warnings (please remove if gives error)
import numpy as np
from collections import deque
import matplotlib.pyplot as plt
%matplotlib inline
import torch 
import torch.nn as nn
import torch.nn.functional as f
import torch.optim as optim
from torch.distributions import Categorical

In [2]:
# Environment under study. 'LunarLander-v2' and 'CartPole-v0' are alternatives
# the author left noted on this line.
env = gym.make('MountainCar-v0')

# Seed every random number generator that influences training, not just the
# environment: actions are sampled through torch, so an unseeded torch RNG
# makes each run of the notebook produce different results.
SEED = 0
env.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print('observation space:', env.observation_space)
print('action space:', env.action_space)

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

observation space: Box(2,)
action space: Discrete(3)


In [3]:
def hidden_init(layer):
    """Return the symmetric uniform init range ``(-lim, lim)`` for a layer.

    ``lim`` is ``1 / sqrt(fan_in)``, where fan_in is the number of inputs
    feeding each output unit.

    Args:
        layer: a module with a ``weight`` tensor whose first dimension indexes
            output units (e.g. ``nn.Linear``).

    Returns:
        A ``(low, high)`` tuple suitable for ``weight.data.uniform_(*range)``.
    """
    # nn.Linear stores its weight as (out_features, in_features), so size()[0]
    # is the fan-OUT. Counting the elements of a single output row gives the
    # true fan-in.
    fan_in = layer.weight.data[0].numel()
    lim = 1. / np.sqrt(fan_in)
    return (-lim, lim)
class Actor(nn.Module):
    """Policy network mapping a state to a categorical distribution over actions.

    Attributes:
        state_size: length of the observation vector fed to the first layer.
        action_size: number of discrete actions (width of the output layer).
        entropy: list that receives the policy entropy of every ``act`` call.
    """
    def __init__(self,state_size,action_size):
        super(Actor,self).__init__()
        self.state_size=state_size
        self.action_size=action_size
        # NOTE: layers are applied in the order fc1 -> fc3 -> fc2 (see forward).
        self.fc1 = nn.Linear(self.state_size,64)
        self.fc3 = nn.Linear(64,64)
        self.fc2 = nn.Linear(64,self.action_size)
        self.entropy=[]
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        # Initialise the hidden layer from its OWN shape; this previously used
        # hidden_init(self.fc2), i.e. the range computed for the output layer.
        self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
        # Small output weights keep the initial logits close together.
        self.fc2.weight.data.uniform_(-1e-3,1e-3)

    def forward(self,state):
        """Return action probabilities (softmax over the last dimension)."""
        y = f.relu(self.fc1(state))
        y = f.relu(self.fc3(y))
        y = f.softmax(self.fc2(y),dim=-1)
        return y

    def act(self,state):
        """Sample an action for a single numpy ``state``.

        Also appends the entropy of the action distribution to ``self.entropy``.

        Returns:
            ``(action, log_prob)``: the sampled action as a Python int and the
            log-probability tensor of that action (kept attached to the graph
            so the policy loss can backpropagate through it).
        """
        # Add a batch dimension of 1 and move the input to the model's device.
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        probs = self.forward(state).cpu()
        m = Categorical(probs)
        action = m.sample()
        self.entropy.append(m.entropy().detach().numpy())
        return action.item(), m.log_prob(action)

class Critic(nn.Module):
    """State-value network: maps a state (or batch of states) to a scalar value.

    Attributes:
        state_size: length of the observation vector fed to the first layer.
    """
    def __init__(self,state_size):
        super(Critic,self).__init__()
        self.state_size = state_size
        # NOTE: layers are applied in the order fc1 -> fc3 -> fc2 (see forward).
        self.fc1 = nn.Linear(self.state_size,64)
        self.fc3 = nn.Linear(64,64)
        self.fc2 = nn.Linear(64,1)
        self.fc1.weight.data.uniform_(*hidden_init(self.fc1))
        # Initialise the hidden layer from its OWN shape; this previously used
        # hidden_init(self.fc2), i.e. the range computed for the 1-unit output
        # layer, which is far too wide for a 64x64 layer.
        self.fc3.weight.data.uniform_(*hidden_init(self.fc3))
        self.fc2.weight.data.uniform_(-3e-3,3e-3)

    def forward(self,state):
        """Estimate the value of ``state``.

        Args:
            state: numpy array holding one state or a batch of states.

        Returns:
            A CPU tensor of value estimates with a trailing dimension of size 1.
        """
        state = torch.from_numpy(state).float().to(device)
        y = f.relu(self.fc1(state))
        y = f.relu(self.fc3(y))
        y = self.fc2(y)
        return y.cpu()
        

In [4]:
import torch.optim as optim
def exp_lr_scheduler(optimizer,decay):
    """Multiply the learning rate of every parameter group by ``decay``.

    Each group is decayed from its own current learning rate, so optimizers
    configured with different per-group rates keep their relative ratios.
    (Previously every group was overwritten with the decayed rate of the
    last group.)

    Args:
        optimizer: object exposing ``param_groups``, a list of dicts that each
            contain an ``'lr'`` entry.
        decay: multiplicative factor applied to each learning rate.

    Returns:
        ``(optimizer, lr)`` where ``lr`` is the new learning rate of the last
        parameter group, or ``None`` if there are no parameter groups.
    """
    lr = None
    for param_group in optimizer.param_groups:
        lr = param_group['lr'] * decay
        param_group['lr'] = lr
    return optimizer,lr
# Size the networks from the environment instead of hard-coding (2, 3), so the
# cell keeps working when `env` is switched to another environment with a
# flat observation vector and a discrete action space.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n

actor = Actor(state_size, action_size).to(device)
critic = Critic(state_size).to(device)

# Separate Adam optimizers so policy and value updates are stepped independently.
optim_actor = optim.Adam(actor.parameters(), lr=1e-3)
optim_critic = optim.Adam(critic.parameters(), lr=1e-3)

def _discounted_returns(reward_list, gamma):
    """Return the discounted reward-to-go for every step of one episode."""
    returns = []
    running = 0.0
    for reward in reversed(reward_list):
        running = reward + gamma * running
        returns.append(running)
    returns.reverse()
    return returns


def reinforce(n_episodes=2000, max_t=1000, gamma=0.99, print_every=100,
              solved_score=None):
    """Train the global ``actor``/``critic`` with a policy gradient and a learned baseline.

    Uses the module-level ``env``, ``actor``, ``critic``, ``optim_actor`` and
    ``optim_critic``. Each training iteration (counted by ``i_episode``):

    1. rolls out whole environment episodes until more than 1024 transitions
       have been collected,
    2. fits the critic to the discounted returns-to-go (two gradient steps),
    3. updates the actor with advantages = returns - baseline.

    Args:
        n_episodes: maximum number of training iterations.
        max_t: maximum number of steps per environment episode.
        gamma: discount factor.
        print_every: print the running average score every this many iterations.
        solved_score: average score over the last 100 iterations at which
            training stops early. ``None`` (default) uses the environment's
            registered ``env.spec.reward_threshold`` when it exists; if no
            threshold is available, training never stops early.

    Returns:
        List with one entry per training iteration: the mean undiscounted
        episode score of the rollouts collected in that iteration.
    """
    if solved_score is None:
        # The old hard-coded 195.0 is unreachable on environments with
        # non-positive rewards; ask the environment for its own threshold.
        solved_score = getattr(getattr(env, 'spec', None), 'reward_threshold', None)

    scores_deque = deque(maxlen=100)
    scores = []
    for i_episode in range(1, n_episodes+1):
        saved_log_probs = []
        rewards = []      # discounted return-to-go for every collected transition
        states = []       # state in which each action was taken
        score_list = []   # undiscounted score of each rollout in this batch

        # Collect complete episodes until the batch exceeds 1024 transitions.
        while len(states) <= 1024:
            episode_states = []
            log_prob_list = []
            reward_list = []
            state = env.reset()
            for t in range(max_t):
                episode_states.append(state)
                action, log_prob = actor.act(state)
                log_prob_list.append(log_prob)
                state, reward, done, _ = env.step(action)
                reward_list.append(reward)
                if done:
                    break
            saved_log_probs.extend(log_prob_list)
            states.extend(episode_states)
            rewards.extend(_discounted_returns(reward_list, gamma))
            score_list.append(np.sum(reward_list))

        batch_score = np.mean(score_list)
        scores_deque.append(batch_score)
        scores.append(batch_score)

        state_batch = np.array(states)
        returns_np = np.asarray(rewards, dtype=np.float64)

        # Critic update: regress V(s) onto the Monte Carlo return-to-go.
        # `rewards` already contains the full discounted return, so adding
        # V(s') on top (as before) double-counted the future and also let
        # gradients flow through the target.
        returns_t = torch.tensor(rewards, dtype=torch.float32)
        for _ in range(2):
            values = critic(state_batch).squeeze(-1)
            critic_loss = f.mse_loss(values, returns_t)
            optim_critic.zero_grad()
            critic_loss.backward()
            torch.nn.utils.clip_grad_norm_(critic.parameters(), 1)
            optim_critic.step()

        # Baseline from the updated critic, rescaled to the mean/std of the
        # returns before being subtracted; advantages are then standardised.
        baseline = critic(state_batch).detach().squeeze(-1).numpy()
        baseline = (baseline - np.mean(baseline))/(np.std(baseline)+1e-5)*np.std(returns_np)+np.mean(returns_np)
        adv = returns_np - baseline
        adv = (adv - np.mean(adv))/(1e-5+np.std(adv))

        # Policy gradient loss, averaged over the number of rollouts in the batch.
        log_probs = torch.cat(saved_log_probs)
        adv_t = torch.tensor(adv, dtype=log_probs.dtype)
        policy_loss = -(log_probs * adv_t).sum()/len(score_list)
        optim_actor.zero_grad()
        policy_loss.backward()
        torch.nn.utils.clip_grad_norm_(actor.parameters(), 5)
        optim_actor.step()

        if i_episode % print_every == 0:
            print('Episode {}\tAverage Score: {:.2f}'.format(i_episode, np.mean(scores_deque)))
        # Only declare the task solved once the 100-score window is full, so
        # the reported episode count (i_episode-100) cannot be negative.
        window_full = len(scores_deque) == scores_deque.maxlen
        if solved_score is not None and window_full and np.mean(scores_deque)>=solved_score:
            print('Environment solved in {:d} episodes!\tAverage Score: {:.2f}'.format(i_episode-100, np.mean(scores_deque)))
            break

    return scores
        
# Run training with the default hyperparameters; `scores` receives the list
# returned by reinforce() (one mean score per training iteration).
scores=reinforce()

Episode 100	Average Score: -200.00
Episode 200	Average Score: -200.00
Episode 300	Average Score: -200.00
Episode 400	Average Score: -200.00
Episode 500	Average Score: -200.00
Episode 600	Average Score: -200.00


KeyboardInterrupt: 

In [None]:
# Learning curve: one point per entry of `scores`, numbered from 1.
episode_axis = np.arange(1, len(scores)+1)

fig = plt.figure()
ax = fig.add_subplot(111)
plt.plot(episode_axis, scores)
plt.xlabel('Episode #')
plt.ylabel('Score')
plt.show()