In [1]:
import numpy as np
import torch as T
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import gymnasium as gym
import matplotlib.pyplot as plt

In [15]:
class PolicyNetwork(nn.Module):
    def __init__(self, lr, input_dims, n_actions):
        super(PolicyNetwork, self).__init__()
        self.fc1 = nn.Linear(*input_dims, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, n_actions)
        self.optimizer = optim.Adam(self.parameters(), lr=lr)

        self.device = T.device('cuda:0' if T.cuda.is_available() else 'cpu')
        self.to(self.device)

    def forward(self, state):
        x = F.relu(self.fc1(state))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)

        return x


In [16]:
class PolicyGradientAgent():
    def __init__(self, lr, input_dims, gamma=0.99, n_actions=4):
        self.gamma = gamma
        self.lr = lr
        self.reward_memory = []
        self.action_memory = []

        self.policy = PolicyNetwork(self.lr, input_dims, n_actions)

    def choose_action(self, observation):
#         state = T.Tensor([observation]).to(self.policy.device)
        state = T.Tensor(np.array([observation])).to(self.policy.device)
        probabilities = F.softmax(self.policy.forward(state))
        action_probs = T.distributions.Categorical(probabilities)
        action = action_probs.sample()
        log_probs = action_probs.log_prob(action)
        self.action_memory.append(log_probs)

        return action.item()

    def store_rewards(self, reward):
        self.reward_memory.append(reward)

    def learn(self):
        self.policy.optimizer.zero_grad()

        # G_t = R_t+1 + gamma * R_t+2 + gamma**2 * R_t+3
        # G_t = sum from k=0 to k=T {gamma**k * R_t+k+1}
        G = np.zeros_like(self.reward_memory, dtype=np.float64)
        for t in range(len(self.reward_memory)):
            G_sum = 0
            discount = 1
            for k in range(t, len(self.reward_memory)):
                G_sum += self.reward_memory[k] * discount
                discount *= self.gamma
            G[t] = G_sum
        G = T.tensor(G, dtype=T.float).to(self.policy.device)
        
        loss = 0
        for g, logprob in zip(G, self.action_memory):
            loss += -g * logprob
        loss.backward()
        self.policy.optimizer.step()

        self.action_memory = []
        self.reward_memory = []


In [17]:
def plot_learning_curve(scores, x, figure_file):
    running_avg = np.zeros(len(scores))
    for i in range(len(running_avg)):
        running_avg[i] = np.mean(scores[max(0, i-100):(i+1)])
    plt.plot(x, running_avg)
    plt.title('Running average of previous 100 scores')
    plt.savefig(figure_file)

In [None]:
env = gym.make('LunarLander-v2')
n_games = 3000
agent = PolicyGradientAgent(gamma=0.99, lr=0.0005, input_dims=[8],
                            n_actions=4)

fname = 'REINFORCE_' + 'lunar_lunar_lr' + str(agent.lr) + '_' \
        + str(n_games) + 'games'
figure_file = 'plots/' + fname + '.png'

scores = []
for i in range(n_games):
    done = False
    observation = env.reset()[0]
    score = 0
    iter = 0
    while not done and iter < 5000:
        action = agent.choose_action(observation)
        observation_, reward, done, trunc, info = env.step(action)
        score += reward
        agent.store_rewards(reward)
        observation = observation_
        iter += 1
    agent.learn()
    scores.append(score)

    avg_score = np.mean(scores[-100:])
    if i % 100 == 0:
        print('episode ', i, 'score %.2f' % score,
                'average score %.2f' % avg_score)

x = [i+1 for i in range(len(scores))]
plot_learning_curve(scores, x, figure_file)

  probabilities = F.softmax(self.policy.forward(state))


episode  0 score -107.84 average score -107.84
episode  100 score -135.69 average score -181.55
episode  200 score -182.17 average score -166.35
episode  300 score -199.41 average score -169.22
episode  400 score -90.66 average score -145.82
episode  500 score -96.68 average score -123.89
episode  600 score -179.54 average score -129.77
episode  700 score -18.99 average score -125.17
episode  800 score -89.31 average score -111.80
episode  900 score -58.39 average score -119.64
episode  1000 score -64.61 average score -107.15
episode  1100 score -18.82 average score -97.08
episode  1200 score -110.96 average score -83.71
episode  1300 score -83.60 average score -88.67
episode  1400 score -74.92 average score -56.74
episode  1500 score -47.85 average score -42.03
episode  1600 score -34.86 average score -32.74
episode  1700 score 17.21 average score -17.84
episode  1800 score -18.45 average score 10.75
episode  1900 score -24.45 average score 58.24
episode  2000 score 61.68 average scor