In [2]:
!pip install gymnasium



In [3]:
import numpy as np
import gymnasium as gym
import os
import tqdm
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Categorical
from collections import deque
from IPython.display import Image
from matplotlib import animation

In [4]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

cpu


### Initialize the environment

In [5]:
# Initialize the environment
env = gym.make('LunarLander-v3') #,render_mode="human"

state_space = env.observation_space.shape[0]
print('State Space:', state_space)
action_space = env.action_space.n
print('Action Space:', action_space)

State Space: 8
Action Space: 4


  from pkg_resources import resource_stream, resource_exists


### Policy

In [6]:
# Policy Network
class Policy(nn.Module):
    def __init__(self , s_size , a_size , h_size ):
        super (Policy , self ).__init__ ()
        self.fc1 = nn.Linear( s_size , h_size )
        self.fc2 = nn.Linear( h_size , h_size * 2)
        self.fc3 = nn.Linear( h_size * 2, a_size )
    def forward(self , x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim =1)
    def act(self, state ):
        state = torch.from_numpy(state).float().unsqueeze(0)  #.to(device)
        probs = self.forward(state) # .cpu()
        m = Categorical(probs)
        # Random action
        action = m.sample()
        return action.item() , m.log_prob(action)

### REINFORCE
Initialize the policy parameter $\theta$ at random. <br>
**for** each episode $\{s_1, a_1, r_2, s_2, a_2, \dots, s_T\} $ **do** <br>
&nbsp; &nbsp; &nbsp; &nbsp;     **for** $t=1, 2, … , T $ **do** <br>
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; Calculate the Return $G_t$ <br>
&nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; &nbsp; Update policy parameters $\theta \leftarrow \theta + \alpha \gamma^t G_t \nabla_\theta \log \pi_\theta(a_t \vert s_t)$ <br>
&nbsp; &nbsp; &nbsp; &nbsp; **end for** <br>
**end for**

Visually Explained: https://towardsdatascience.com/reinforcement-learning-explained-visually-part-6-policy-gradients-step-by-step-f9f448e73754

In [7]:
# Training Function
def reinforce(
        policy ,
        optimizer ,
        n_training_episodes ,
        max_steps ,
        gamma ,
        print_every
        ):
    # scores_deque = deque(maxlen =100)
    scores = []

    # Each Episode
    for i_episode in range(1, n_training_episodes + 1):
        saved_log_probs = []
        rewards = []
        state = env.reset()[0]

        # t=1, 2, … , T (compute log(policy(a_t|s_t)))
        for t in range(max_steps):
            action , log_prob = policy.act(state)
            saved_log_probs.append(log_prob)
            state , reward , done , _ , _ = env.step(action)
            rewards.append(reward)
            if done :
                break
        # scores_deque.append(sum( rewards ))
        scores.append(sum(rewards))

        returns = deque(maxlen = max_steps)
        n_steps = len(rewards)

        # List of discounted Returns (compute gamma^t*G_t)
        for t in range(n_steps)[:: -1]:
            disc_return_t = returns[0] if len(returns) > 0 else 0
            returns.appendleft(gamma*disc_return_t + rewards[t])

        # Total loss (disc_return = gamma^t*G_t; log_prob = log(policy(a_t|s_t)))
        policy_loss = []
        for log_prob , disc_return in zip( saved_log_probs , returns ):
            policy_loss.append(-log_prob * disc_return )
        policy_loss = torch.cat( policy_loss ).sum()

        optimizer.zero_grad()
        policy_loss.backward()
        optimizer.step()

        print(" Episode {}, Reward : {}".format( i_episode ,sum(rewards)))

    return scores


In [8]:
# Hyperparameter
h_size = 128
lr = 0.001
n_training_episodes = 1000
max_steps = 1000
gamma = 0.99

In [9]:
policy = Policy (
        s_size = state_space ,
        a_size = action_space ,
        h_size = h_size ,
        ) #.to(device)
optimizer = optim.Adam(policy.parameters(), lr=lr)

In [10]:
scores = reinforce (
        policy ,
        optimizer ,
        n_training_episodes ,
        max_steps ,
        gamma ,
        print_every = 100)
# ============== LƯU MODEL SAU KHI TRAIN ==============
# Lưu model weights
torch.save(policy.state_dict(), 'lunarlander_trained.pth')
print(f"\nModel saved to 'lunarlander_trained.pth'")

# Lưu cả model (bao gồm architecture)
torch.save(policy, 'lunarlander_model_full.pth')

import pickle
with open('training_scores.pkl', 'wb') as f:
    pickle.dump(scores, f)

env.close()


print(f"\nTraining completed!")
print(f"Final episode reward: {scores[-1]:.1f}")
print(f"Average last 100 rewards: {np.mean(scores[-100:]):.1f}")

 Episode 1, Reward : -313.89743135909384
 Episode 2, Reward : -306.7752146270318
 Episode 3, Reward : -157.35983715321004
 Episode 4, Reward : -90.63401124173146
 Episode 5, Reward : -321.9582747290135
 Episode 6, Reward : -185.74707515220382
 Episode 7, Reward : -101.04568179628734
 Episode 8, Reward : -82.26916101711504
 Episode 9, Reward : -276.69417497108213
 Episode 10, Reward : -322.7066876592699
 Episode 11, Reward : -351.9781189560763
 Episode 12, Reward : -116.60451670114831
 Episode 13, Reward : -383.4179805458207
 Episode 14, Reward : -200.58093022216576
 Episode 15, Reward : -121.05350268181468
 Episode 16, Reward : -218.82139288479664
 Episode 17, Reward : -243.34504389593602
 Episode 18, Reward : -140.63734265403096
 Episode 19, Reward : -137.56550257350972
 Episode 20, Reward : -169.2029904931115
 Episode 21, Reward : -188.5562789020595
 Episode 22, Reward : -22.327747531158366
 Episode 23, Reward : -64.99005177966146
 Episode 24, Reward : -89.90663593672693
 Episode 25,

KeyboardInterrupt: 

In [None]:

env.close()

# 2. Tạo environment mới với render
env_render = gym.make('LunarLander-v3', render_mode='human')

# 3. Dùng policy đã train để render
print("\n Now rendering trained agent...")

# Chuyển policy sang eval mode
policy.eval()

# Render một vài episodes
for episode in range(10):
    state, _ = env_render.reset()
    total_reward = 0
    done = False
    
    while not done:
        with torch.no_grad():
            action, _ = policy.act(state)
        
        state, reward, terminated, truncated, _ = env_render.step(action)
        total_reward += reward
        done = terminated or truncated
    
    print(f"Rendered Episode {episode+1}: Reward = {total_reward:.1f}")

env_render.close()
print("Rendering completed!")

: 

In [None]:
# Plotting the rewards per episode
import matplotlib.pyplot as plt
plt.plot(scores)
plt.xlabel('Episode')
plt.ylabel('Reward')
plt.title('REINFORCE on LunarLander')
plt.show()

In [None]:
# Cell mới trong notebook
import torch
import numpy as np
import gymnasium as gym
from torch.distributions import Categorical

# Định nghĩa lại Policy class (phải giống y hệt)
class Policy(nn.Module):
    def __init__(self, s_size, a_size, h_size):
        super(Policy, self).__init__()
        self.fc1 = nn.Linear(s_size, h_size)
        self.fc2 = nn.Linear(h_size, h_size * 2)
        self.fc3 = nn.Linear(h_size * 2, a_size)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return F.softmax(x, dim=1)
    
    def act(self, state):
        state = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.forward(state)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

# Load model
def load_and_run(model_path='trained_policy.pth', num_episodes=5, render=True):
    # Tạo environment
    render_mode = "human" if render else None
    env = gym.make('LunarLander-v3', render_mode=render_mode)
    
    # Khởi tạo model với đúng kích thước
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.n
    hidden_size = 16  # Phải giống với lúc train
    
    policy = Policy(state_size, action_size, hidden_size)
    
    # Load weights
    policy.load_state_dict(torch.load(model_path))
    policy.eval()  # Chuyển sang evaluation mode
    
    print(f"Model loaded from {model_path}")
    print(f"State size: {state_size}, Action size: {action_size}")
    
    # Chạy episodes
    for episode in range(num_episodes):
        state, _ = env.reset()
        total_reward = 0
        done = False
        step_count = 0
        
        while not done:
            # Lấy action từ policy (không cần log_prob khi eval)
            with torch.no_grad():  # Tắt gradient để tăng tốc
                action, _ = policy.act(state)
            
            # Thực hiện action
            state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            done = terminated or truncated
            step_count += 1
            
            # Optional: Hiển thị progress
            if step_count % 100 == 0:
                print(f"  Step {step_count}, Reward: {total_reward:.1f}")
        
        print(f"Episode {episode+1}: Reward = {total_reward:.1f}, Steps = {step_count}")
    
    env.close()
    return policy

# Gọi hàm
policy = load_and_run('trained_policy.pth', num_episodes=3, render=True)

All preserves belong to: https://github.com/hungledut/deep-reinforcement-learning-for-gym-environments/tree/master/lunarlander