# Phase 9: Reinforcement Learning
## Часть 2: Policy Gradient Methods

### В этом ноутбуке:

1. **Policy Gradient** - прямая оптимизация policy
2. **REINFORCE** - Monte Carlo policy gradient
3. **Actor-Critic** - комбинация policy и value
4. **Baseline** - уменьшение variance

In [None]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions import Categorical
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Fix the NumPy and PyTorch RNG seeds to the same constant.
np.random.seed(42)
torch.manual_seed(42)

# Use CUDA when PyTorch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f'Device: {device}')

## 1. Policy Gradient Theorem

### Идея

Вместо оценки Q-функции, напрямую оптимизируем policy $\pi_\theta(a|s)$.

### Policy Gradient Theorem

$$\nabla_\theta J(\theta) = \mathbb{E}_{\pi_\theta}[\nabla_\theta \log \pi_\theta(a|s) \cdot Q^{\pi_\theta}(s,a)]$$

### Преимущества:

- Работает с непрерывными действиями
- Может обучать стохастические policy
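- Более плавная сходимость: при малом изменении параметров policy меняется постепенно (при этом сама оценка градиента может иметь высокую variance)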

## 2. Среда CartPole

In [None]:
class SimpleCartPole:
    """Simplified CartPole simulation.

    The state is the array ``[x, x_dot, theta, theta_dot]`` and is advanced
    with an explicit Euler step of size ``tau``. Action ``1`` applies
    ``+force_mag`` to the cart; any other action applies ``-force_mag``.

    Args:
        max_steps: Number of steps after which an episode is ended even if
            the cart and the pole are still within their thresholds.
    """

    def __init__(self, max_steps=500):
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = self.masspole + self.masscart
        self.length = 0.5
        self.polemass_length = self.masspole * self.length
        self.force_mag = 10.0
        self.tau = 0.02  # integration step

        # Episode ends when |x| or |theta| exceeds these limits.
        self.x_threshold = 2.4
        self.theta_threshold = 12 * np.pi / 180  # 12 degrees in radians

        # Hard cap on the episode length (previously a literal inside step()).
        self.max_steps = max_steps

        self.state = None
        self.steps_count = 0

    def reset(self):
        """Start a new episode.

        Returns:
            A copy of the initial state, each component drawn uniformly
            from [-0.05, 0.05).
        """
        self.state = np.random.uniform(-0.05, 0.05, size=4)
        self.steps_count = 0
        return self.state.copy()

    def step(self, action):
        """Advance the simulation by one time step.

        Args:
            action: ``1`` pushes with ``+force_mag``; anything else pushes
                with ``-force_mag``.

        Returns:
            Tuple ``(state, reward, done)``: a copy of the new state, a
            reward of ``1.0`` (``0.0`` on the step that ends the episode),
            and a ``bool`` telling whether the episode has ended.
        """
        x, x_dot, theta, theta_dot = self.state

        force = self.force_mag if action == 1 else -self.force_mag
        costheta = np.cos(theta)
        sintheta = np.sin(theta)

        temp = (force + self.polemass_length * theta_dot**2 * sintheta) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / \
                   (self.length * (4.0/3.0 - self.masspole * costheta**2 / self.total_mass))
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass

        # Explicit Euler: positions are advanced with the *old* velocities.
        x = x + self.tau * x_dot
        x_dot = x_dot + self.tau * xacc
        theta = theta + self.tau * theta_dot
        theta_dot = theta_dot + self.tau * thetaacc

        self.state = np.array([x, x_dot, theta, theta_dot])
        self.steps_count += 1

        done = bool(
            x < -self.x_threshold or x > self.x_threshold or
            theta < -self.theta_threshold or theta > self.theta_threshold or
            self.steps_count >= self.max_steps
        )

        # The step that terminates the episode (including hitting the step
        # cap) is not rewarded.
        reward = 1.0 if not done else 0.0
        return self.state.copy(), reward, done

# Create the environment used by the cells below and report its dimensions.
env = SimpleCartPole()
print('State dim: 4, Action dim: 2')

## 3. REINFORCE Algorithm

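Monte Carlo policy gradient: параметры обновляются один раз в конце каждого эпизода, а вместо $Q^{\pi_\theta}(s,a)$ используется discounted return $G_t$, полученный в этом эпизоде.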

In [None]:
class PolicyNetwork(nn.Module):
    """Policy network: maps a state to a probability distribution over actions.

    Args:
        state_dim: Size of the input state vector.
        action_dim: Number of discrete actions (size of the softmax output).
        hidden_dim: Width of the single hidden layer.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()

        self.network = nn.Sequential(
            nn.Linear(state_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, action_dim),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        """Return action probabilities; the last dimension sums to 1."""
        return self.network(x)

    def get_action(self, state):
        """Sample an action for a single (unbatched) state.

        Args:
            state: Array-like of length ``state_dim`` (list, NumPy array or
                tensor).

        Returns:
            Tuple ``(action, log_prob)``: the sampled action as a Python int
            and its log-probability as a tensor of shape ``(1,)`` that is
            still attached to the autograd graph.
        """
        # Place the input on whatever device the weights live on. Relying on
        # a global device breaks as soon as the model itself was not moved
        # there (e.g. a CPU model on a CUDA machine).
        param_device = next(self.parameters()).device
        state = torch.as_tensor(
            state, dtype=torch.float32, device=param_device
        ).unsqueeze(0)
        probs = self(state)
        dist = Categorical(probs)
        action = dist.sample()
        return action.item(), dist.log_prob(action)

# Smoke test: sample one action for a random state.
# The model is kept on `device`, like the agents' networks, so that the model
# and its input can never end up on different devices on a CUDA machine.
policy = PolicyNetwork(4, 2).to(device)
test_state = np.random.randn(4)
action, log_prob = policy.get_action(test_state)
print(f'Action: {action}, Log prob: {log_prob.item():.4f}')

In [None]:
class REINFORCE:
    """REINFORCE (Monte Carlo policy gradient) agent.

    During an episode the log-probabilities of the chosen actions and the
    received rewards are buffered; ``update()`` consumes both buffers to
    perform one gradient step and then clears them.

    Args:
        state_dim: Size of the state vector.
        action_dim: Number of discrete actions.
        lr: Learning rate of the Adam optimizer.
        gamma: Discount factor used for the returns.
    """

    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99):
        self.policy = PolicyNetwork(state_dim, action_dim).to(device)
        self.optimizer = optim.Adam(self.policy.parameters(), lr=lr)
        self.gamma = gamma

        # Per-episode buffers, emptied by update().
        self.log_probs = []
        self.rewards = []

    def select_action(self, state):
        """Sample an action and remember its log-probability."""
        action, log_prob = self.policy.get_action(state)
        self.log_probs.append(log_prob)
        return action

    def store_reward(self, reward):
        """Remember the reward obtained for the most recent action."""
        self.rewards.append(reward)

    def update(self):
        """Run one policy-gradient step on the buffered episode.

        Returns:
            The scalar loss as a Python float, or ``0.0`` when nothing was
            buffered (no optimizer step is taken in that case).
        """
        if not self.log_probs or not self.rewards:
            self.log_probs = []
            self.rewards = []
            return 0.0

        # Discounted returns G_t, accumulated backwards in O(T)
        # (append + reverse instead of repeated insert(0, ...)).
        returns = []
        G = 0.0
        for r in reversed(self.rewards):
            G = r + self.gamma * G
            returns.append(G)
        returns.reverse()

        # Build the returns on the device of the stored log-probabilities
        # so the method does not depend on a global device.
        returns = torch.tensor(
            returns, dtype=torch.float32, device=self.log_probs[0].device
        )

        # Standardize the returns (variance reduction). The sample std of a
        # single value is NaN and would poison every weight, so a one-step
        # episode keeps its raw return.
        if returns.numel() > 1:
            returns = (returns - returns.mean()) / (returns.std() + 1e-8)

        # Policy gradient loss: -sum_t log pi(a_t | s_t) * G_t
        loss = torch.stack(
            [-log_prob * G for log_prob, G in zip(self.log_probs, returns)]
        ).sum()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Reset the buffers for the next episode.
        self.log_probs = []
        self.rewards = []

        return loss.item()

# Confirmation that the cell defining the agent class has been executed.
print("REINFORCE agent создан")

In [None]:
def train_reinforce(env, agent, episodes=500):
    """Train a REINFORCE-style agent, one update per finished episode.

    Args:
        env: Environment with ``reset()`` and ``step(action)`` returning
            ``(next_state, reward, done)``.
        agent: Object providing ``select_action``, ``store_reward`` and
            ``update``.
        episodes: Number of episodes to run.

    Returns:
        List with the total (undiscounted) reward of every episode.
    """
    history = []

    for episode_number in range(1, episodes + 1):
        state = env.reset()
        episode_return = 0

        # Roll the episode out until the environment signals termination.
        while True:
            state, reward, done = env.step(agent.select_action(state))
            agent.store_reward(reward)
            episode_return += reward
            if done:
                break

        agent.update()
        history.append(episode_return)

        # Progress report every 100 episodes, averaged over the last 100.
        if episode_number % 100 == 0:
            avg_reward = np.mean(history[-100:])
            print(f'Episode {episode_number}, Avg Reward: {avg_reward:.1f}')

    return history

# Train a fresh REINFORCE agent on a fresh environment for 500 episodes.
reinforce_agent = REINFORCE(4, 2, lr=1e-3)
env = SimpleCartPole()

print('Обучение REINFORCE...\n')
reinforce_rewards = train_reinforce(env=env, agent=reinforce_agent, episodes=500)

In [None]:
# Visualization: per-episode reward plus a 50-episode moving average.
plt.figure(figsize=(10, 4))
# The raw curve is labelled too, so the legend is never empty: the moving
# average below is only drawn once at least 50 episodes are available.
plt.plot(reinforce_rewards, alpha=0.3, label='Episode')
if len(reinforce_rewards) >= 50:
    smooth = np.convolve(reinforce_rewards, np.ones(50)/50, mode='valid')
    # mode='valid' yields len - 49 points; the first one averages episodes
    # 0..49 and is therefore drawn at x = 49.
    plt.plot(range(49, len(reinforce_rewards)), smooth, linewidth=2, label='Moving Avg')
plt.xlabel('Episode')
plt.ylabel('Total Reward')
plt.title('REINFORCE Training')
plt.legend()
plt.show()

print(f'Финальная средняя награда: {np.mean(reinforce_rewards[-100:]):.1f}')

## 4. Actor-Critic

Комбинируем:
- **Actor** - policy network (выбор действий)
- **Critic** - value network (оценка состояний)

Critic уменьшает variance градиента.

In [None]:
class ActorCritic(nn.Module):
    """Two-headed network: a shared trunk feeding a policy head and a value head.

    Args:
        state_dim: Size of the input state vector.
        action_dim: Number of discrete actions (size of the policy output).
        hidden_dim: Width of the shared hidden layer.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=128):
        super().__init__()

        # Feature extractor used by both heads.
        trunk_layers = [nn.Linear(state_dim, hidden_dim), nn.ReLU()]
        self.shared = nn.Sequential(*trunk_layers)

        # Actor head: action probabilities via softmax over the last dim.
        policy_layers = [nn.Linear(hidden_dim, action_dim), nn.Softmax(dim=-1)]
        self.actor = nn.Sequential(*policy_layers)

        # Critic head: one scalar value estimate per input row.
        self.critic = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Return ``(policy, value)`` computed from the shared features of ``x``."""
        features = self.shared(x)
        return self.actor(features), self.critic(features)

# Smoke test: one forward pass on a random batch containing a single state.
ac = ActorCritic(state_dim=4, action_dim=2)
test_state = torch.randn(1, 4)
policy, value = ac(test_state)
print(f'Policy shape: {policy.size()}, Value shape: {value.size()}')

In [None]:
class A2CAgent:
    """Advantage Actor-Critic agent updated once per finished episode.

    ``select_action`` buffers the log-probability of the sampled action and
    the critic's value estimate; ``store_outcome`` buffers the reward and the
    termination flag. ``update`` consumes all four buffers and clears them.

    Args:
        state_dim: Size of the state vector.
        action_dim: Number of discrete actions.
        lr: Learning rate of the Adam optimizer.
        gamma: Discount factor used for the returns.
    """

    def __init__(self, state_dim, action_dim, lr=1e-3, gamma=0.99):
        self.network = ActorCritic(state_dim, action_dim).to(device)
        self.optimizer = optim.Adam(self.network.parameters(), lr=lr)
        self.gamma = gamma

        # Per-episode buffers, emptied by update().
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.dones = []

    def _clear_buffers(self):
        """Drop everything buffered for the current episode."""
        self.log_probs = []
        self.values = []
        self.rewards = []
        self.dones = []

    def select_action(self, state):
        """Sample an action for a single state and buffer log-prob and value.

        Args:
            state: Array-like of length ``state_dim``.

        Returns:
            The sampled action as a Python int.
        """
        # Put the input on the device the network's weights live on.
        net_device = next(self.network.parameters()).device
        state = torch.as_tensor(
            state, dtype=torch.float32, device=net_device
        ).unsqueeze(0)
        probs, value = self.network(state)

        dist = Categorical(probs)
        action = dist.sample()

        self.log_probs.append(dist.log_prob(action))
        self.values.append(value)

        return action.item()

    def store_outcome(self, reward, done):
        """Buffer the reward and termination flag of the latest step."""
        self.rewards.append(reward)
        self.dones.append(done)

    def update(self):
        """Run one actor-critic gradient step on the buffered episode.

        The loss is ``actor_loss + 0.5 * critic_loss`` where the actor loss
        is ``-mean(log_prob_t * (G_t - V_t))`` and the critic loss is the
        MSE between ``V_t`` and the discounted return ``G_t``.

        Returns:
            The scalar loss as a Python float, or ``0.0`` when nothing was
            buffered (no optimizer step is taken in that case).
        """
        if not self.log_probs or not self.rewards:
            self._clear_buffers()
            return 0.0

        # Discounted returns, accumulated backwards in O(T); the running
        # return is reset at every terminal step.
        returns = []
        G = 0.0
        for r, done in zip(reversed(self.rewards), reversed(self.dones)):
            if done:
                G = 0.0
            G = r + self.gamma * G
            returns.append(G)
        returns.reverse()

        # Flatten everything to shape (T,). Leaving log_probs as (T, 1)
        # would broadcast against the (T,) advantage into a (T, T) matrix,
        # i.e. every log-prob multiplied by every advantage.
        log_probs = torch.stack(self.log_probs).reshape(-1)
        values = torch.cat(self.values).reshape(-1)
        returns = torch.tensor(
            returns, dtype=torch.float32, device=values.device
        )

        # Advantage = Return - Value; detached so that the actor term does
        # not push gradients into the critic head.
        advantage = returns - values.detach()

        # Actor loss (policy gradient)
        actor_loss = -(log_probs * advantage).mean()

        # Critic loss (value regression towards the returns)
        critic_loss = F.mse_loss(values, returns)

        loss = actor_loss + 0.5 * critic_loss

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self._clear_buffers()

        return loss.item()

# Confirmation that the cell defining the agent class has been executed.
print("A2C agent создан")

In [None]:
def train_a2c(env, agent, episodes=500):
    """Train an actor-critic agent, one update per finished episode.

    Args:
        env: Environment with ``reset()`` and ``step(action)`` returning
            ``(next_state, reward, done)``.
        agent: Object providing ``select_action``, ``store_outcome`` and
            ``update``.
        episodes: Number of episodes to run.

    Returns:
        List with the total (undiscounted) reward of every episode.
    """
    episode_returns = []

    for episode in range(episodes):
        observation, finished, score = env.reset(), False, 0

        # Interact until the environment reports the end of the episode.
        while not finished:
            chosen = agent.select_action(observation)
            observation, reward, finished = env.step(chosen)
            agent.store_outcome(reward, finished)
            score += reward

        agent.update()
        episode_returns.append(score)

        # Only every 100th episode produces a progress line.
        if (episode + 1) % 100:
            continue
        avg_reward = np.mean(episode_returns[-100:])
        print(f'Episode {episode+1}, Avg Reward: {avg_reward:.1f}')

    return episode_returns

# Train a fresh A2C agent on a fresh environment for 500 episodes.
a2c_agent = A2CAgent(4, 2, lr=1e-3)
env = SimpleCartPole()

print('Обучение A2C...\n')
a2c_rewards = train_a2c(env=env, agent=a2c_agent, episodes=500)

## 5. Сравнение REINFORCE vs A2C

In [None]:
# Side-by-side comparison of the two training curves.
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# Both panels are drawn by the same code so they cannot drift apart.
for ax, rewards, title in (
    (axes[0], reinforce_rewards, 'REINFORCE'),
    (axes[1], a2c_rewards, 'A2C (Actor-Critic)'),
):
    ax.plot(rewards, alpha=0.3, label='Episode')
    if len(rewards) >= 50:
        # 50-episode moving average; 'valid' mode drops the first 49 points,
        # so the smoothed curve starts at x = 49.
        smooth = np.convolve(rewards, np.ones(50)/50, mode='valid')
        ax.plot(range(49, len(rewards)), smooth, linewidth=2, label='Moving Avg')
    ax.set_title(title)
    ax.set_xlabel('Episode')
    ax.set_ylabel('Reward')
    # The curves carry labels, so actually show them.
    ax.legend()

plt.tight_layout()
plt.show()

print('\nСравнение (последние 100 эпизодов):')
print(f'REINFORCE: {np.mean(reinforce_rewards[-100:]):.1f} +/- {np.std(reinforce_rewards[-100:]):.1f}')
print(f'A2C:       {np.mean(a2c_rewards[-100:]):.1f} +/- {np.std(a2c_rewards[-100:]):.1f}')

## Итоги

### Что мы изучили:

1. **Policy Gradient Theorem** - основа методов
2. **REINFORCE** - простой Monte Carlo метод
3. **Actor-Critic** - комбинация policy и value
4. **Advantage** - уменьшение variance

### Сравнение методов:

| Метод | Variance | Sample Efficiency | Complexity |
|-------|----------|-------------------|------------|
| REINFORCE | High | Low | Simple |
| A2C | Low | Medium | Medium |

### Ключевые формулы:

**Policy Gradient:**
$$\nabla_\theta J = \mathbb{E}[\nabla_\theta \log \pi_\theta(a|s) \cdot A(s,a)]$$

**Advantage:**
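$$A(s,a) = Q(s,a) - V(s) \approx r + \gamma V(s') - V(s)$$

В коде этого ноутбука advantage оценивается через Monte Carlo return эпизода: $A_t \approx G_t - V(s_t)$.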

### Следующий шаг:

В ноутбуке 03 изучим PPO - современный стандарт policy gradient.