<a href="https://colab.research.google.com/github/velpulakaran/reinforcement-learning/blob/main/RLML_Lab_06.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#2303A51587

V.karan

Batch-09


Implementing a Policy Gradient algorithm (e.g., REINFORCE) for a simple task.

Code 1 – Vanilla REINFORCE

In [None]:
import numpy as np

class ToyEnv:
    """Integer walk on a line that pays 1.0 when the walker reaches ``target``.

    The walker starts at position 0. An episode ends either when the position
    is at or beyond ``target`` (reward 1.0) or when ``max_steps`` steps have
    been taken (reward 0.0).
    """

    def __init__(self, target=10, max_steps=20, seed=0):
        self.target = target
        self.max_steps = max_steps
        # Seeded generator stored on the instance; reset() and step() do not use it.
        self.rng = np.random.RandomState(seed)
        self.reset()

    def _observe(self):
        """Return the current position as a length-1 float32 array."""
        return np.array([self.pos], dtype=np.float32)

    def reset(self):
        """Put the walker back at 0, clear the step counter, return the observation."""
        self.pos, self.steps = 0, 0
        return self._observe()

    def step(self, action):
        """Advance one step and return ``(observation, reward, done, info)``.

        ``action == 1`` moves one unit right; any other value moves one unit left.
        """
        delta = 1 if action == 1 else -1
        self.pos += delta
        self.steps += 1

        reached = self.pos >= self.target
        timed_out = self.steps >= self.max_steps
        reward = 1.0 if reached else 0.0
        done = bool(reached or timed_out)
        return self._observe(), reward, done, {}

class SoftmaxPolicy:
    """Softmax policy whose logits are linear in the state plus a bias term.

    ``W`` has shape ``(n_actions, state_dim + 1)``; the extra column multiplies
    the constant 1.0 appended by ``_featurize``.
    """

    def __init__(self, state_dim, n_actions, lr=1e-2, seed=1):
        self.state_dim = state_dim
        self.n_actions = n_actions
        self.lr = lr
        self.rng = np.random.RandomState(seed)
        # Small random initial weights drawn from the policy's own generator.
        self.W = 0.01 * self.rng.randn(n_actions, state_dim + 1)

    def _featurize(self, s):
        """Append a constant 1.0 (bias feature) to the state vector."""
        return np.concatenate([s, [1.0]])

    def action_probs(self, s):
        """Return the softmax distribution over actions for state ``s``."""
        scores = self.W.dot(self._featurize(s))
        # Shift by the max logit so the exponentials cannot overflow.
        weights = np.exp(scores - np.max(scores))
        return weights / np.sum(weights)

    def sample(self, s):
        """Draw an action for ``s``; return ``(action, probabilities)``."""
        probs = self.action_probs(s)
        choice = self.rng.choice(self.n_actions, p=probs)
        return choice, probs

    def update_episode(self, states, actions, returns):
        """Apply one REINFORCE ascent step per visited time step.

        For each ``(state, action, return)`` triple, the gradient of
        ``log pi(action | state)`` with respect to ``W`` is
        ``(onehot(action) - pi) outer features``. Probabilities are recomputed
        from the current ``W`` at every step, so earlier steps of the episode
        influence later ones.
        """
        for state, action, ret in zip(states, actions, returns):
            feats = self._featurize(state)
            pi = self.action_probs(state)
            score = np.outer(-pi, feats)
            score[action] += feats
            self.W += self.lr * ret * score

def discount_rewards(rewards, gamma):
    """Return the discounted return-to-go for every time step.

    Entry ``t`` of the result equals ``sum_k gamma**k * rewards[t + k]``.
    The result is a float32 array with the same length as ``rewards``.
    """
    out = np.zeros_like(rewards, dtype=np.float32)
    running = 0.0
    # Walk backwards so each return reuses the one that follows it.
    for idx in range(len(rewards) - 1, -1, -1):
        running = rewards[idx] + gamma * running
        out[idx] = running
    return out

def run_vanilla_reinforce(seed=0, n_episodes=2000, gamma=0.99, lr=0.01, log_every=200):
    """Train a softmax policy on ``ToyEnv`` with vanilla REINFORCE.

    Args:
        seed: Seed for the environment; the policy is seeded with ``seed + 1``.
        n_episodes: Number of training episodes.
        gamma: Discount factor used for the returns.
        lr: Policy learning rate.
        log_every: Print the running average reward every this many episodes
            (averaged over the same window). ``0`` or ``None`` disables logging.

    Returns:
        ``(rewards_history, policy)`` where ``rewards_history`` holds the
        undiscounted reward sum of every episode.
    """
    env = ToyEnv(seed=seed)
    policy = SoftmaxPolicy(state_dim=1, n_actions=2, lr=lr, seed=seed + 1)
    rewards_history = []
    for ep in range(1, n_episodes + 1):
        s = env.reset()
        states, actions, rewards = [], [], []
        done = False
        while not done:
            a, _ = policy.sample(s)
            # Policy action 1 means "move right"; anything else is sent as -1 ("move left").
            ns, r, done, _ = env.step(1 if a == 1 else -1)
            # Store the state the action was taken in, not the successor.
            states.append(s.copy())
            actions.append(a)
            rewards.append(r)
            s = ns
        returns = discount_rewards(rewards, gamma)
        policy.update_episode(states, actions, returns)
        rewards_history.append(sum(rewards))
        if log_every and ep % log_every == 0:
            avg = np.mean(rewards_history[-log_every:])
            print(f"Episode {ep:4d}  Average reward (last {log_every}): {avg:.3f}")
    return rewards_history, policy

# Script entry point: train with a fixed seed and show the tail of the reward history.
if __name__ == "__main__":
    # `rewards` is the per-episode reward list and `policy` the trained policy
    # returned by run_vanilla_reinforce.
    rewards, policy = run_vanilla_reinforce(seed=42)
    print("Done. Last 20 episode rewards:", rewards[-20:])


Episode  200  Average reward (last 200): 0.460
Episode  400  Average reward (last 200): 0.955
Episode  600  Average reward (last 200): 0.990
Episode  800  Average reward (last 200): 0.990
Episode 1000  Average reward (last 200): 1.000
Episode 1200  Average reward (last 200): 1.000
Episode 1400  Average reward (last 200): 0.995
Episode 1600  Average reward (last 200): 1.000
Episode 1800  Average reward (last 200): 1.000
Episode 2000  Average reward (last 200): 1.000
Done. Last 20 episode rewards: [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0]


Code 2 – REINFORCE with Linear Baseline

In [None]:
import numpy as np

class ToyEnv:
    """Integer walk on a line that pays 1.0 when the walker reaches ``target``.

    The walker starts at position 0. An episode ends either when the position
    is at or beyond ``target`` (reward 1.0) or when ``max_steps`` steps have
    been taken (reward 0.0).
    """

    def __init__(self, target=10, max_steps=20, seed=0):
        self.target = target
        self.max_steps = max_steps
        # Seeded generator stored on the instance; reset() and step() do not use it.
        self.rng = np.random.RandomState(seed)
        self.reset()

    def _observe(self):
        """Return the current position as a length-1 float32 array."""
        return np.array([self.pos], dtype=np.float32)

    def reset(self):
        """Put the walker back at 0, clear the step counter, return the observation."""
        self.pos, self.steps = 0, 0
        return self._observe()

    def step(self, action):
        """Advance one step and return ``(observation, reward, done, info)``.

        ``action == 1`` moves one unit right; any other value moves one unit left.
        """
        delta = 1 if action == 1 else -1
        self.pos += delta
        self.steps += 1

        reached = self.pos >= self.target
        timed_out = self.steps >= self.max_steps
        reward = 1.0 if reached else 0.0
        done = bool(reached or timed_out)
        return self._observe(), reward, done, {}

class SoftmaxPolicy:
    """Softmax policy whose logits are linear in the state plus a bias term.

    ``W`` has shape ``(n_actions, state_dim + 1)``; the extra column multiplies
    the constant 1.0 appended by ``_featurize``. The caller accumulates the
    gradient and passes it to ``update``.
    """

    def __init__(self, state_dim, n_actions, lr=5e-3, seed=1):
        self.state_dim = state_dim
        self.n_actions = n_actions
        self.lr = lr
        self.rng = np.random.RandomState(seed)
        # Small random initial weights drawn from the policy's own generator.
        self.W = 0.01 * self.rng.randn(n_actions, state_dim + 1)

    def _featurize(self, s):
        """Append a constant 1.0 (bias feature) to the state vector."""
        return np.concatenate([s, [1.0]])

    def action_probs(self, s):
        """Return the softmax distribution over actions for state ``s``."""
        scores = self.W.dot(self._featurize(s))
        # Shift by the max logit so the exponentials cannot overflow.
        weights = np.exp(scores - np.max(scores))
        return weights / np.sum(weights)

    def sample(self, s):
        """Draw an action for ``s``; return ``(action, probabilities)``."""
        probs = self.action_probs(s)
        choice = self.rng.choice(self.n_actions, p=probs)
        return choice, probs

    def update(self, grad_log_sums):
        """Take one gradient-ascent step: ``W += lr * grad_log_sums`` (in place)."""
        step = self.lr * grad_log_sums
        self.W += step

class LinearBaseline:
    """Linear state-value baseline ``b(s) = w . [s, 1]`` fitted to observed returns.

    The weights are trained with normalized least-mean-squares (NLMS) steps.
    Plain SGD on the squared error multiplies the prediction error on a
    sample by ``1 - lr * ||x||^2``. With raw, unscaled state features that
    factor exceeds 1 in magnitude as soon as ``lr * ||x||^2 > 2``, and the
    weights diverge to inf/NaN. Dividing the step by ``||x||^2`` makes the
    factor ``1 - lr`` regardless of feature scale.
    """

    def __init__(self, state_dim, lr=1e-2, seed=2):
        """
        Args:
            state_dim: Number of raw state features (a bias feature is added).
            lr: Relative step size; each update removes this fraction of the
                prediction error on the sample. Stable for ``0 < lr < 2``.
            seed: Seed for the small random weight initialisation.
        """
        self.state_dim = state_dim
        self.lr = lr
        self.rng = np.random.RandomState(seed)
        self.w = 0.01 * self.rng.randn(state_dim + 1)

    def featurize(self, s):
        """Append a constant 1.0 (bias feature) to the state vector."""
        return np.concatenate([s, [1.0]])

    def predict(self, s):
        """Return the baseline value for state ``s`` as a Python float."""
        return float(self.w.dot(self.featurize(s)))

    def update(self, states, returns):
        """Fit the baseline to ``returns`` with one NLMS step per sample.

        Args:
            states: Sequence of state vectors visited in the episode.
            returns: Discounted return observed from each of those states.
        """
        for s, G in zip(states, returns):
            x = self.featurize(s)
            err = G - self.w.dot(x)
            # The bias feature guarantees x.dot(x) >= 1, so the division is safe.
            self.w += self.lr * err * x / x.dot(x)

def discount_rewards(rewards, gamma):
    """Return the discounted return-to-go for every time step.

    Entry ``t`` of the result equals ``sum_k gamma**k * rewards[t + k]``.
    The result is a float32 array with the same length as ``rewards``.
    """
    out = np.zeros_like(rewards, dtype=np.float32)
    running = 0.0
    # Walk backwards so each return reuses the one that follows it.
    for idx in range(len(rewards) - 1, -1, -1):
        running = rewards[idx] + gamma * running
        out[idx] = running
    return out

def run_reinforce_with_baseline(seed=0, n_episodes=2000, gamma=0.99,
                                policy_lr=0.005, baseline_lr=0.05, log_every=200):
    """Train a softmax policy on ``ToyEnv`` with REINFORCE and a linear baseline.

    After each episode the policy takes one batched gradient step weighted by
    the advantage ``G_t - b(s_t)``. The baseline is then fitted to the same
    returns, so the advantages always use the pre-update baseline.

    Args:
        seed: Seed for the environment; the policy uses ``seed + 1`` and the
            baseline ``seed + 2``.
        n_episodes: Number of training episodes.
        gamma: Discount factor used for the returns.
        policy_lr: Learning rate of the policy.
        baseline_lr: Learning rate passed to ``LinearBaseline``.
        log_every: Print the running average reward every this many episodes
            (averaged over the same window). ``0`` or ``None`` disables logging.

    Returns:
        ``(rewards_history, policy, baseline)`` where ``rewards_history`` holds
        the undiscounted reward sum of every episode.
    """
    env = ToyEnv(seed=seed)
    policy = SoftmaxPolicy(state_dim=1, n_actions=2, lr=policy_lr, seed=seed + 1)
    baseline = LinearBaseline(state_dim=1, lr=baseline_lr, seed=seed + 2)
    rewards_history = []
    for ep in range(1, n_episodes + 1):
        s = env.reset()
        states, actions, rewards = [], [], []
        done = False
        while not done:
            a, _ = policy.sample(s)
            # Policy action 1 means "move right"; anything else is sent as -1 ("move left").
            ns, r, done, _ = env.step(1 if a == 1 else -1)
            # Store the state the action was taken in, not the successor.
            states.append(s.copy())
            actions.append(a)
            rewards.append(r)
            s = ns
        returns = discount_rewards(rewards, gamma)
        # Advantage = observed return minus the baseline's current prediction.
        adv = np.array(
            [G - baseline.predict(s0) for s0, G in zip(states, returns)],
            dtype=np.float32,
        )
        grad_sum = np.zeros_like(policy.W)
        for s0, a0, A in zip(states, actions, adv):
            x = policy._featurize(s0)
            probs = policy.action_probs(s0)
            # grad of log pi(a0 | s0) w.r.t. W: (onehot(a0) - probs) outer x.
            grad_log = -probs[:, None] * x[None, :]
            grad_log[a0] += x
            grad_sum += A * grad_log
        policy.update(grad_sum)
        baseline.update(states, returns)
        rewards_history.append(sum(rewards))
        if log_every and ep % log_every == 0:
            avg = np.mean(rewards_history[-log_every:])
            print(f"Episode {ep:4d}  Average reward (last {log_every}): {avg:.3f}")
    return rewards_history, policy, baseline

# Script entry point: train the baseline variant with a fixed seed and show
# the tail of the reward history.
if __name__ == "__main__":
    # run_reinforce_with_baseline returns the per-episode reward list, the
    # trained policy and the fitted baseline, in that order.
    rewards_b, policy_b, baseline = run_reinforce_with_baseline(seed=123)
    print("Done. Last 20 episode rewards:", rewards_b[-20:])


Episode  200  Average reward (last 200): 0.010
Episode  400  Average reward (last 200): 0.000
Episode  600  Average reward (last 200): 0.000
Episode  800  Average reward (last 200): 0.000
Episode 1000  Average reward (last 200): 0.000
Episode 1200  Average reward (last 200): 0.000
Episode 1400  Average reward (last 200): 0.000
Episode 1600  Average reward (last 200): 0.000
Episode 1800  Average reward (last 200): 0.000
Episode 2000  Average reward (last 200): 0.000
Done. Last 20 episode rewards: [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
