In [1]:
!pip install tensorboard
!pip install gymnasium torch numpy pandas matplotlib

import os, json, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import gymnasium as gym
from gymnasium import spaces

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("device =", device)

Collecting tensorboard
  Using cached tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting absl-py>=0.4 (from tensorboard)
  Using cached absl_py-2.3.1-py3-none-any.whl.metadata (3.3 kB)
Collecting grpcio>=1.48.2 (from tensorboard)
  Downloading grpcio-1.76.0-cp313-cp313-win_amd64.whl.metadata (3.8 kB)
Collecting markdown>=2.6.8 (from tensorboard)
  Using cached markdown-3.10-py3-none-any.whl.metadata (5.1 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard)
  Using cached tensorboard_data_server-0.7.2-py3-none-any.whl.metadata (1.1 kB)
Collecting werkzeug>=1.0.1 (from tensorboard)
  Downloading werkzeug-3.1.5-py3-none-any.whl.metadata (4.0 kB)
Using cached tensorboard-2.20.0-py3-none-any.whl (5.5 MB)
Using cached tensorboard_data_server-0.7.2-py3-none-any.whl (2.4 kB)
Using cached absl_py-2.3.1-py3-none-any.whl (135 kB)
Downloading grpcio-1.76.0-cp313-cp313-win_amd64.whl (4.7 MB)
   ---------------------------------------- 0.0/4.7 MB ? eta -:--:--
   ---


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\82108\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: C:\Users\82108\AppData\Local\Programs\Python\Python313\python.exe -m pip install --upgrade pip


device = cpu


In [2]:
from torch.utils.tensorboard import SummaryWriter
import numpy as np
from itertools import count

import gymnasium as gym

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Bernoulli

# Target the first CUDA device if one is available; fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


class PolicyNetwork(nn.Module):
    """Recurrent Bernoulli policy.

    Pipeline: Linear(2 -> 64) -> ReLU -> LSTM(64 -> 128, batch_first) -> ReLU
    -> Linear(128 -> 1) -> Sigmoid. The sigmoid output is used as the
    probability parameter of a Bernoulli distribution over the binary action.
    """

    def __init__(self):
        super().__init__()
        # Trainable layers, created in the order fc1, lstm, fc2.
        self.fc1 = nn.Linear(2, 64)
        self.lstm = nn.LSTM(64, 128, batch_first=True)
        self.fc2 = nn.Linear(128, 1)
        # Parameter-free activations.
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, hidden):
        """Return per-step action probabilities and the updated LSTM state.

        Args:
            x: input sequence, laid out as (B, T, 2) since the LSTM is batch_first.
            hidden: `(h, c)` tuple passed straight to the LSTM.

        Returns:
            `(prob, hidden)` where `prob` is the sigmoid output with a trailing
            dimension of 1 and `hidden` is the LSTM's new `(h, c)` state.
        """
        embedded = self.relu(self.fc1(x))
        lstm_out, next_hidden = self.lstm(embedded, hidden)
        logits = self.fc2(self.relu(lstm_out))
        return self.sigmoid(logits), next_hidden

    def select_action(self, state, hidden):
        """Sample one action for a single step without tracking gradients.

        Args:
            state: a single observation; `.item()` below requires the resulting
                probability tensor to hold exactly one element (e.g. shape (1, 1, 2) in).
            hidden: `(h, c)` LSTM state carried over from the previous step.

        Returns:
            `(action, hidden)`: the sampled action as a Python int and the
            LSTM state to feed into the next call.
        """
        with torch.no_grad():
            prob, next_hidden = self.forward(state, hidden)
            sampled = Bernoulli(prob).sample()
        return int(sampled.item()), next_hidden


class ValueNetwork(nn.Module):
    """Recurrent state-value estimator used as the critic/baseline.

    Pipeline: Linear(2 -> 64) -> ReLU -> LSTM(64 -> 256, batch_first) -> ReLU
    -> Linear(256 -> 1). The output is left unbounded (no final activation).
    """

    def __init__(self):
        super().__init__()
        # Trainable layers, created in the order fc1, lstm, fc2.
        self.fc1 = nn.Linear(2, 64)
        self.lstm = nn.LSTM(64, 256, batch_first=True)
        self.fc2 = nn.Linear(256, 1)
        self.relu = nn.ReLU()

    def forward(self, x, hidden):
        """Return per-step value estimates and the updated LSTM state.

        Args:
            x: input sequence whose last dimension is 2 (batch-first layout).
            hidden: `(h, c)` tuple passed straight to the LSTM.

        Returns:
            `(values, hidden)` where `values` has a trailing dimension of 1.
        """
        features = self.relu(self.fc1(x))
        recurrent_out, next_hidden = self.lstm(features, hidden)
        values = self.fc2(self.relu(recurrent_out))
        return values, next_hidden


def obs_to_partial(obs):
    """Reduce a CartPole observation to its two position-like components.

    The full observation is `[x, x_dot, theta, theta_dot]`; only `x` (index 0)
    and `theta` (index 2) are kept, dropping both velocity terms.

    Returns:
        A new float32 NumPy array `[x, theta]`.
    """
    cart_position, pole_angle = obs[0], obs[2]
    return np.array([cart_position, pole_angle], dtype=np.float32)


if __name__ == "__main__":
    # Train an LSTM actor with an LSTM critic baseline on CartPole-v1, feeding the
    # networks only the partial observation from obs_to_partial (x and theta).
    # Each "epoch" collects one episode, then takes one actor step and one critic step.
    env = gym.make("CartPole-v1")
    policy = PolicyNetwork().to(device)
    value = ValueNetwork().to(device)

    optim = torch.optim.Adam(policy.parameters(), lr=1e-4)
    value_optim = torch.optim.Adam(value.parameters(), lr=3e-4)

    gamma = 0.99
    writer = SummaryWriter("./lstm_logs")

    # count() never terminates, so this loop only ends when it is interrupted or
    # an error is raised. try/finally guarantees that the TensorBoard writer and
    # the environment are closed on that path too; the exception itself is not
    # swallowed and still propagates to the notebook.
    try:
        for epoch in count():
            obs, info = env.reset(seed=None)
            state = obs_to_partial(obs)
            episode_reward = 0.0

            # LSTM hidden init (single layer, batch of one, 128 units for the actor)
            a_hx = torch.zeros((1, 1, 128), device=device)
            a_cx = torch.zeros((1, 1, 128), device=device)

            rewards = []
            actions = []
            states = []

            for t in range(500):  # CartPole-v1 max is typically 500
                states.append(state.copy())

                state_t = torch.tensor(state, dtype=torch.float32, device=device).view(1, 1, 2)
                # The recurrent state is threaded through the rollout step by step.
                action, (a_hx, a_cx) = policy.select_action(state_t, (a_hx, a_cx))
                actions.append(action)

                next_obs, reward, terminated, truncated, _ = env.step(action)
                done = terminated or truncated

                next_state = obs_to_partial(next_obs)
                episode_reward += float(reward)

                rewards.append(float(reward))
                state = next_state

                if done:
                    break

            # Discounted returns, accumulated backwards from the last step.
            returns = np.zeros(len(rewards), dtype=np.float32)
            R = 0.0
            for i in reversed(range(len(rewards))):
                R = gamma * R + rewards[i]
                returns[i] = R

            # normalize returns (stability); guard against a zero std
            mean, std = returns.mean(), returns.std()
            std = std if std > 1e-8 else 1.0
            returns = (returns - mean) / std

            # tensors
            states_tensor = torch.tensor(np.array(states), dtype=torch.float32, device=device).unsqueeze(0)  # (1,T,2)
            actions_tensor = torch.tensor(np.array(actions), dtype=torch.float32, device=device).view(-1, 1)  # (T,1)
            returns_tensor = torch.tensor(returns, dtype=torch.float32, device=device).view(-1, 1)  # (T,1)

            # critic to get baseline (no gradient: the baseline must not train the critic here)
            with torch.no_grad():
                c_hx = torch.zeros((1, 1, 256), device=device)
                c_cx = torch.zeros((1, 1, 256), device=device)
                v, _ = value(states_tensor, (c_hx, c_cx))  # (1,T,1)
                v = v.squeeze(0)  # (T,1)
                advantage = returns_tensor - v  # (T,1)

            # actor update (re-run policy on full sequence from a fresh zero state,
            # matching the zero state the rollout started from)
            a_hx = torch.zeros((1, 1, 128), device=device)
            a_cx = torch.zeros((1, 1, 128), device=device)
            prob, _ = policy(states_tensor, (a_hx, a_cx))  # (1,T,1)
            prob = prob.squeeze(0)  # (T,1)

            b = Bernoulli(prob)
            log_prob = b.log_prob(actions_tensor)  # (T,1)

            actor_loss = -(log_prob * advantage.detach()).mean()

            optim.zero_grad()
            actor_loss.backward()
            torch.nn.utils.clip_grad_norm_(policy.parameters(), 1.0)
            optim.step()
            writer.add_scalar("loss/actor", actor_loss.item(), epoch)

            # critic update: regress the value estimates onto the normalized returns
            c_hx = torch.zeros((1, 1, 256), device=device)
            c_cx = torch.zeros((1, 1, 256), device=device)
            v, _ = value(states_tensor, (c_hx, c_cx))
            v = v.squeeze(0)
            value_loss = F.mse_loss(v, returns_tensor)

            value_optim.zero_grad()
            value_loss.backward()
            torch.nn.utils.clip_grad_norm_(value.parameters(), 1.0)
            value_optim.step()
            writer.add_scalar("loss/value", value_loss.item(), epoch)

            writer.add_scalar("episode_reward", episode_reward, epoch)

            # Progress line and policy checkpoint every 10 epochs.
            if epoch % 10 == 0:
                print(f"Epoch {epoch:05d} | ep_reward {episode_reward:.1f} | T={len(rewards)}")
                torch.save(policy.state_dict(), "lstm-policy.pt")
    finally:
        # Closing the writer flushes any buffered scalars to the event file.
        writer.close()
        env.close()


      

Epoch 00000 | ep_reward 22.0 | T=22
Epoch 00010 | ep_reward 16.0 | T=16
Epoch 00020 | ep_reward 50.0 | T=50
Epoch 00030 | ep_reward 12.0 | T=12
Epoch 00040 | ep_reward 20.0 | T=20
Epoch 00050 | ep_reward 18.0 | T=18
Epoch 00060 | ep_reward 39.0 | T=39
Epoch 00070 | ep_reward 20.0 | T=20
Epoch 00080 | ep_reward 13.0 | T=13
Epoch 00090 | ep_reward 25.0 | T=25
Epoch 00100 | ep_reward 12.0 | T=12
Epoch 00110 | ep_reward 32.0 | T=32
Epoch 00120 | ep_reward 15.0 | T=15
Epoch 00130 | ep_reward 13.0 | T=13
Epoch 00140 | ep_reward 49.0 | T=49
Epoch 00150 | ep_reward 15.0 | T=15
Epoch 00160 | ep_reward 25.0 | T=25
Epoch 00170 | ep_reward 28.0 | T=28
Epoch 00180 | ep_reward 23.0 | T=23
Epoch 00190 | ep_reward 14.0 | T=14
Epoch 00200 | ep_reward 38.0 | T=38
Epoch 00210 | ep_reward 24.0 | T=24
Epoch 00220 | ep_reward 19.0 | T=19
Epoch 00230 | ep_reward 21.0 | T=21
Epoch 00240 | ep_reward 34.0 | T=34
Epoch 00250 | ep_reward 29.0 | T=29
Epoch 00260 | ep_reward 21.0 | T=21
Epoch 00270 | ep_reward 15.0

KeyboardInterrupt: 

In [None]:
# =========================
# Visualization (paper-ready)
# =========================
# This cell reads TensorBoard event files written by SummaryWriter("./lstm_logs")
# and produces publication-friendly learning curves:
# - mean ± std over seeds (if multiple runs exist)
# - moving-average smoothing
# - ablation comparison across multiple experiment folders
#
# Usage:
# 1) Run training one or more times (ideally with different seeds & different log dirs).
# 2) Set LOGROOTS below to point at your log folders.
# 3) Run this cell.

import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorboard.backend.event_processing.event_accumulator import EventAccumulator


def _find_event_files(logdir: str):
    """Return a list of TensorBoard event files under logdir (recursive)."""
    patterns = [
        os.path.join(logdir, "**", "events.out.tfevents.*"),
        os.path.join(logdir, "events.out.tfevents.*"),
    ]
    files = []
    for p in patterns:
        files.extend(glob.glob(p, recursive=True))
    return sorted(list(set(files)))


def _load_scalars_from_event(event_file: str, tags=None):
    """Read scalar series from a single event file into a long-format DataFrame.

    Args:
        event_file: path handed to `EventAccumulator`.
        tags: optional iterable of scalar tag names. When `None`, every scalar
            tag found in the file is loaded (in sorted order); otherwise only
            the requested tags that actually exist in the file are loaded, in
            the order given.

    Returns:
        A DataFrame with one row per logged point and the fields `tag`, `step`
        (int) and `value` (float). The frame is empty when no tag matched.
    """
    # size_guidance for scalars is set to 0 (presumably "keep every point" —
    # see the EventAccumulator docs).
    accumulator = EventAccumulator(event_file, size_guidance={"scalars": 0})
    accumulator.Reload()

    logged = set(accumulator.Tags().get("scalars", []))
    if tags is None:
        wanted = sorted(logged)
    else:
        wanted = [name for name in tags if name in logged]

    records = [
        {"tag": name, "step": int(event.step), "value": float(event.value)}
        for name in wanted
        for event in accumulator.Scalars(name)
    ]
    return pd.DataFrame(records)


def load_tb_runs(logroot: str, tags=None):
    """
    Load the scalars of every event file found under `logroot`.

    Run labels are derived from the event file's directory relative to
    `logroot`. A file sitting directly in `logroot` gets the label
    `run{i:02d}` (i = its position in the sorted file list), so each such file
    is its own run; files inside a subdirectory are labeled with that
    subdirectory's relative path, so several files in one subdirectory share a
    label. Event files that yield no matching scalars are skipped.

    Returns a tidy DataFrame with columns
    [tag, step, value, group, run, event_file], where `group` is the basename
    of `logroot`.

    Raises:
        FileNotFoundError: no event file exists under `logroot`.
        RuntimeError: event files exist but none produced any scalar rows.
    """
    event_files = _find_event_files(logroot)
    if not event_files:
        raise FileNotFoundError(
            f"No TensorBoard event files found under: {logroot}\n"
            f"Expected something like: {logroot}/events.out.tfevents.*"
        )

    frames = []
    for index, path in enumerate(event_files):
        rel_dir = os.path.relpath(os.path.dirname(path), logroot)
        # "." means the file lives directly in logroot; fall back to an indexed label.
        run_label = f"run{index:02d}" if rel_dir == "." else rel_dir

        scalars = _load_scalars_from_event(path, tags=tags)
        if len(scalars) == 0:
            continue

        scalars["group"] = os.path.basename(os.path.normpath(logroot))
        scalars["run"] = run_label
        scalars["event_file"] = os.path.basename(path)
        frames.append(scalars)

    if not frames:
        raise RuntimeError(f"Found event files under {logroot}, but no scalar tags matched.")
    return pd.concat(frames, ignore_index=True)


def moving_average(x, window=20):
    """Smooth `x` with an unweighted moving average of length `window`.

    - `window` of `None` or <= 1: `x` is returned untouched (same object).
    - fewer than `window` samples: `x` is returned as a float array, unsmoothed.
    - otherwise: a float array of length `len(x) - window + 1` ("valid" mode),
      i.e. only positions where the full window fits.
    """
    if window is None or window <= 1:
        return x

    values = np.asarray(x, dtype=float)
    if len(values) >= window:
        kernel = np.full(window, 1.0 / float(window), dtype=float)
        return np.convolve(values, kernel, mode="valid")
    return values


def summarize_runs(df: pd.DataFrame, tag: str, window=20):
    """
    Select the rows of `df` that belong to `tag`.

    NOTE: despite its name, this function does not aggregate anything. It only
    filters by tag; alignment on step, per-run smoothing and the mean/std
    across runs are carried out by `plot_tag_across_groups`.

    Args:
        df: tidy scalar frame with at least a `tag` column.
        tag: scalar tag to keep.
        window: accepted for signature compatibility; currently unused.

    Returns:
        A copy of the matching rows (original index preserved), or `None` when
        no row carries `tag`.
    """
    selected = df[df["tag"] == tag].copy()
    if selected.empty:
        return None
    return selected


def plot_tag_across_groups(df_all: pd.DataFrame, tag: str, window=20, xlabel="Epoch", ylabel=None, title=None):
    """
    Plot mean ± std across runs for each group (experiment folder) for the given tag.

    For every group, each run's series is optionally smoothed with
    `moving_average`, the runs are aligned on `step` (outer join), and the
    per-step mean is drawn with a shaded ±1 std band.

    Args:
        df_all: tidy frame with columns `group`, `run`, `tag`, `step`, `value`.
        tag: scalar tag to plot.
        window: moving-average length; smoothing is skipped for a run when
            `window` is falsy, <= 1, or longer than that run.
        xlabel: x-axis label.
        ylabel: y-axis label; defaults to `tag`.
        title: optional figure title.

    Returns:
        None. The figure is shown via `plt.show()`. If no group contains
        `tag`, a warning is printed and no figure is left open.
    """
    if ylabel is None:
        ylabel = tag

    fig = plt.figure(figsize=(7, 4.2))
    any_plotted = False

    for group, dg in df_all.groupby("group"):
        d = dg[dg["tag"] == tag]
        if d.empty:
            continue

        # Per-run smoothing, then align on step
        series = {}
        for run, dr in d.groupby("run"):
            # A run label can collect rows from several event files, so the same
            # step may occur more than once. Duplicate index labels would make
            # the column-wise concat below fail, so keep one row per step: the
            # stable sort preserves row order within a step, and keep="last"
            # retains the row that appears last in the frame.
            dr = dr.sort_values("step", kind="stable").drop_duplicates(subset="step", keep="last")
            y = dr["value"].to_numpy()
            x = dr["step"].to_numpy()

            if window and window > 1 and len(y) >= window:
                y_s = moving_average(y, window=window)
                x_s = x[window - 1:]  # align with 'valid' conv
            else:
                y_s = y
                x_s = x

            series[run] = pd.Series(y_s, index=x_s)

        # index=step, columns=run. The outer-joined index is not guaranteed to be
        # ordered when runs cover different steps, so sort it before plotting to
        # keep the curve monotone in x.
        mat = pd.concat(series, axis=1).sort_index()
        mean = mat.mean(axis=1, skipna=True)
        std = mat.std(axis=1, skipna=True)

        steps = mean.index.to_numpy()
        plt.plot(steps, mean.to_numpy(), label=f"{group} (n={mat.shape[1]})")
        plt.fill_between(steps,
                         (mean - std).to_numpy(),
                         (mean + std).to_numpy(),
                         alpha=0.2)
        any_plotted = True

    if not any_plotted:
        # Discard the empty figure created above so it is neither rendered as a
        # blank plot nor kept alive by pyplot.
        plt.close(fig)
        print(f"[warn] Tag not found in any group: {tag}")
        return

    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    if title:
        plt.title(title)
    plt.grid(True, alpha=0.3)
    plt.legend()
    plt.tight_layout()
    plt.show()


# -------------------------
# Configure your log folders
# -------------------------
# Put each experiment (baseline / ablation variants) into a separate folder.
# Example layout:
#   ./logs_baseline/seed0/events.out.tfevents...
#   ./logs_baseline/seed1/events.out.tfevents...
#   ./logs_modelA/seed0/events.out.tfevents...
#   ./logs_modelA/seed1/events.out.tfevents...
#
# If you only have one folder (the current code writes to ./lstm_logs),
# just keep baseline only.

LOGROOTS = {
    "baseline": "./lstm_logs",
    # "modelA": "./logs_modelA",
    # "modelB": "./logs_modelB",
    # "modelC": "./logs_modelC",
}

# Tags we care about (only those present will be loaded).
# "loss/actor" is the tag the training cell actually writes for the policy
# loss; "loss/policy" is kept so logs that use that name are still picked up.
TAGS = [
    "episode_reward",
    "loss/actor",
    "loss/policy",
    "loss/value",
    "loss/total",
    "success",
    "succ",
    "episode_length",
]

# -------------------------
# Load & plot
# -------------------------
dfs = []
for name, root in LOGROOTS.items():
    if not os.path.exists(root):
        print(f"[skip] missing log dir: {root}")
        continue
    d = load_tb_runs(root, tags=TAGS)
    # Override group label with friendly name (instead of folder basename)
    d["group"] = name
    dfs.append(d)

if len(dfs) == 0:
    raise RuntimeError("No logs loaded. Check LOGROOTS paths above.")

df_all = pd.concat(dfs, ignore_index=True)

# Computed once and reused by every "is this tag present?" check below.
available_tags = set(df_all["tag"].unique())

print("Loaded groups:", df_all["group"].unique().tolist())
print("Available tags:", sorted(available_tags))

# Paper-friendly smoothing window (tune as needed)
SMOOTH = 20

# 1) Learning curve (Reward)
plot_tag_across_groups(
    df_all, "episode_reward",
    window=SMOOTH,
    xlabel="Epoch",
    ylabel="Episode return",
    title="Learning curve (mean ± std across seeds)"
)

# 2) Success rate, if logged
# (Your current training code does not log success. If you add writer.add_scalar('success', ...),
# this plot will automatically appear.)
for succ_tag in ["success", "succ"]:
    if succ_tag in available_tags:
        plot_tag_across_groups(
            df_all, succ_tag,
            window=SMOOTH,
            xlabel="Epoch",
            ylabel="Success rate",
            title="Success rate (mean ± std across seeds)"
        )

# 3) Loss curves (one figure per loss tag that is present in the logs)
for loss_tag in ["loss/actor", "loss/policy", "loss/value", "loss/total"]:
    if loss_tag in available_tags:
        plot_tag_across_groups(
            df_all, loss_tag,
            window=SMOOTH,
            xlabel="Epoch",
            ylabel=loss_tag,
            title=f"{loss_tag} (mean ± std across seeds)"
        )

# -------------------------
# Optional: Save figures
# -------------------------
# Uncomment to export figures as .png (good for paper drafts).
# os.makedirs("figs", exist_ok=True)
# for tag, fname, ylabel in [
#     ("episode_reward", "learning_curve_reward.png", "Episode return"),
#     ("loss/policy", "loss_policy.png", "loss/policy"),
#     ("loss/value", "loss_value.png", "loss/value"),
# ]:
#     if tag not in set(df_all["tag"].unique()):
#         continue
#     plt.figure(figsize=(7,4.2))
#     for group, dg in df_all.groupby("group"):
#         d = dg[dg["tag"] == tag].copy()
#         if d.empty:
#             continue
#         series = {}
#         for run, dr in d.groupby("run"):
#             dr = dr.sort_values("step")
#             y = dr["value"].to_numpy()
#             x = dr["step"].to_numpy()
#             if SMOOTH and SMOOTH > 1 and len(y) >= SMOOTH:
#                 y_s = moving_average(y, window=SMOOTH)
#                 x_s = x[SMOOTH-1:]
#             else:
#                 y_s, x_s = y, x
#             series[run] = pd.Series(y_s, index=x_s)
#         mat = pd.concat(series, axis=1)
#         mean = mat.mean(axis=1, skipna=True)
#         std = mat.std(axis=1, skipna=True)
#         plt.plot(mean.index.to_numpy(), mean.to_numpy(), label=f"{group} (n={mat.shape[1]})")
#         plt.fill_between(mean.index.to_numpy(),
#                          (mean-std).to_numpy(),
#                          (mean+std).to_numpy(),
#                          alpha=0.2)
#     plt.xlabel("Epoch"); plt.ylabel(ylabel); plt.grid(True, alpha=0.3); plt.legend(); plt.tight_layout()
#     plt.savefig(os.path.join("figs", fname), dpi=300)
#     plt.close()
# print("Saved figures to ./figs")
