# Tutorial 6: RL AGENT — Train with Tensor Experience Replay

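This notebook trains a simple DQN-style agent on a toy grid environment, logs metrics, and plots learning curves. It checks whether a Tensorus server is reachable and reports demo mode otherwise; in both cases experiences are kept in a local in-memory replay buffer, which can be integrated with Tensorus if desired.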

In [None]:
# Lightweight install cell: for each required package that cannot be found in
# the kernel's environment, install it with pip via the kernel's own interpreter
# (sys.executable), then print a confirmation.
import importlib.util
import sys, subprocess, pkgutil
for p in ['numpy','torch','matplotlib','seaborn','requests']:
    # importlib.util.find_spec replaces pkgutil.find_loader, which is deprecated
    # since Python 3.12 and scheduled for removal in 3.14.
    if importlib.util.find_spec(p) is None:
        subprocess.check_call([sys.executable,'-m','pip','install',p])
print('✅ Dependencies ready')

In [None]:
# Setup
import math, time, random, json, requests, numpy as np, torch
import torch.nn as nn, torch.optim as optim, torch.nn.functional as F
from collections import deque, namedtuple
import matplotlib.pyplot as plt, seaborn as sns
# Plot styling and the base URL of the Tensorus server probed below.
sns.set_theme(style='whitegrid')
API='http://127.0.0.1:7860'

# Seed every RNG the notebook draws from (epsilon-greedy uses `random`, replay
# sampling uses `np.random`, network init/dropout use torch) so that
# Restart & Run All reproduces the same learning curves.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)


def server_ok():
    """Return True if GET {API}/health answers with HTTP 200 within 2 seconds.

    Any requests-level failure (connection refused, timeout, invalid response)
    is treated as "server not available" and yields False.
    """
    try:
        return requests.get(f'{API}/health', timeout=2).status_code == 200
    except requests.RequestException:
        # Only network/HTTP errors mean "no server"; a bare except here would
        # also swallow KeyboardInterrupt and genuine programming errors.
        return False


SERVER = server_ok()
print('📡 Tensorus:', '✅ Connected' if SERVER else '⚠️ Demo Mode')

# One transition as stored in the replay buffer.
Experience = namedtuple('Experience', 'state action reward next_state done')


## Step 1 — Environment (toy grid)

In [None]:
class GridEnv:
    """Square grid world with the agent starting at [0, 0] and the goal at [size-1, size-1].

    Observations are the agent's two coordinates as a float32 array.
    Actions: 0 increments coordinate 1, 1 decrements coordinate 1,
    2 decrements coordinate 0, 3 increments coordinate 0; moves are clamped to
    the grid. Any other action value leaves the agent where it is.
    """

    def __init__(self, size=10):
        self.size = size
        self.reset()

    def _obs(self):
        """Snapshot of the current agent position as a float32 array."""
        return np.array(self.agent, dtype=np.float32)

    def reset(self):
        """Put the agent back at [0, 0], (re)set the goal corner, and return the observation."""
        self.agent = [0, 0]
        self.goal = [self.size - 1, self.size - 1]
        return self._obs()

    def step(self, a):
        """Apply action `a` and return (observation, reward, done, info).

        Reaching the goal yields reward 100.0 and done=True. Every other step
        yields -0.1 minus 0.01 times the Manhattan distance to the goal, with
        done=False. `info` is always an empty dict.
        """
        pos = self.agent  # same list object: the updates below mutate self.agent in place
        if a == 0:
            pos[1] = min(self.size - 1, pos[1] + 1)
        elif a == 1:
            pos[1] = max(0, pos[1] - 1)
        elif a == 2:
            pos[0] = max(0, pos[0] - 1)
        elif a == 3:
            pos[0] = min(self.size - 1, pos[0] + 1)

        if pos == self.goal:
            return self._obs(), 100.0, True, {}

        dist = abs(pos[0] - self.goal[0]) + abs(pos[1] - self.goal[1])
        return self._obs(), -0.1 - 0.01*dist, False, {}


## Step 2 — Replay Buffer with Tensorus fallback

In [None]:
class Replay:
    """Bounded in-memory experience buffer with per-item sampling priorities.

    Experiences and their priorities live in two parallel deques sharing the
    same `maxlen`, so the oldest entries are dropped together once `cap` is
    reached.
    """

    def __init__(self, cap=50000):
        self.cap = cap
        self.buf = deque(maxlen=cap)
        self.pri = deque(maxlen=cap)

    def store(self, exp:Experience, p:float=1.0):
        """Append `exp` with priority `p`, floored at 1e-6."""
        self.buf.append(exp)
        self.pri.append(max(1e-6, p))

    def sample(self, n=64):
        """Return `n` experiences drawn in proportion to their priorities.

        While fewer than `n` experiences are stored, the whole buffer is
        returned (in insertion order) instead. Otherwise indices are drawn with
        `np.random.choice`, so the same experience may appear more than once.
        """
        size = len(self.buf)
        if size < n:
            return list(self.buf)
        weights = np.array(self.pri)
        probs = weights / sum(self.pri)
        picked = np.random.choice(size, n, p=probs)
        return [self.buf[i] for i in picked]

    def __len__(self):
        return len(self.buf)

# Shared experience buffer used by the training loop; holds up to 100000 transitions.
replay = Replay(100000)


## Step 3 — DQN Network and Agent

In [None]:
class QNet(nn.Module):
    """Fully connected Q-network: `sdim` inputs -> hidden layers `h` -> `adim` outputs.

    Each hidden layer is Linear -> ReLU -> Dropout(0.1); the output layer is a
    plain Linear. Linear weights are Xavier-uniform initialised and biases are
    set to 0.01.
    """

    def __init__(self, sdim=2, adim=4, h=(256,256)):
        super().__init__()
        stack = []
        width = sdim
        for units in h:
            stack.extend((nn.Linear(width, units), nn.ReLU(), nn.Dropout(0.1)))
            width = units
        stack.append(nn.Linear(width, adim))
        self.net = nn.Sequential(*stack)
        # Visit every submodule and re-initialise the Linear ones.
        self.apply(self._init)

    def _init(self, m):
        """Initialise `m` if it is a Linear layer; leave other modules untouched."""
        if not isinstance(m, nn.Linear):
            return
        nn.init.xavier_uniform_(m.weight)
        nn.init.constant_(m.bias, 0.01)

    def forward(self, x):
        """Run `x` through the layer stack and return the network output."""
        return self.net(x)

class Agent:
    """DQN-style agent: online Q-network, periodically synced target network,
    epsilon-greedy action selection.

    Attributes set in __init__:
        q, tgt      online and target networks (tgt starts as a copy of q)
        opt         Adam optimiser over q's parameters (lr=1e-3)
        gamma       discount factor (0.99)
        eps         current exploration rate (starts at 0.2)
        eps_min     lower bound for eps (0.01)
        eps_decay   multiplicative eps decay applied after each learn() call (0.995)
        adim        number of discrete actions
        step        number of completed learn() updates
    """

    def __init__(self, sdim=2, adim=4):
        self.q = QNet(sdim, adim)
        self.tgt = QNet(sdim, adim)
        self.tgt.load_state_dict(self.q.state_dict())
        # The target network is only used for inference. Eval mode keeps its
        # dropout layers from injecting noise into the bootstrap targets
        # (load_state_dict does not change the train/eval flag).
        self.tgt.eval()
        self.opt = optim.Adam(self.q.parameters(), lr=1e-3)
        self.gamma = 0.99
        self.eps = 0.2
        self.eps_min = 0.01
        self.eps_decay = 0.995
        self.adim = adim
        self.step = 0

    def act(self, s, train=True):
        """Pick an action for state `s`.

        With `train=True` a uniformly random action is returned with
        probability `eps`; otherwise the argmax of the online network's
        Q-values is returned as an int.
        """
        if train and random.random() < self.eps:
            return random.randrange(self.adim)
        # Evaluate the greedy action with dropout disabled so that the same
        # state always maps to the same action; restore the previous mode
        # afterwards so learn() still trains with dropout.
        was_training = self.q.training
        self.q.eval()
        try:
            with torch.no_grad():
                q = self.q(torch.as_tensor(s, dtype=torch.float32).unsqueeze(0))
        finally:
            self.q.train(was_training)
        return int(q.argmax().item())

    def learn(self, batch):
        """Run one gradient step on `batch` and return {'loss', 'q_mean'} as floats.

        `batch` is a sequence of items exposing state, action, reward,
        next_state and done. An empty batch is a no-op that returns zeros and
        does not advance `step` or decay `eps`.
        """
        if not batch:
            return {'loss': 0.0, 'q_mean': 0.0}
        # Stack states into one ndarray first: building a tensor straight from
        # a list of ndarrays is slow and triggers a PyTorch warning.
        S = torch.as_tensor(np.array([e.state for e in batch]), dtype=torch.float32)
        A = torch.as_tensor([e.action for e in batch], dtype=torch.long)
        R = torch.as_tensor([e.reward for e in batch], dtype=torch.float32)
        NS = torch.as_tensor(np.array([e.next_state for e in batch]), dtype=torch.float32)
        D = torch.as_tensor([e.done for e in batch], dtype=torch.bool)
        # Q-value of the action that was actually taken in each transition.
        q = self.q(S).gather(1, A.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            # One-step TD target; (~D) zeroes the bootstrap term on terminal transitions.
            tgt = R + self.gamma * self.tgt(NS).max(1)[0] * (~D)
        loss = F.mse_loss(q, tgt)
        self.opt.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.q.parameters(), 1.0)
        self.opt.step()
        # Copy the online weights into the target network every 100 updates.
        if self.step % 100 == 0:
            self.tgt.load_state_dict(self.q.state_dict())
        self.eps = max(self.eps_min, self.eps * self.eps_decay)
        self.step += 1
        return {'loss': float(loss.item()), 'q_mean': float(q.mean().item())}

# Single agent instance with the default 2-dimensional state / 4-action configuration.
agent = Agent()


## Step 4 — Training Loop (100–200 episodes)

In [None]:
# Train for a fixed number of episodes on a 10x10 grid. Every environment step
# is stored in the replay buffer and immediately followed by one learning
# update; per-episode total reward and mean loss are recorded for plotting.
env = GridEnv(10)
episodes = 120
max_steps = 300
reward_hist = []
loss_hist = []

for ep in range(episodes):
    s = env.reset()
    tot = 0
    losses = []
    for t in range(max_steps):
        a = agent.act(s, train=True)
        ns, r, done, _ = env.step(a)
        exp = Experience(s, a, r, ns, done)
        # store (local; can be integrated with Tensorus if desired)
        replay.store(exp, p=1.0)
        s = ns
        tot += r
        # learn
        batch = replay.sample(64)
        metrics = agent.learn(batch)
        if metrics['loss'] > 0:
            losses.append(metrics['loss'])
        if done:
            break

    reward_hist.append(tot)
    if losses:
        loss_hist.append(np.mean(losses))
    else:
        loss_hist.append(0.0)

    if (ep + 1) % 10 == 0:
        print(f'Episode {ep+1:3d}  reward={tot:.2f}  eps={agent.eps:.3f}')

# Plots: reward curve on the left, loss curve on the right.
import matplotlib.pyplot as plt
fig, (ax_reward, ax_loss) = plt.subplots(1, 2, figsize=(10,4))
ax_reward.plot(reward_hist)
ax_reward.set_title('Reward per Episode')
ax_loss.plot(loss_hist)
ax_loss.set_title('Loss per Episode')
fig.tight_layout()
plt.show()


## Step 5 — Evaluation Episode

In [None]:
# Roll out one episode without exploration (train=False), capped at 200 steps,
# recording every visited position for the trajectory plot.
s = env.reset()
tot = 0
traj = [tuple(s)]
for t in range(200):
    a = agent.act(s, train=False)
    s, r, d, _ = env.step(a)
    tot += r
    traj.append(tuple(s))
    if d:
        break
print('Eval total reward:', round(tot,2), '| steps:', len(traj)-1)

# quick trajectory plot
xs = [point[0] for point in traj]
ys = [point[1] for point in traj]
plt.figure()
plt.plot(xs, ys, '-o')
plt.title('Evaluation Trajectory')
plt.gca().invert_yaxis()
plt.show()