# Tutorial 6 (Enhanced): RL AGENT — DQN with Prioritized Replay

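We build a toy grid-world environment and train a DQN agent with a simplified ("simulated") form of Prioritized Experience Replay: each transition gets a reward-based priority when it is stored, and batches are sampled in proportion to those priorities. We then plot per-episode reward and loss and run a greedy evaluation episode.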

In [None]:
# Install any dependency that is missing from the kernel's own interpreter.
import importlib.util
import sys, subprocess, pkgutil
for p in ['numpy','torch','matplotlib','seaborn','requests']:
    # importlib.util.find_spec replaces pkgutil.find_loader, which is deprecated
    # since Python 3.12 and removed in 3.14. It returns None when the top-level
    # package cannot be found.
    if importlib.util.find_spec(p) is None:
        # sys.executable targets the interpreter running this kernel, not
        # whichever `pip` happens to be first on PATH.
        subprocess.check_call([sys.executable,'-m','pip','install',p])
print('✅ Dependencies ready')

In [None]:
# Setup
import math, time, random, numpy as np, torch, requests
import torch.nn as nn, torch.optim as optim, torch.nn.functional as F
from collections import deque, namedtuple
import matplotlib.pyplot as plt, seaborn as sns
# Plot styling shared by every figure in the notebook.
sns.set_theme(style='whitegrid')
# Seed every RNG the notebook draws from so Restart & Run All is repeatable:
# `random` drives epsilon-greedy exploration, NumPy drives replay sampling and
# torch drives network weight initialisation.
SEED=42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
# Base URL of the local server probed by server_ok().
API='http://127.0.0.1:7860'
def server_ok():
    """Return True when GET {API}/health answers with HTTP 200.

    The request uses a 2-second timeout. Any requests-level failure
    (connection refused, timeout, malformed URL, ...) is reported as False
    instead of raising, so the caller can fall back to demo mode.
    """
    try:
        return requests.get(f'{API}/health', timeout=2).status_code==200
    # Catch only requests' own error hierarchy: a bare `except:` would also
    # swallow KeyboardInterrupt and hide genuine bugs such as a NameError.
    except requests.RequestException:
        return False
# Probe the server once and report which mode the notebook runs in.
SERVER = server_ok()
if SERVER:
    print('📡 Tensorus:', '✅ Connected')
else:
    print('📡 Tensorus:', '⚠️ Demo Mode')

# A single transition as stored in the replay buffer.
Experience = namedtuple('Experience','state action reward next_state done')

## Environment

In [None]:
class GridEnv:
    """Square grid world; the agent starts at [0, 0] and the goal is the opposite corner.

    Actions:
        0 -> +1 on the second coordinate
        1 -> -1 on the second coordinate
        2 -> -1 on the first coordinate
        3 -> +1 on the first coordinate
    Moves are clamped at the grid border; any other action value leaves the
    agent where it is.
    """

    def __init__(self, size=10):
        self.size = size
        self.reset()

    def _obs(self):
        """Return the agent position as a fresh float32 array."""
        return np.array(self.agent, dtype=np.float32)

    def reset(self):
        """Place the agent at [0, 0] and the goal at the far corner; return the start observation."""
        far = self.size - 1
        self.agent = [0, 0]
        self.goal = [far, far]
        return self._obs()

    def step(self, a):
        """Apply action `a` and return (observation, reward, done, info).

        Reaching the goal gives reward 100.0 with done=True. Every other step
        costs 0.1 plus 0.01 per cell of Manhattan distance still separating
        the agent from the goal. `info` is always an empty dict.
        """
        upper = self.size - 1
        if a == 0:
            self.agent[1] = min(upper, self.agent[1] + 1)
        elif a == 1:
            self.agent[1] = max(0, self.agent[1] - 1)
        elif a == 2:
            self.agent[0] = max(0, self.agent[0] - 1)
        elif a == 3:
            self.agent[0] = min(upper, self.agent[0] + 1)

        if self.agent == self.goal:
            return self._obs(), 100.0, True, {}

        gap_first = abs(self.agent[0] - self.goal[0])
        gap_second = abs(self.agent[1] - self.goal[1])
        manhattan = gap_first + gap_second
        return self._obs(), -0.1-0.01*manhattan, False, {}


## Prioritized Replay (simulated) and DQN

In [None]:
class Replay:
    """Fixed-capacity replay buffer with priority-proportional sampling.

    Priorities are assigned once, at store time, and are never updated
    afterwards. Once `cap` items are held, the oldest transition and its
    priority are dropped together.
    """

    def __init__(self, cap=50000):
        self.cap = cap
        # Two deques kept in lockstep: buf[i] is sampled with weight pri[i].
        # Sharing the same maxlen makes both evict their oldest entry together.
        self.buf = deque(maxlen=cap)
        self.pri = deque(maxlen=cap)

    # The annotation is a string so defining the class does not require
    # `Experience` to exist yet (no cell-order dependency).
    def store(self, e: "Experience", p=1.0):
        """Append transition `e` with priority `p`.

        `p` is floored at 1e-6 so every stored item keeps a strictly positive
        sampling weight.
        """
        self.buf.append(e)
        self.pri.append(max(1e-6, p))

    def sample(self, n=64):
        """Return up to `n` transitions drawn with replacement, weighted by priority.

        `n` is capped at the current buffer length. An empty buffer yields an
        empty list; without this guard the normalised probabilities would sum
        to 0 and `np.random.choice` would raise ValueError.
        """
        if not self.buf:
            return []
        n = min(n, len(self.buf))
        weights = np.array(self.pri, dtype=np.float64)
        probs = weights / weights.sum()
        idx = np.random.choice(len(self.buf), n, p=probs)
        return [self.buf[i] for i in idx]
class QNet(nn.Module):
    """Three-layer MLP mapping a state vector to one Q-value per action.

    Args:
        sdim: size of the input state vector.
        adim: number of actions (size of the output).
        hidden: width of the two hidden layers.
    """

    def __init__(self, sdim=2, adim=4, hidden=128):
        super().__init__()
        # The first layer takes `sdim` inputs; it was previously hard-coded to
        # 2, which silently ignored the constructor argument.
        self.net = nn.Sequential(
            nn.Linear(sdim, hidden), nn.ReLU(),
            nn.Linear(hidden, hidden), nn.ReLU(),
            nn.Linear(hidden, adim),
        )

    def forward(self, x):
        """Return Q-values with last dimension `adim` for input with last dimension `sdim`."""
        return self.net(x)
class Agent:
    """Epsilon-greedy DQN agent with a periodically synchronised target network.

    Args:
        n_actions: number of discrete actions (network output size and the
            range used for random exploration).
        lr: Adam learning rate.
        gamma: discount factor applied to the bootstrapped next-state value.
        eps: initial exploration probability.
        eps_min: floor below which `eps` is never decayed.
        eps_decay: multiplicative decay applied to `eps` after each learn step.
        target_sync: copy the online weights into the target network every
            this many learn steps.

    The defaults reproduce the values that were previously hard-coded.
    """

    def __init__(self, n_actions=4, lr=1e-3, gamma=0.99, eps=0.2, eps_min=0.01,
                 eps_decay=0.995, target_sync=100):
        self.n_actions = n_actions
        self.q = QNet(adim=n_actions)
        self.tgt = QNet(adim=n_actions)
        self.tgt.load_state_dict(self.q.state_dict())
        self.opt = optim.Adam(self.q.parameters(), lr=lr)
        self.gamma = gamma
        self.eps = eps
        self.eps_min = eps_min
        self.eps_decay = eps_decay
        self.target_sync = target_sync
        self.step = 0  # number of completed learn() updates

    def act(self, s, train=True):
        """Pick an action for state `s`.

        With `train=True` a uniformly random action is returned with
        probability `eps`; otherwise the argmax of the online network is used.
        """
        if train and random.random() < self.eps:
            return random.randrange(self.n_actions)
        with torch.no_grad():
            state = torch.as_tensor(s, dtype=torch.float32).unsqueeze(0)
            return int(self.q(state).argmax().item())

    def learn(self, batch):
        """Run one gradient step on `batch` and return {'loss': float}.

        `batch` is a sequence of items exposing `state`, `action`, `reward`,
        `next_state` and `done`. An empty batch is a no-op reported as loss 0.0
        (no step is counted and `eps` is not decayed).
        """
        if not batch:
            return {'loss': 0.0}
        # Stack the states into one ndarray before handing them to torch:
        # building a tensor straight from a list of ndarrays is a slow path.
        S = torch.as_tensor(np.asarray([e.state for e in batch], dtype=np.float32))
        A = torch.as_tensor([e.action for e in batch], dtype=torch.long)
        R = torch.as_tensor([e.reward for e in batch], dtype=torch.float32)
        NS = torch.as_tensor(np.asarray([e.next_state for e in batch], dtype=np.float32))
        D = torch.as_tensor([e.done for e in batch], dtype=torch.bool)

        # Q-value of the action that was actually taken in each transition.
        q = self.q(S).gather(1, A.unsqueeze(1)).squeeze(1)
        with torch.no_grad():
            # (~D) zeroes the bootstrap term for terminal transitions.
            tgt = R + self.gamma * self.tgt(NS).max(1)[0] * (~D)

        loss = F.mse_loss(q, tgt)
        self.opt.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(self.q.parameters(), 1.0)
        self.opt.step()

        # The check runs before the counter is incremented, so the very first
        # update (step 0) also triggers a sync.
        if self.step % self.target_sync == 0:
            self.tgt.load_state_dict(self.q.state_dict())
        self.eps = max(self.eps_min, self.eps * self.eps_decay)
        self.step += 1
        return {'loss': float(loss.item())}


## Training and Evaluation

In [None]:
# Run settings, gathered in one place so they are easy to tune.
GRID_SIZE=10; EPISODES=120; MAX_STEPS=300; BATCH_SIZE=64; EVAL_STEPS=200
env=GridEnv(GRID_SIZE); agent=Agent(); replay=Replay(100000)
rewards=[]; losses=[]
for ep in range(EPISODES):
    s=env.reset(); tot=0; ltmp=[]
    for t in range(MAX_STEPS):
        a=agent.act(s,train=True)
        ns,r,d,_=env.step(a)
        # Reward-based priority: non-positive rewards get weight 1.0, positive
        # rewards are sampled proportionally more often.
        replay.store(Experience(s,a,r,ns,d), p=1.0+max(0,r))
        s=ns; tot+=r
        m=agent.learn(replay.sample(BATCH_SIZE))
        # learn() reports loss 0.0 for an empty batch; keep those out of the mean.
        if m['loss']>0: ltmp.append(m['loss'])
        if d: break
    rewards.append(tot); losses.append(np.mean(ltmp) if ltmp else 0.0)

# One point per episode in both panels.
fig, (ax_reward, ax_loss) = plt.subplots(1, 2, figsize=(10,4))
ax_reward.plot(rewards); ax_reward.set(title='Rewards', xlabel='Episode', ylabel='Total reward')
ax_loss.plot(losses); ax_loss.set(title='Loss', xlabel='Episode', ylabel='Mean MSE loss')
fig.tight_layout(); plt.show()

# Evaluation: one greedy episode (no exploration), capped at EVAL_STEPS moves.
s=env.reset(); tot=0; traj=[tuple(s)]
for _ in range(EVAL_STEPS):
    a=agent.act(s,train=False)
    s,r,d,_=env.step(a)
    tot+=r; traj.append(tuple(s))
    if d: break
print('Eval total reward:', round(tot,2),'| steps:', len(traj)-1)