The DQN agent is implemented with both experience replay and a fixed target network. It collects its rollouts using an epsilon-greedy policy.

Hyper-parameters:
eta = 0.99
memory maxlen = 2000
epsilon = 1
epsilon maxs = 1
epsilon mins = 0.01
epsilon decay = 0.999
optimizer: Adam(0.003)
batch size = 64
max steps = 500
total episodes in training = 1500


episodes_train = 300
batch_size = 32
ws = 20


In [None]:
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque
import random
import matplotlib.pyplot as plt
from tqdm import tqdm
import json
import pickle


# Module-level CartPole-v1 environment; the training loop below resets, steps and finally closes it.
env = gym.make("CartPole-v1")

def reshape(intiger):
    """Return ``intiger`` as a single-row 2-D array of shape ``(1, n)``.

    The row width is inferred from the input (``-1``) instead of being read
    from a module-level ``state_size`` variable, so the helper works before
    that global exists and for observations of any length. For a state
    vector of length ``state_size`` the result is the same as before.

    Args:
        intiger: Array-like observation (the misspelled parameter name is
            kept so existing keyword callers keep working).

    Returns:
        numpy.ndarray: The input data laid out as one row.
    """
    return np.reshape(intiger, (1, -1))

class QNetwork(nn.Module):
    """Fully connected network mapping a state vector to one value per action.

    Three 64-unit hidden layers with leaky-ReLU activations are followed by a
    linear output layer of width ``action_size``.
    """

    def __init__(self, state_size, action_size):
        """Create the four linear layers.

        Args:
            state_size: Number of input features.
            action_size: Number of output values.
        """
        super().__init__()
        hidden_units = 64
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, hidden_units)
        self.fc4 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return the output-layer values for input ``x``."""
        activations = x
        for hidden_layer in (self.fc1, self.fc2, self.fc3):
            activations = F.leaky_relu(hidden_layer(activations))
        # The output layer is linear: no activation is applied to its values.
        return self.fc4(activations)

class Agent:
    """DQN agent with a replay memory, a target network and epsilon-greedy actions.

    The online network ``DQN`` is trained by ``update``; ``target_model_DQN``
    is a copy refreshed every ``target_update_freq`` updates and is used only
    to compute bootstrapped targets.
    """

    def __init__(self, action_size,state_size):
        """Build the networks, optimizer, replay memory and bookkeeping state.

        Args:
            action_size: Number of discrete actions (Q-network output width).
            state_size: Length of one observation vector (Q-network input width).
        """
        self.eta = 0.99  # discount factor applied to the next-state value
        self.state_size = state_size
        self.action_size = action_size
        self.DQN = QNetwork(state_size, action_size)
        self.optimizer = optim.Adam(self.DQN.parameters(), lr=0.0001)
        self.target_model_DQN = QNetwork(state_size, action_size)
        # Start the target network as an exact copy of the online network so it
        # is never used with unrelated random weights.
        self.target_model_DQN.load_state_dict(self.DQN.state_dict())
        self.target_model_DQN.eval()
        self.memory = deque(maxlen=2000)

        self.step = 0  # number of completed update() calls
        self.target_update_freq = 20  # target sync period, in update() calls

        self.epsilon = 1  # Exploration rate
        self.min_eps = 0.01
        self.eps_decay = 0.99
        # Training history; the keys are read by the plotting/saving code.
        self.plotter = {'num_of_eps_til_475':[] , 'loss':[] , 'tot_rawrd_per_ep':[]}

    def add_memory(self, new_state, reward, done, state, action):
        """Append one transition to the replay memory.

        The tuple is stored in the order
        ``(new_state, reward, done, state, action)``.
        """
        self.memory.append((new_state, reward, done, state, action))

    def update(self, batch_size):
        """Perform one gradient step on a random minibatch from the replay memory.

        Every ``target_update_freq`` calls the target network is first synced
        with the online network. The regression target equals the online
        network's own prediction for every action except the one taken, whose
        entry is ``reward`` for terminal transitions and
        ``reward + eta * max_a Q_target(new_state, a)`` otherwise. The loss is
        appended to ``plotter['loss']``, epsilon is decayed and ``step`` is
        incremented.

        Args:
            batch_size: Number of replay-memory samples used for the step.

        Raises:
            ValueError: If the memory holds fewer than ``batch_size``
                transitions (raised by ``random.sample``).
        """
        minibatch = random.sample(self.memory, batch_size)
        if self.step % self.target_update_freq == 0:
            self.target_model_DQN.load_state_dict(self.DQN.state_dict())
            self.target_model_DQN.eval()

        # Stack the sampled transitions into batch tensors so each network is
        # evaluated once per update instead of once per sample.
        next_states, rewards, dones, states, actions = zip(*minibatch)
        batch_shape = (batch_size, self.state_size)
        states_t = torch.tensor(
            np.asarray(states, dtype=np.float32).reshape(batch_shape))
        next_states_t = torch.tensor(
            np.asarray(next_states, dtype=np.float32).reshape(batch_shape))
        rewards_t = torch.tensor(np.asarray(rewards, dtype=np.float32))
        dones_t = torch.tensor(np.asarray(dones, dtype=bool))
        actions_t = torch.tensor(np.asarray(actions, dtype=np.int64))

        # Targets are constants for the optimizer: no autograd graph is needed.
        with torch.no_grad():
            next_q = self.target_model_DQN(next_states_t).max(dim=1).values
        td_target = torch.where(
            dones_t, rewards_t, rewards_t + self.eta * next_q)

        self.optimizer.zero_grad()
        output = self.DQN(states_t)
        # Copy the current predictions and overwrite only the taken action, so
        # the other actions contribute zero error to the loss.
        y_train = output.detach().clone()
        y_train[torch.arange(batch_size), actions_t] = td_target
        loss = F.mse_loss(output, y_train)
        loss.backward()
        self.optimizer.step()

        # callback
        self.plotter["loss"].append(loss.item())

        if self.epsilon > self.min_eps:
            # Clamp so the decayed value never drops below the configured floor.
            self.epsilon = max(self.min_eps, self.epsilon * self.eps_decay)
        self.step += 1

    def get_action(self,state):
        """Choose an action for ``state`` using an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action out of
        ``action_size`` is returned; otherwise the action with the largest
        online-network value is returned.

        Args:
            state: Observation holding ``state_size`` values.

        Returns:
            int: Index of the selected action.
        """
        if np.random.random() < self.epsilon:
            return np.random.randint(self.action_size)
        with torch.no_grad():
            state_tensor = torch.tensor(
                np.reshape(state, (1, self.state_size)), dtype=torch.float32)
            return torch.argmax(self.DQN(state_tensor)).item()


# Network dimensions are taken from the environment's spaces.
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = Agent(action_size, state_size)

batch_size = 32
episodes_train = 300

# training
for episode in tqdm(range(episodes_train)):
    state, info = env.reset()
    exp_return = 0
    episode_over = False

    while not episode_over:
        action = agent.get_action(state)
        new_state, reward, terminated, truncated, info = env.step(action)

        # Only `terminated` is stored as the done flag; `truncated` just ends the episode.
        agent.add_memory(new_state, reward, terminated, state, action)
        if batch_size < len(agent.memory):
            agent.update(batch_size)

        state = new_state
        exp_return += reward
        episode_over = terminated or truncated

    agent.plotter['tot_rawrd_per_ep'].append(exp_return)

    # Both curves are re-rendered and overwritten on disk after every episode.
    for series_key, x_label, plot_title, out_file in (
        ('tot_rawrd_per_ep', 'episode', 'total raward per episode', 'Tot_rawrd_per_ep.png'),
        ('loss', 'step', 'loss', 'loss.png'),
    ):
        fig = plt.figure()
        plt.plot(agent.plotter[series_key])
        plt.xlabel(x_label)
        plt.title(plot_title)
        plt.savefig(out_file)
        plt.close()

env.close()


# Persist the training history as JSON and the whole agent object as a pickle.
with open("result.json", "w") as fp:
    json.dump(agent.plotter, fp)

with open("model.pkl", "wb") as fp:
    pickle.dump(agent, fp)

