In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import ControlSystem
from QNetwork import *
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model_name = "qnetwork_soft"

episodes = 1300
samples_per_episode = 200
targets_per_episode = 4
batch_size = 16
batch_per_episode = 100

agent = QLearningAgentSoft(action_search_batch=32,
                                 gamma=0.988,
                                 temperature=0.016,
                                 average_weight=0.5,
                                 learning_rate=0.00003 * batch_size, warmup_steps=1000,
                                 learning_rate_decay=0.99998)

for e in range(episodes):
    # Reset the environment (PT1 system)
    pt1_with_delay = ControlSystem.PT1(K=2, T=5, delta_t=0.1, delay=2)
    total_reward = 0
    state = torch.tensor([pt1_with_delay.y_prev], device=device)

    for j in range(targets_per_episode):
        target = torch.tensor([random.uniform(0, 2)], device=device)
        for k in range(samples_per_episode):
            control_signal, u, s = agent.act(state, target)
            next_state = pt1_with_delay.calculate(control_signal)

            # Calculate reward (negative of the absolute error)
            reward = -torch.abs(next_state - target)
            total_reward += reward

            # Store the experience
            agent.remember(reward, next_state, target)

            state = next_state

    # Replay experience
    for _ in range(batch_per_episode):
        agent.replay(batch_size)

    if e % 50 == 0:
        print(f"Episode {e + 1}/{episodes}, Total Reward: {total_reward}")
        agent.save(f"{model_name}_{e}.pth")

agent.save(f"{model_name}.pth")
print("End")

cuda
Episode 1/5000, Total Reward: tensor([-536.7311], device='cuda:0')




Episode 51/5000, Total Reward: tensor([-649.6022], device='cuda:0')
Episode 101/5000, Total Reward: tensor([-505.0512], device='cuda:0')
Episode 151/5000, Total Reward: tensor([-411.9357], device='cuda:0')
Episode 201/5000, Total Reward: tensor([-290.7041], device='cuda:0')
Episode 251/5000, Total Reward: tensor([-725.0445], device='cuda:0')
Episode 301/5000, Total Reward: tensor([-311.7230], device='cuda:0')
Episode 351/5000, Total Reward: tensor([-272.1707], device='cuda:0')
Episode 401/5000, Total Reward: tensor([-477.9097], device='cuda:0')
Episode 451/5000, Total Reward: tensor([-131.3875], device='cuda:0')
Episode 501/5000, Total Reward: tensor([-441.6857], device='cuda:0')
Episode 551/5000, Total Reward: tensor([-187.0427], device='cuda:0')
Episode 601/5000, Total Reward: tensor([-255.3547], device='cuda:0')
Episode 651/5000, Total Reward: tensor([-118.3151], device='cuda:0')
Episode 701/5000, Total Reward: tensor([-169.3453], device='cuda:0')
Episode 751/5000, Total Reward: ten

KeyboardInterrupt: 