In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import ControlSystem
from QNetwork import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "qnetwork_11act"
episodes = 2000
samples_per_target = 200
targets_per_episode = 5
batch_size = 16
batch_per_episode = 100

action_size = 11
agent = QLearningAgent(action_size=action_size, n=100, gamma=0.95,
                        epsilon=0.3, epsilon_decay=0.99999, epsilon_min=0.08,
                        learning_rate=0.00005 * batch_size, warmup_steps=3000, learning_rate_decay=0.99998,
                        stored_episodes=15, samples_per_episode=targets_per_episode * samples_per_target)


best_reward = -1000000
for e in range(episodes):
    # Reset the environment (PT1 system)
    pt1_with_delay = ControlSystem.PT1(K=2, T=5, delta_t=0.1, delay=2)
    total_reward = 0
    state = torch.tensor([pt1_with_delay.y_prev], device=device)

    for j in range(targets_per_episode):
        target = torch.rand((1,), device=device) * 2
        for k in range(samples_per_target):
            action = agent.act(state, target)

            control_signal = action / (action_size - 1)

            output = pt1_with_delay.calculate(control_signal)
            next_state = output

            # Calculate reward (negative of the absolute error)
            reward = -torch.abs(next_state - target)
            total_reward += reward

            # Store the experience
            agent.remember(state, action, reward, target)

            state = next_state

    # Replay experience
    for _ in range(batch_per_episode):
        agent.replay(batch_size)

    if e % 50 == 0:
        print(f"Episode {e + 1}/{episodes}, Total Reward: {total_reward}")
        agent.save(f"{model_name}_{e}.pth")

agent.save(f"{model_name}.pth")
print("End")

Episode 1/2000, Total Reward: tensor([-576.6440], device='cuda:0')




Episode 51/2000, Total Reward: tensor([-551.4729], device='cuda:0')
Episode 101/2000, Total Reward: tensor([-465.9258], device='cuda:0')
Episode 151/2000, Total Reward: tensor([-566.2161], device='cuda:0')
Episode 201/2000, Total Reward: tensor([-618.2343], device='cuda:0')
Episode 251/2000, Total Reward: tensor([-569.6149], device='cuda:0')
Episode 301/2000, Total Reward: tensor([-476.7028], device='cuda:0')
Episode 351/2000, Total Reward: tensor([-122.7113], device='cuda:0')
Episode 401/2000, Total Reward: tensor([-373.1590], device='cuda:0')
Episode 451/2000, Total Reward: tensor([-217.6268], device='cuda:0')
Episode 501/2000, Total Reward: tensor([-210.8472], device='cuda:0')
Episode 551/2000, Total Reward: tensor([-364.1454], device='cuda:0')
Episode 601/2000, Total Reward: tensor([-214.4722], device='cuda:0')
Episode 651/2000, Total Reward: tensor([-291.5023], device='cuda:0')
Episode 701/2000, Total Reward: tensor([-237.0614], device='cuda:0')
Episode 751/2000, Total Reward: ten

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import ControlSystem
from QNetwork import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_name = "qnetwork_11act_boltzmann"
episodes = 1150
samples_per_target = 200
targets_per_episode = 5
batch_size = 16
batch_per_episode = 100

action_size = 11
agent = QLearningAgentBoltzmann(action_size=action_size, n=100, gamma=0.95,
                        temperature=0.03, temperature_decay=0.99999, temperature_min=0.003,
                        learning_rate=0.00005 * batch_size, warmup_steps=3000, learning_rate_decay=0.99997,
                        stored_episodes=15, samples_per_episode=targets_per_episode * samples_per_target)


best_reward = -1000000
for e in range(episodes):
    # Reset the environment (PT1 system)
    pt1_with_delay = ControlSystem.PT1(K=2, T=5, delta_t=0.1, delay=2)
    total_reward = 0
    state = torch.tensor([pt1_with_delay.y_prev], device=device)

    for j in range(targets_per_episode):
        target = torch.rand((1,), device=device) * 2
        for k in range(samples_per_target):
            action = agent.act(state, target)

            control_signal = action / (action_size - 1)

            output = pt1_with_delay.calculate(control_signal)
            next_state = output

            # Calculate reward (negative of the absolute error)
            reward = -torch.abs(next_state - target)
            total_reward += reward

            # Store the experience
            agent.remember(state, action, reward, target)

            state = next_state

    # Replay experience
    for _ in range(batch_per_episode):
        agent.replay(batch_size)

    if e % 50 == 0:
        print(f"Episode {e + 1}/{episodes}, Total Reward: {total_reward}")
        agent.save(f"{model_name}_{e}.pth")

agent.save(f"{model_name}.pth")
print("End")

Episode 1/3000, Total Reward: tensor([-300.3538], device='cuda:0')




Episode 51/3000, Total Reward: tensor([-611.3771], device='cuda:0')
Episode 101/3000, Total Reward: tensor([-261.4688], device='cuda:0')
Episode 151/3000, Total Reward: tensor([-480.0842], device='cuda:0')
Episode 201/3000, Total Reward: tensor([-314.7133], device='cuda:0')
Episode 251/3000, Total Reward: tensor([-298.1442], device='cuda:0')
Episode 301/3000, Total Reward: tensor([-107.9486], device='cuda:0')
Episode 351/3000, Total Reward: tensor([-118.6755], device='cuda:0')
Episode 401/3000, Total Reward: tensor([-185.8259], device='cuda:0')
Episode 451/3000, Total Reward: tensor([-120.4475], device='cuda:0')
Episode 501/3000, Total Reward: tensor([-193.3480], device='cuda:0')
Episode 551/3000, Total Reward: tensor([-185.6931], device='cuda:0')
Episode 601/3000, Total Reward: tensor([-349.3200], device='cuda:0')
Episode 651/3000, Total Reward: tensor([-167.3294], device='cuda:0')
Episode 701/3000, Total Reward: tensor([-225.5406], device='cuda:0')
Episode 751/3000, Total Reward: ten

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from collections import deque
import ControlSystem
from QNetwork import *
import time

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

model_name = "qnetwork_soft"

episodes = 1300
samples_per_episode = 200
targets_per_episode = 4
batch_size = 16
batch_per_episode = 100

agent = QLearningAgentSoft(action_search_batch=32,
                                 gamma=0.988,
                                 temperature=0.016,
                                 average_weight=0.5,
                                 learning_rate=0.00003 * batch_size, warmup_steps=1000,
                                 learning_rate_decay=0.99998)

for e in range(episodes):
    # Reset the environment (PT1 system)
    pt1_with_delay = ControlSystem.PT1(K=2, T=5, delta_t=0.1, delay=2)
    total_reward = 0
    state = torch.tensor([pt1_with_delay.y_prev], device=device)

    for j in range(targets_per_episode):
        target = torch.tensor([random.uniform(0, 2)], device=device)
        for k in range(samples_per_episode):
            control_signal, u, s = agent.act(state, target)
            next_state = pt1_with_delay.calculate(control_signal)

            # Calculate reward (negative of the absolute error)
            reward = -torch.abs(next_state - target)
            total_reward += reward

            # Store the experience
            agent.remember(reward, next_state, target)

            state = next_state

    # Replay experience
    for _ in range(batch_per_episode):
        agent.replay(batch_size)

    if e % 50 == 0:
        print(f"Episode {e + 1}/{episodes}, Total Reward: {total_reward}")
        agent.save(f"{model_name}_{e}.pth")

agent.save(f"{model_name}.pth")
print("End")

cuda
Episode 1/5000, Total Reward: tensor([-536.7311], device='cuda:0')




Episode 51/5000, Total Reward: tensor([-649.6022], device='cuda:0')
Episode 101/5000, Total Reward: tensor([-505.0512], device='cuda:0')
Episode 151/5000, Total Reward: tensor([-411.9357], device='cuda:0')
Episode 201/5000, Total Reward: tensor([-290.7041], device='cuda:0')
Episode 251/5000, Total Reward: tensor([-725.0445], device='cuda:0')
Episode 301/5000, Total Reward: tensor([-311.7230], device='cuda:0')
Episode 351/5000, Total Reward: tensor([-272.1707], device='cuda:0')
Episode 401/5000, Total Reward: tensor([-477.9097], device='cuda:0')
Episode 451/5000, Total Reward: tensor([-131.3875], device='cuda:0')
Episode 501/5000, Total Reward: tensor([-441.6857], device='cuda:0')
Episode 551/5000, Total Reward: tensor([-187.0427], device='cuda:0')
Episode 601/5000, Total Reward: tensor([-255.3547], device='cuda:0')
Episode 651/5000, Total Reward: tensor([-118.3151], device='cuda:0')
Episode 701/5000, Total Reward: tensor([-169.3453], device='cuda:0')
Episode 751/5000, Total Reward: ten

KeyboardInterrupt: 