In [1]:
import json
import math
import random
from collections import defaultdict
from itertools import count

import numpy as np
import gymnasium as gym
from gymnasium.spaces import Discrete
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import matplotlib.pyplot as plt

import games

In [2]:
# Ten independent Blackjack environments stepped in lock-step by a
# synchronous vector wrapper. Every factory is identical, so they are
# generated rather than written out one per line.
envs = gym.vector.SyncVectorEnv(
    [lambda: gym.make('games/Blackjack') for _ in range(10)]
)

In [3]:
# Start a fresh episode in every sub-environment.
observations, infos = envs.reset()
# Batched action space of the vector env (one entry per sub-environment).
action_spaces = envs.action_space
# Transpose the stacked observation so the first axis indexes environments.
# NOTE(review): assumes `observations` is a tuple of per-component arrays,
# giving (num_envs, num_components) after `.T` -- confirm against the env.
states = np.array(observations).T

In [4]:
class EpsilonGreedy:
    """Epsilon-greedy action selection over a batch of environments.

    Each environment independently takes a randomly sampled action with
    probability ``epsilon`` and the Q-function's greedy action otherwise.
    """

    def __init__(self, epsilon=0.1):
        self.epsilon = epsilon

    def reset(self):
        """No internal state to reset; present for interface parity."""
        pass

    def select(self, num_envs, states, action_spaces, qfunction):
        """Return one action per environment.

        Args:
            num_envs: number of environments in the batch.
            states: batch of states, forwarded to ``qfunction.get_argmax_q``.
            action_spaces: object whose ``sample()`` yields a batch of
                random actions.
            qfunction: object providing ``get_argmax_q(states, action_spaces)``.

        Returns:
            The random actions where the exploration draw fell below
            ``epsilon`` and the greedy actions elsewhere, combined by
            arithmetic masking.
        """
        # One uniform draw per environment decides explore vs. exploit.
        explore = np.random.rand(num_envs) < self.epsilon
        random_actions = action_spaces.sample()
        greedy_actions = qfunction.get_argmax_q(states, action_spaces)
        # Boolean masks act as 0/1 multipliers, so exactly one term survives
        # for each environment.
        return random_actions * explore + greedy_actions * ~explore


class EpsilonDecreasing:
    """Epsilon-greedy selection whose epsilon decays geometrically.

    Wraps an ``EpsilonGreedy`` bandit and, after every ``select`` call,
    multiplies its epsilon by ``alpha ** num_envs``, never letting it fall
    below ``lower_bound``.
    """

    def __init__(self, epsilon=1.0, alpha=0.999, lower_bound=0.1):
        self.initial_epsilon = epsilon
        self.alpha = alpha
        self.lower_bound = lower_bound
        self.epsilon_greedy_bandit = EpsilonGreedy(epsilon)

    def reset(self):
        """Restore exploration to the initial epsilon."""
        self.epsilon_greedy_bandit = EpsilonGreedy(self.initial_epsilon)

    def select(self, num_envs, states, action_spaces, qfunction):
        """Pick actions with the current epsilon, then decay epsilon.

        Returns whatever the wrapped bandit's ``select`` returns.
        """
        bandit = self.epsilon_greedy_bandit
        chosen = bandit.select(num_envs, states, action_spaces, qfunction)
        # One decay factor per environment that was just acted on.
        decayed = bandit.epsilon * self.alpha ** num_envs
        bandit.epsilon = max(decayed, self.lower_bound)
        return chosen

## DQN

In [11]:
class DeepQFunction(nn.Module):
    """Fully-connected Q-network with two hidden ReLU layers.

    The network maps a batch of state vectors of width ``state_space`` to
    one Q-value per action (``action_space`` outputs per row).

    Args:
        state_space: number of features in a state vector.
        action_space: number of discrete actions.
        hidden_dim: width of both hidden layers.
        alpha: learning rate of the Adam optimiser used by ``batch_update``.
    """

    def __init__(self, state_space, action_space, hidden_dim=32, alpha=1e-4):
        super().__init__()
        self.layer1 = nn.Linear(state_space, hidden_dim)
        self.layer2 = nn.Linear(hidden_dim, hidden_dim)
        self.layer3 = nn.Linear(hidden_dim, action_space)
        # `alpha` is the learning rate; it must be bound to an optimiser for
        # `batch_update` to have any effect.
        self.optimiser = optim.Adam(self.parameters(), lr=alpha)

    def forward(self, x):
        """Return the Q-values of every action for each row of ``x``."""
        x = F.relu(self.layer1(x))
        x = F.relu(self.layer2(x))
        return self.layer3(x)

    def get_q(self, states, actions):
        """Return ``Q(s_i, a_i)`` for each paired state and action as a list.

        Args:
            states: batch of states, shape ``(batch, state_space)``.
            actions: one integer action index per state.
        """
        states_tensor = torch.as_tensor(states, dtype=torch.float32)
        # `gather` requires an int64 index tensor; a float index raises.
        actions_tensor = torch.as_tensor(actions, dtype=torch.long)
        with torch.no_grad():
            q_values = self.forward(states_tensor).gather(1, actions_tensor.unsqueeze(1))
        return q_values.squeeze(1).tolist()

    def get_max(self, states):
        """Return ``max_a Q(s_i, a)`` for each state as a list of floats."""
        state_tensor = torch.as_tensor(states, dtype=torch.float32)
        with torch.no_grad():
            max_q_values = self.forward(state_tensor).max(1).values
        return max_q_values.tolist()

    def get_argmax_q(self, states, action_spaces):
        """Return the greedy action index for each state as a numpy array.

        ``action_spaces`` is accepted for interface compatibility with the
        bandit classes but is not consulted: the candidate actions are the
        network's output columns.
        """
        states_tensor = torch.as_tensor(states, dtype=torch.float32)
        with torch.no_grad():
            greedy_actions = self.forward(states_tensor).argmax(dim=1)
        # A numpy array lets callers combine the result with boolean masks.
        return greedy_actions.cpu().numpy()

    def update(self, state, action, delta):
        """Apply ``batch_update`` to a single transition."""
        return self.batch_update([state], [action], [delta])

    def batch_update(self, states, actions, deltas):
        """Take one optimiser step moving ``Q(s_i, a_i)`` by ``delta_i``.

        Args:
            states: batch of states, shape ``(batch, state_space)``.
            actions: one integer action index per state.
            deltas: one temporal-difference error per state/action pair.

        Returns:
            The scalar Huber (smooth L1) loss before the step, as a float.
        """
        states_tensor = torch.as_tensor(states, dtype=torch.float32)
        actions_tensor = torch.as_tensor(actions, dtype=torch.long)
        deltas_tensor = torch.as_tensor(deltas, dtype=torch.float32)
        q_values = self.forward(states_tensor).gather(1, actions_tensor.unsqueeze(1))
        # The regression target is the current estimate shifted by the TD
        # error; detaching it keeps gradients flowing only through `q_values`.
        targets = q_values.detach() + deltas_tensor.unsqueeze(1)
        loss = nn.functional.smooth_l1_loss(q_values, targets)
        self.optimiser.zero_grad()
        loss.backward()
        self.optimiser.step()
        return loss.item()
        

In [12]:
# Q-network taking 3 state features and producing Q-values for 2 actions.
f = DeepQFunction(state_space=3, action_space=2)

In [13]:
# Rebinds `states` (previously a numpy array from the reset cell) to a
# float32 tensor so it can be fed to the network.
states = torch.as_tensor(states, dtype=torch.float32)

In [19]:
# Raw network output: one row per state, one column per action. Called
# outside `torch.no_grad()`, so the result carries a grad_fn.
logits = f.forward(states)

In [20]:
logits

tensor([[-1.3495,  0.9334],
        [-1.5174,  0.9594],
        [-1.6958,  1.1379],
        [-1.6173,  0.7624],
        [-1.8507,  0.9054],
        [-1.3644,  0.5227],
        [-1.6529,  1.0423],
        [-2.1724,  1.2958],
        [-1.7010,  0.7857],
        [-2.0722,  1.2161]], grad_fn=<AddmmBackward0>)

In [6]:
class QLearning:
    """Q-learning over a batch of vectorised environments.

    Args:
        envs: vector environment exposing ``num_envs``, ``action_space``,
            ``reset()`` and ``step(actions)``.
        bandit: exploration strategy with
            ``select(num_envs, states, action_spaces, qfunction)``.
        qfunction: value function with ``get_q``, ``get_max`` and
            ``batch_update``.
        gamma: discount factor.
    """

    def __init__(self, envs, bandit, qfunction, gamma=0.9):
        self.envs = envs
        self.bandit = bandit
        self.qfunction = qfunction
        self.gamma = gamma

    def execute(self, episodes=2000):
        """Run ``episodes`` rounds; each round plays one episode per env.

        A round resets every sub-environment and steps them together until
        each has terminated or been truncated once. Transitions from a
        sub-environment that has already finished its episode in the current
        round are ignored.

        Returns:
            A list with one float per round: the mean discounted return
            across the sub-environments.
        """
        rewards = []
        num_envs = self.envs.num_envs
        action_spaces = self.envs.action_space
        for episode in range(episodes):
            observations, infos = self.envs.reset()
            states = self._as_states(observations)
            actions = np.asarray(
                self.bandit.select(num_envs, states, action_spaces, self.qfunction)
            )
            episode_rewards = np.zeros(num_envs)
            # True while a sub-environment is still in its first episode of
            # this round.
            active = np.ones(num_envs, dtype=bool)
            for step in count():
                (next_observations, step_rewards, terminated, truncated, infos) = (
                    self.envs.step(actions)
                )
                next_states = self._as_states(next_observations)
                step_rewards = np.asarray(step_rewards, dtype=float)
                terminated = np.asarray(terminated, dtype=bool)
                truncated = np.asarray(truncated, dtype=bool)
                next_actions = np.asarray(
                    self.bandit.select(num_envs, next_states, action_spaces, self.qfunction)
                )
                # NOTE(review): for truncated (not terminated) episodes an
                # auto-resetting vector env may already report the next
                # episode's first observation here -- confirm for this env.
                deltas = self.get_delta(
                    states, actions, step_rewards, next_states, next_actions, terminated
                )
                self.qfunction.batch_update(
                    states[active], actions[active], deltas[active]
                )
                episode_rewards[active] += step_rewards[active] * (self.gamma ** step)
                active &= ~(terminated | truncated)
                if not active.any():
                    break
                states = next_states
                actions = next_actions
            rewards.append(float(episode_rewards.mean()))
        return rewards

    def get_delta(self, states, actions, rewards, next_states, next_actions,
                  terminated=None):
        """Return the temporal-difference error of each transition.

        ``delta = reward + gamma * V(next_state) - Q(state, action)``, where
        ``V`` comes from ``state_value``. When ``terminated`` is given, the
        bootstrap term is zeroed for transitions flagged as terminal.

        Returns:
            A float numpy array with one entry per transition.
        """
        q_values = np.asarray(self.qfunction.get_q(states, actions), dtype=float)
        next_values = np.asarray(
            self.state_value(next_states, next_actions), dtype=float
        )
        if terminated is not None:
            # No future value exists beyond a terminal state.
            next_values = np.where(np.asarray(terminated, dtype=bool), 0.0, next_values)
        return np.asarray(rewards, dtype=float) + self.gamma * next_values - q_values

    def state_value(self, state, action):
        """Return the greedy value ``max_a Q(state, a)`` for each state.

        ``action`` is unused: Q-learning bootstraps from the best action
        rather than from the action actually selected next.
        """
        return self.qfunction.get_max(state)

    @staticmethod
    def _as_states(observations):
        """Convert a batched observation to a ``(num_envs, features)`` array."""
        # NOTE(review): tuple/dict observations are assumed to hold one array
        # per component, each with environments on the first axis -- confirm.
        if isinstance(observations, dict):
            observations = tuple(observations.values())
        if isinstance(observations, (tuple, list)):
            return np.column_stack([np.asarray(part) for part in observations])
        states = np.asarray(observations)
        if states.ndim == 1:
            states = states.reshape(-1, 1)
        return states


def get_ema(rewards, smoothing_factor=0.9):
    """Return the exponential moving average of ``rewards`` as a new list.

    The first output equals the first reward; every later output is
    ``previous * smoothing_factor + reward * (1 - smoothing_factor)``.
    An empty input yields an empty list.
    """
    smoothed = []
    for value in rewards:
        if not smoothed:
            # Seed the average with the first observation.
            smoothed.append(value)
            continue
        smoothed.append(
            smoothed[-1] * smoothing_factor + value * (1 - smoothing_factor)
        )
    return smoothed

In [7]:
# gridworld = GridWorld()
# qfunction = DeepQFunction(state_space=len(gridworld.get_initial_state()), action_space=5)
# rewards = QLearning(gridworld, EpsilonGreedy(), qfunction).execute(episodes=200)