Version1

In [None]:
import numpy as np

class QLearner:
    """Tabular Q-learner that can replay stored experiences (Dyna-style).

    The learner keeps a ``num_states x num_actions`` table of action values,
    picks actions epsilon-greedily using ``rar`` as the exploration
    probability, and, when ``dyna`` is positive, re-applies randomly chosen
    past ``(s, a, r, s')`` tuples after every real update.
    """

    def __init__(self, num_states=100, num_actions=4, alpha=0.2, gamma=0.9, rar=0.5, radr=0.99, dyna=0, verbose=False):
        """Store the hyper-parameters and create an all-zero Q-table.

        num_states / num_actions: dimensions of the Q-table.
        alpha: learning rate used when blending old and new estimates.
        gamma: discount applied to the best value of the next state.
        rar: probability of choosing a random action.
        radr: factor ``rar`` is multiplied by after each ``query`` call.
        dyna: number of replayed updates per ``query`` call.
        verbose: when true, ``querysetstate`` prints the state and action.
        """
        # Problem size and debugging switch.
        self.num_states = num_states
        self.num_actions = num_actions
        self.verbose = verbose

        # Learning hyper-parameters.
        self.alpha = alpha
        self.gamma = gamma
        self.rar = rar
        self.radr = radr
        self.dyna = dyna

        # Most recent state/action, the value table, and the replay memory.
        self.s = 0
        self.a = 0
        self.q_table = np.zeros((num_states, num_actions))
        self.experiences = []

    def _backup(self, state, action, reward, next_state):
        """Apply one Q-learning update for the given transition."""
        best_next = np.max(self.q_table[next_state, :])
        self.q_table[state, action] = (1 - self.alpha) * self.q_table[state, action] + \
            self.alpha * (reward + self.gamma * best_next)

    def querysetstate(self, s):
        """Record ``s`` as the current state and pick an action; no learning.

        Returns the chosen action, which is also stored in ``self.a``.
        """
        self.s = s

        # One uniform draw decides between exploring and exploiting.
        explore = np.random.uniform(0, 1) < self.rar
        if explore:
            chosen = np.random.randint(self.num_actions)
        else:
            chosen = np.argmax(self.q_table[s, :])
        self.a = chosen

        if self.verbose:
            print(f"s = {s}, a = {self.a}")

        return self.a

    def query(self, s_prime, r):
        """Learn from the transition into ``s_prime`` with reward ``r``.

        Updates the Q-table for the previously stored state/action, stores
        the transition, optionally replays stored transitions, then selects
        and returns the next action and decays ``rar``.
        """
        prev_state, prev_action = self.s, self.a

        # Learn from the real transition and remember it for later replay.
        self._backup(prev_state, prev_action, r, s_prime)
        self.experiences.append((prev_state, prev_action, r, s_prime))

        # Replay: each pass re-applies one uniformly chosen stored transition.
        replay_passes = self.dyna if self.dyna > 0 else 0
        for _ in range(replay_passes):
            pick = np.random.randint(len(self.experiences))
            self._backup(*self.experiences[pick])

        # Move to the new state, choose the next action, then decay rar.
        next_action = self.querysetstate(s_prime)
        self.s, self.a = s_prime, next_action
        self.rar *= self.radr

        return self.a


Version2

In [None]:
import numpy as np

class QLearner:
    """Tabular Q-learner with model-based Dyna-Q planning.

    Besides the Q-table, the learner maintains an empirical model of the
    environment built from the transitions passed to ``query``:

    * ``T_count[s, a, s']`` -- how many times ``(s, a)`` led to ``s'``.
    * ``T[s, a, s']``       -- ``T_count`` normalised over ``s'``.
    * ``R[s, a]``           -- running, alpha-weighted estimate of the reward.

    When ``dyna`` is positive, each call to ``query`` is followed by that many
    simulated updates drawn from the model.
    """

    def __init__(self, num_states=100, num_actions=4, alpha=0.2, gamma=0.9, rar=0.5, radr=0.99, dyna=0, verbose=False):
        """Store the hyper-parameters and allocate the Q-table and model.

        num_states / num_actions: dimensions of the Q-table and the model.
        alpha: learning rate for the Q-table and for the reward estimate R.
        gamma: discount applied to the best value of the next state.
        rar: probability of choosing a random action.
        radr: factor ``rar`` is multiplied by after each ``query`` call.
        dyna: number of simulated (planning) updates per ``query`` call.
        verbose: when true, ``querysetstate`` prints the state and action.
        """
        self.verbose = verbose
        self.num_states = num_states
        self.num_actions = num_actions
        self.alpha = alpha
        self.gamma = gamma
        self.rar = rar
        self.radr = radr
        self.dyna = dyna
        self.s = 0
        self.a = 0
        self.q_table = np.zeros((num_states, num_actions))
        self.T_count = np.zeros((num_states, num_actions, num_states))
        self.T = np.zeros((num_states, num_actions, num_states))
        self.R = np.zeros((num_states, num_actions))
        # Ordered list of visited (state, action) pairs, used for uniform
        # sampling during planning; the set gives O(1) duplicate detection.
        self.experienced_sa = []
        self._experienced_lookup = set()

    def querysetstate(self, s):
        """Record ``s`` as the current state and pick an action; no learning.

        Returns the chosen action, which is also stored in ``self.a``.
        """
        self.s = s
        if np.random.uniform(0, 1) < self.rar:
            # Explore: take a random action.
            self.a = np.random.randint(self.num_actions)
        else:
            # Exploit: take the action with the highest Q-value.
            self.a = np.argmax(self.q_table[s, :])

        if self.verbose:
            print(f"s = {s}, a = {self.a}")

        return self.a

    def query(self, s_prime, r):
        """Learn from the transition into ``s_prime`` with reward ``r``.

        Updates the Q-table and the T/R model for the previously stored
        state/action, runs ``dyna`` simulated updates, then moves to
        ``s_prime``, selects the next action, decays ``rar`` and returns the
        action.
        """
        s, a = self.s, self.a

        # Direct Q-learning update from the real transition.
        self.q_table[s, a] = (1 - self.alpha) * self.q_table[s, a] + \
            self.alpha * (r + self.gamma * np.max(self.q_table[s_prime]))

        # Model update. The row total is at least 1 after the increment, so
        # the normalisation never divides by zero.
        self.T_count[s, a, s_prime] += 1
        self.T[s, a, :] = self.T_count[s, a, :] / np.sum(self.T_count[s, a, :])
        self.R[s, a] = (1 - self.alpha) * self.R[s, a] + self.alpha * r

        # Remember each (state, action) pair once so planning only samples
        # pairs whose T row is a valid probability distribution.
        pair = (s, a)
        if pair not in self._experienced_lookup:
            self._experienced_lookup.add(pair)
            self.experienced_sa.append(pair)

        # Planning. The simulated transition uses its own local names so the
        # real ``s_prime`` and ``r`` are not overwritten by hallucinated values.
        if self.dyna > 0:
            for _ in range(self.dyna):
                sim_s, sim_a = self.experienced_sa[np.random.randint(len(self.experienced_sa))]
                sim_next = np.random.choice(self.num_states, p=self.T[sim_s, sim_a, :])
                sim_r = self.R[sim_s, sim_a]

                self.q_table[sim_s, sim_a] = (1 - self.alpha) * self.q_table[sim_s, sim_a] + \
                    self.alpha * (sim_r + self.gamma * np.max(self.q_table[sim_next]))

        # Advance to the real next state, choose an action, then decay rar.
        self.s = s_prime
        self.a = self.querysetstate(s_prime)
        self.rar *= self.radr

        return self.a

    def author(self):
        """Return the author's Georgia Tech username."""
        return 'ydeng335'

