In [None]:
# Install basic libraries
# !pip install numpy matplotlib

In [None]:
import numpy as np
import matplotlib.pyplot as plt
np.set_printoptions(precision=1)


class Vector:
    """Minimal mutable 2-D vector with ``x`` and ``y`` components.

    Supports component-wise ``+`` and ``-``, division by a scalar,
    unpacking through iteration (``x, y = vec``) and equality against
    other vectors or any two-element ``(x, y)`` sequence.
    """

    # Instances are mutated in place by ``clip_update``, so they must not be
    # hashable. Python already drops ``__hash__`` when ``__eq__`` is defined;
    # stating it explicitly documents the intent.
    __hash__ = None

    def __init__(self, x=0, y=0):
        self.x = x
        self.y = y

    def __str__(self):
        return f"({self.x}, {self.y})"

    def __repr__(self):
        return f"{type(self).__name__}({self.x!r}, {self.y!r})"

    def __eq__(self, other):
        """Compare component-wise with a vector or an ``(x, y)`` sequence.

        Returns ``NotImplemented`` for anything that is neither, so that
        expressions such as ``vec == None`` or ``(x, y) in list_of_vectors``
        evaluate cleanly instead of raising ``AttributeError``.
        """
        if hasattr(other, "x") and hasattr(other, "y"):
            return (self.x == other.x) and (self.y == other.y)

        try:
            other_x, other_y = other
        except (TypeError, ValueError):
            # Not iterable, or not exactly two elements.
            return NotImplemented

        return (self.x == other_x) and (self.y == other_y)

    def __add__(self, other):
        return Vector(self.x + other.x, self.y + other.y)

    def __sub__(self, other):
        return Vector(self.x - other.x, self.y - other.y)

    def __truediv__(self, val):
        return Vector(self.x / val, self.y / val)

    def __iter__(self):
        return iter((self.x, self.y))

    def copy(self):
        """Return a new, independent vector with the same components."""
        return Vector(self.x, self.y)

    @staticmethod
    def _clip(value, upper):
        """Clamp ``value`` so that ``0 <= result < upper``.

        Values below 0 become 0, values at or above ``upper`` become
        ``upper - 1``; anything in between is returned unchanged.
        """
        if value < 0:
            return 0
        if value >= upper:
            return upper - 1
        return value

    def clip_update(self, new_vec, bounds):
        """Move this vector to ``new_vec``, clamped inside ``bounds``.

        Each component is clamped independently, so a move into a wall
        keeps the component that runs along the wall. The vector is
        updated in place and nothing is returned.

        Args:
            new_vec: Proposed new position (has ``x`` and ``y``).
            bounds: Exclusive upper limits (has ``x`` and ``y``); the
                lower limit is 0 on both axes.
        """
        self.x = self._clip(new_vec.x, bounds.x)
        self.y = self._clip(new_vec.y, bounds.y)

In [None]:
class GridWorld:
    """Grid-world environment with noisy (drifting) movement.

    Cell symbols read from ``self.environment``:
        'S' - start cell
        'O' - terminal cell (presumably an obstacle/hole)
        'G' - goal cell, also terminal
        anything else (e.g. 'F') - ordinary, non-terminal cell

    Positions are ``Vector(x, y)`` where ``x`` is the column index and
    ``y`` is the row index, i.e. a position refers to
    ``self.environment[y, x]``.
    """

    def __init__(self):
        """
        Present at least one set of evaluations for these
        default environments. But DO NOT change the rest
        of the code unless mentioned, changing anything
        else might lead to disqualification.
        """
        # self.environment = np.array([
        #     ['S', 'F', 'F', 'F'],
        #     ['F', 'O', 'F', 'O'],
        #     ['F', 'F', 'F', 'O'],
        #     ['O', 'F', 'F', 'G'],
        # ])

        # NOTE(review): the bottom-left cell used to be the digit '0', which
        # matched none of the symbols handled below and was therefore treated
        # as an ordinary cell. It is taken to be a typo for the letter 'O'
        # (the 4x4 grid above has 'O' in the same corner) - confirm.
        self.environment = np.array([
            ['S','F','F','F','O','F'],
            ['F','F','F','F','F','F'],
            ['F','F','O','O','F','F'],
            ['F','F','O','O','F','F'],
            ['F','F','O','O','F','F'],
            ['F','F','O','O','F','F'],
            ['F','F','F','F','F','F'],
            ['O','F','F','F','F','G'],
        ])

        # ndarray.shape is (rows, columns) whereas positions are (x, y) with
        # x = column and y = row, so the bounds have to be (columns, rows).
        # Passing shape straight through transposes the limits on any
        # non-square grid and makes the bottom rows unreachable.
        n_rows, n_cols = self.environment.shape
        self.size = Vector(n_cols, n_rows)
        self.termination_states = []

        for y in range(n_rows):
            for x in range(n_cols):
                entry = self.environment[y, x]
                pos = Vector(x, y)

                if entry == 'S':
                    self.start_pos = pos
                elif entry == 'O':
                    self.termination_states.append(pos)
                elif entry == 'G':
                    self.goal_pos = pos
                    self.termination_states.append(pos)

        # Action index -> unit displacement. y grows downwards (row index),
        # so "north" is a negative y step.
        self.actions = {
            0: Vector(0, -1), # N
            1: Vector(1, 0),  # E
            2: Vector(-1, 0), # W
            3: Vector(0, 1),  # S
        }

        # Drift from original direction of control
        # to simulate noise in control dynamics.
        # The four probabilities sum to 1.
        self.drift_dir_p = {
            'front': 0.7,
            'left': 0.125,
            'right': 0.125,
            'back': 0.05
        }

    def get_drift_dir(self, agent_action):
        """Sample the direction actually travelled for an intended action.

        A single uniform draw is compared against the cumulative
        probabilities in ``self.drift_dir_p`` in the order front, left,
        right, back.

        Args:
            agent_action: Intended displacement; must unpack to ``(x, y)``.

        Returns:
            ``agent_action`` itself when there is no drift, otherwise a new
            ``Vector`` rotated by a quarter turn either way or reversed.
        """
        p = np.random.rand()
        x, y = agent_action

        threshold = self.drift_dir_p['front']
        if p < threshold:
            return agent_action

        # The two branches below are the two perpendicular directions; which
        # one is called 'left' or 'right' has no effect while both have the
        # same probability.
        threshold += self.drift_dir_p['left']
        if p < threshold:
            return Vector(-y, x)

        threshold += self.drift_dir_p['right']
        if p < threshold:
            return Vector(y, -x)

        # Remaining probability mass: move opposite to the intended action.
        return Vector(-x, -y)

    def get_state_transition(self, agent_pos, agent_action):
        """Apply a (possibly drifted) action to ``agent_pos``.

        ``agent_pos`` is updated IN PLACE, clamped to the grid bounds, and
        the same object is returned.
        """
        new_pos = agent_pos + self.get_drift_dir(agent_action)
        agent_pos.clip_update(new_pos, bounds=self.size)

        return agent_pos

    def agent_terminated(self, agent):
        """Return True when the episode is over for ``agent``.

        The episode ends when the agent stands on a terminal cell or its
        ``lifespan`` has run out.
        """
        # get_state() yields a plain (x, y) tuple while termination_states
        # holds Vector objects; normalise both sides to tuples so the
        # membership test cannot fail on mismatched types.
        state = tuple(agent.get_state())
        on_terminal_cell = any(
            tuple(cell) == state for cell in self.termination_states
        )
        return on_terminal_cell or agent.lifespan <= 0

    def get_reward(self, agent_pos):
        """Return the reward for arriving at ``agent_pos``.

        Raises:
            NotImplementedError: Always; the reward function is still to be
                written.
        """
        # TODO
        raise NotImplementedError

In [None]:
class Agent:
    """Tabular epsilon-greedy agent acting inside a world.

    Attributes:
        world: Environment supplying ``actions``, ``size``, ``start_pos``,
            ``get_state_transition`` and ``get_reward``.
        lifespan: Remaining number of steps in the current episode.
        pos: Current position.
        epsilon: Probability of picking a uniformly random action.
        action_value: Value table indexed as ``[x, y, action]``.
    """

    def __init__(self, world, lifespan):
        self.world = world

        # Remember the per-episode budget so init() can restore it.
        self._initial_lifespan = lifespan
        self.lifespan = lifespan

        self.pos = Vector(0, 0)
        self.actions = self.world.actions

        # Agent's model
        self.epsilon = 0.1  # for epsilon-greedy function, can
                            # change this while training to
                            # approach 0 as training completes

        self.action_value = np.zeros(
            (self.world.size.x, self.world.size.y, len(self.actions))
        )

    def init(self):
        """Reset the agent for a new episode.

        Places the agent on a copy of the world's start position and
        restores the step budget. Without the latter, the budget used up in
        one episode would stay exhausted and every following episode would
        be considered over immediately. Learned values are kept.
        """
        self.pos = self.world.start_pos.copy()
        self.lifespan = self._initial_lifespan

    def get_state(self):
        """Return the current position as an ``(x, y)`` tuple."""
        return (self.pos.x, self.pos.y)

    def sample_action(self):
        """Pick an action index epsilon-greedily.

        With probability ``epsilon`` a uniformly random action is chosen;
        otherwise the action with the highest value in the current state
        (first one on ties, as given by ``np.argmax``).

        Returns:
            The chosen action index as a plain ``int``.
        """
        if np.random.rand() < self.epsilon:
            action = np.random.choice(len(self.actions))
        else:
            action = np.argmax(self.action_value[self.get_state()])

        return int(action)

    def get_state_action(self, action):
        """Return ``(x, y, action)``, usable as an index into ``action_value``."""
        return (*self.get_state(), action)

    def update_state(self, action):
        """Take one step in the world.

        Consumes one unit of ``lifespan``, lets the world move the agent
        and asks the world for the reward at the new position.

        Args:
            action: Key into ``self.actions``.

        Returns:
            Tuple ``(pos, reward)`` with the agent's new position and the
            reward received.
        """
        self.lifespan -= 1

        self.pos = self.world.get_state_transition(self.pos, self.actions[action])
        reward = self.world.get_reward(self.pos)

        return self.pos, reward

In [None]:
class Trainer():
    """Drives training of a single ``Agent`` using settings from a config dict."""

    def __init__(self, cfg):
        """
        Build the trainer from ``cfg``.

        Keys read: 'world', 'lifespan', 'max_episodes', 'gamma', 'alpha'.
        Further keys may be added to the config as needed.
        """
        environment = cfg['world']
        step_budget = cfg['lifespan']

        self.world = environment
        self.agent = Agent(environment, step_budget)

        self.max_episodes = cfg['max_episodes']
        self.lifespan = step_budget

        # Stored for the update rules; the training methods that would use
        # them are not written yet.
        self.gamma = cfg['gamma']
        self.alpha = cfg['alpha']

    def sarsa_train(self):
        """Placeholder for SARSA training; currently does nothing."""
        # TODO
        return None

    def q_learning_train(self):
        """Placeholder for Q-learning training; currently does nothing."""
        # TODO
        return None

    def train(self, mode):
        """Run the training routine selected by ``mode``.

        Args:
            mode: 'sarsa' or 'q-learning'.

        Raises:
            NotImplementedError: For any other ``mode``.
        """
        if mode == 'sarsa':
            self.sarsa_train()
            return

        if mode == 'q-learning':
            self.q_learning_train()
            return

        raise NotImplementedError

In [None]:
# One environment instance, shared by both training runs below.
world = GridWorld()

# Settings consumed by Trainer.
cfg = dict(
    world=world,

    # Episode limits
    max_episodes=1000,
    lifespan=100,

    # Learning hyper-parameters
    gamma=0.99,
    alpha=0.5,
)

# Each algorithm gets its own Trainer built from the same config.
Trainer(cfg).train(mode='sarsa')
Trainer(cfg).train(mode='q-learning')