# Grid World --- past all grids (agent prefer to visit unvisited grids)
##### Charles Zhang
##### Jun 17

###  1. Cover and Get Back

In [5]:
import numpy as np

In [6]:
BOARD_ROWS = 4
BOARD_COLS = 5
START = (0, 0)
END = (0, 0)

In [7]:
class State:

    def __init__(self, state=START):
        self.board = np.zeros([BOARD_ROWS, BOARD_COLS])
        self.state = state    # tuple of the coordinate
        self.is_end = False

    def check_end(self):
        if self.state == END:
            self.is_end = True

    def next_position(self, action):
        if action == "up":
            next_state = (self.state[0] - 1, self.state[1])
        elif action == "down":
            next_state = (self.state[0] + 1, self.state[1])
        elif action == "left":
            next_state = (self.state[0], self.state[1] - 1)
        else:
            next_state = (self.state[0], self.state[1] + 1)
        # boundary condition 
        if (next_state[0] >= 0) and (next_state[0] < BOARD_ROWS):
            if (next_state[1] >= 0) and (next_state[1] < BOARD_COLS):
                return next_state
        return self.state

In [10]:
class Agent:

    def __init__(self):
        self.actions = ["up", "down", "left", "right"]      # space
        self.State = State()
        self.is_end = self.State.is_end
        self.alpha = 0.3      # learning rate
        self.exp_rate = 1     # epsilon-greedy parameter
        self.decay_gamma = 0.9
        self.Q_values = {}             # init Q table (dict)
        for i in range(BOARD_ROWS):    
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0
        # init a list to check if each grid is past          
        self.past_all = np.zeros((BOARD_ROWS, BOARD_COLS), dtype=bool).tolist()
        self.steps = []         # store the steps for each episode 
    
    def optimal_action(self):
        # greedy move
        max_value = -10000
        action = ""
        for a in self.actions:
            next_value = self.Q_values[self.State.state][a]
            if next_value >= max_value:
                action = a
                max_value = next_value
        return action
    
    def get_action(self):
        """
        The agent should choose randomly among the positions that have
        not been visited, and if all possible positions are visited, 
        then move randomly and receive a negative reward
        """
        if np.random.uniform(0, 1) <= self.exp_rate:
            # Get four potential positions of the current state
            t = []
            for a in self.actions:
                p = self.State.next_position(a)
                if self.past_all[p[0]][p[1]] is True:
                    t.append(1)
                else:
                    t.append(0)
            if sum(t) == len(t):
                return np.random.choice(self.actions)
            # Else try to get an available unvisited position randomly
            else:
                while True:
                    action = np.random.choice(self.actions)
                    next_state = self.State.next_position(action)
                    if self.past_all[next_state[0]][next_state[1]] is False:
                        return action
                    else:
                        continue
        else:
            return self.optimal_action()
    
    def give_reward(self):
        """
        + 1 reward for visiting the unvisited grid
        -.1 reward for visiting the visited grid
        """
        reward = 0
        if self.past_all[self.State.state[0]][self.State.state[1]] is False:
            reward += 1
        else:
            reward -= .1
        return reward
    
    def set_action(self, action):
        # set current stat past
        curr_state = self.State.state
        self.past_all[curr_state[0]][curr_state[1]] = True
        # get the next state
        next_state = self.State.next_position(action)  
        self.State = State(state=next_state)
        reward = self.give_reward()
        # set the next state past
        if self.past_all[next_state[0]][next_state[1]] is False:
            self.past_all[next_state[0]][next_state[1]] = True
        # give the global reward if finish an epsiode
        if self.State.state==START and self.check_all_past():
            reward += 1
        # update the Q table
        qs_of_next_state = []
        for q_value in self.Q_values[next_state]:  
            qs_of_next_state.append(self.Q_values[next_state][q_value])
        delta = self.alpha*(reward + self.decay_gamma*(max(qs_of_next_state)) -
                            self.Q_values[curr_state][action])
        self.Q_values[curr_state][action] = round(self.Q_values[curr_state][action]+delta, 4)

    def reset(self):
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.past_all[i][j] = False
        self.State = State()
        self.is_end = self.State.is_end
    
    def check_all_past(self):
        for i in self.past_all:
            for j in i:
                if j is False:
                    return False
        return True
    
    def show_path(self):
        for i in range(BOARD_ROWS):
            print('---------------------------------------')
            row_string = "| "
            for j in range(BOARD_COLS):
                best_move = ""
                best_val = -1000
                for a in self.Q_values[(i,j)]:
                    if self.Q_values[(i,j)][a] > best_val:
                        best_val = self.Q_values[(i,j)][a]
                        best_move = a            
                row_string = row_string + " " + best_move + " |"
            print(row_string)
        print('---------------------------------------')

    def train(self, rounds=1000):
        print("Training...")
        for r in range(rounds): 
            self.reset()
            self.exp_rate *= 0.99  # decaying epsilon-greedy
            step = 0
            while True:
                action = self.get_action()
                self.set_action(action)
                self.State.check_end()
                self.is_end = self.State.is_end
                step += 1
                if self.is_end and self.check_all_past():
                    break
            self.steps.append(step)
        print("Training finished!")

In [11]:
agent = Agent()
agent.train()

Training...
Training finished!


In [12]:
agent.Q_values

{(0, 0): {'down': -0.9633, 'left': -0.9931, 'right': 9.9735, 'up': -0.9722},
 (0, 1): {'down': -0.9604, 'left': -0.9932, 'right': 9.9716, 'up': -0.9931},
 (0, 2): {'down': 9.9697, 'left': -0.9914, 'right': -0.931, 'up': -0.9915},
 (0, 3): {'down': -0.9915, 'left': -0.9917, 'right': 9.9631, 'up': -0.964},
 (0, 4): {'down': 9.9606, 'left': -0.9928, 'right': -0.9128, 'up': -0.8829},
 (1, 0): {'down': -0.967, 'left': -0.9931, 'right': -0.9051, 'up': 9.8756},
 (1, 1): {'down': -0.9802, 'left': 9.8872, 'right': -0.9664, 'up': -0.9919},
 (1, 2): {'down': -0.9917, 'left': -0.9597, 'right': 9.9676, 'up': -0.9917},
 (1, 3): {'down': 2.2666, 'left': -0.9917, 'right': 2.3402, 'up': 9.9654},
 (1, 4): {'down': 9.9577, 'left': -0.9621, 'right': -0.9924, 'up': -0.9933},
 (2, 0): {'down': -0.9233, 'left': -0.9772, 'right': 9.9073, 'up': -0.8686},
 (2, 1): {'down': -0.8777, 'left': -0.9914, 'right': -0.9913, 'up': 9.8977},
 (2, 2): {'down': 9.9365, 'left': -0.8815, 'right': -0.9915, 'up': -0.9932},
 (2,

In [15]:
import matplotlib.pyplot as plt
%matplotlib inline



x = []
for i in range(1000):
    x.append(i)

def smooth(y, box_pts):
    box = np.ones(box_pts)/box_pts
    y_smooth = np.convolve(y, box, mode='same')
    return y_smooth

y = agent.steps

plt(x, smooth(y,3), 'r-', lw=2)



plt.ylabel('Number of Steps')
plt.xlabel('Episode')
plt.title('Q Learning Convergence')
plt.show()

TypeError: 'module' object is not callable

In [None]:
min(agent.steps)

#### Optimal Policy

In [None]:
agent.show_path()

### 2. If the connected graph does not have a Euler's Path

In [None]:
BOARD_ROWS = 3
BOARD_COLS = 3
START = (0, 0)
END = (0, 0)
agent = Agent()
agent.train(rounds=100)
x = []
for i in range(100):
    x.append(i)
y = agent.steps
plt.plot(x,y)
plt.ylabel('Number of Steps')
plt.xlabel('Episode')
plt.title('Q Learning Convergence')
plt.show()

#### If I set up the restriction that the agent should come back once covered all grids, not surprisingly, values in Q tables are very closed. 

In [None]:
agent.Q_values

In [None]:
min(agent.steps)

### 3. Task for only coverage not getting back

In [None]:
BOARD_ROWS = 3
BOARD_COLS = 3
START = (0, 0)
# END = (0, 0)

class Agent:
    """
    Only coverage not get back 
    Same code except for cheking end
    """

    def __init__(self):
        self.actions = ["up", "down", "left", "right"]      # space
        self.State = State()
        self.is_end = self.State.is_end
        self.alpha = 0.3
        self.exp_rate = 1
        self.decay_gamma = 0.9
        self.Q_values = {}             # init Q values (dict)
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.Q_values[(i, j)] = {}
                for a in self.actions:
                    self.Q_values[(i, j)][a] = 0
        self.past_all = []
        for i in range(BOARD_ROWS):
            t = []
            for j in range(BOARD_COLS): 
                t.append(False)
            self.past_all.append(t)
        self.steps = []
    
    def optimal_action(self):
        max_value = -100
        action = ""
        for a in self.actions:
            next_value = self.Q_values[self.State.state][a]
            if next_value >= max_value:
                action = a
                max_value = next_value
        return action
    
    def get_action(self):
        if np.random.uniform(0, 1) <= self.exp_rate:
             t = []
            for a in self.actions:
                p = self.State.next_position(a)
                if self.past_all[p[0]][p[1]] is True:
                    t.append(1)
                else:
                    t.append(0)
            if sum(t) == len(t):
                return np.random.choice(self.actions)
            else:
                while True:
                    action = np.random.choice(self.actions)
                    next_state = self.State.next_position(action)
                    if self.past_all[next_state[0]][next_state[1]] == False:
                        return action
                    else:
                        continue
        else:
            return self.optimal_action()
    
    def give_reward(self):
        reward = 0
        if self.past_all[self.State.state[0]][self.State.state[1]] is False:
            reward += 1
        else:
            reward -= .1
        return reward
    
    def set_action(self, action):
        curr_state = self.State.state
        self.past_all[curr_state[0]][curr_state[1]] = True
        next_state = self.State.next_position(action)  
        self.State = State(state=next_state)
        reward = self.give_reward()
        if self.past_all[next_state[0]][next_state[1]] is False:
            self.past_all[next_state[0]][next_state[1]] = True
#         if self.State.state==START and self.check_all_past():
#             reward += 1
        qs_of_next_state = []
        for q_value in self.Q_values[next_state]:  
            qs_of_next_state.append(self.Q_values[next_state][q_value])
        delta = self.alpha*(reward + self.decay_gamma*(max(qs_of_next_state)) -
                            self.Q_values[curr_state][action])
        self.Q_values[curr_state][action] = round(self.Q_values[curr_state][action]+delta, 4)
        
    def reset(self):
        for i in range(BOARD_ROWS):
            for j in range(BOARD_COLS):
                self.past_all[i][j] = False
        self.State = State()
        self.is_end = self.State.is_end
    
    def check_all_past(self):
        for i in self.past_all:
            for j in i:
                if j is False:
                    return False
        return True
    
     def show_path(self):
        for i in range(BOARD_ROWS):
            print('---------------------------------------')
            row_string = "| "
            for j in range(BOARD_COLS):
                best_move = ""
                best_val = -1000
                for a in self.Q_values[(i,j)]:
                    if self.Q_values[(i,j)][a] > best_val:
                        best_val = self.Q_values[(i,j)][a]
                        best_move = a            
                row_string = row_string + " " + best_move + " |"
            print(row_string)
        print('---------------------------------------')
        
    def train(self, rounds=100):
        print("Training...")
        for r in range(rounds): 
            self.reset()
            self.exp_rate *= 0.9
            step = 0
            while True:
                action = self.get_action()
                self.set_action(action)
                self.State.check_end()
                self.is_end = self.State.is_end
                step += 1
                #if (self.is_end == True) and (self.check_all_past() == True):
                if self.check_all_past() == True:
                    break
            self.steps.append(step)
        print("Training finished!")

In [None]:
agent = Agent()
agent.train()
x = []
for i in range(100):
    x.append(i)
y = agent.steps
plt.plot(x,y)
plt.ylabel('Number of Steps')
plt.xlabel('Episode')
plt.title('Q Learning Convergence')
plt.show()

In [None]:
agent.Q_values

In [None]:
min(agent.steps)

#### Optimal  Policy

In [None]:
agent.show_path()