<h1 align="center" style="font-family:Times New Roman;"> 
    <b> IML project : QLearning Multiagent </b>
</h1>

<h1 align="center" style="font-family:Times New Roman;"> 
    <b> Police, Thief and the Diamond </b>
</h1>

<h2 align="right">By:<br>
    Samratsinh Sudhirsinh Dhumal<br>
    Akshaykumar Rao Racherla<br>
    Akhil Koppera<br>
</h2>

The aim of this project is to automate a game using approximate dynamic programming. 
The agents in the game are self-learning and adapt their tactics to the environment.

### Importing Packages

In [None]:
import sys
import numpy as np
import pickle
import random
import pygame
import random
import time

from collections import defaultdict
from collections import deque 

### Defining Colours

In [None]:
#COLOURS = (R, G, B)
# RGB tuples passed to pygame drawing / text-rendering calls in this notebook.

WHITE = (255,255,255)   # background fill of the game window
GRAY = (105, 105, 105)  # obstacle cells
N_BLUE = (0,0,128)      # "Episode: N" caption
RED = (255, 0, 0)       # "thief caught" caption and highlight rectangle
GREEN = (0, 255, 0)     # "diamond stolen" highlight rectangle
AQUA = (0 ,128, 128)    # "diamond stolen" caption; NOTE(review): same RGB value as TEAL below
TEAL = (0, 128, 128)    # separator line above the info panel

### Importing Grid File and saving the obstacles as an array

In [None]:
# Parse the grid file into a list of obstacle cells.
# Every character that is not a space marks an obstacle at [column, row].
#
# NOTE: lines are deliberately NOT stripped. A trailing newline is therefore
# also a non-space character and produces an entry one column past the last
# visible cell, and the final value of `j` equals the full length of the last
# line read. Game_Matrix uses `j - 1` and `count` as its default dimensions,
# so both values are kept exactly as before.
with open('Grid_Files/grid1.txt', 'r') as file:   # closed as soon as the lines are read
    Lines = file.readlines()

obstacles = []   # [x, y] coordinates of every obstacle cell
count = 0        # row index while parsing; number of lines once the loop ends
for line in Lines:
    j = 0        # column index within the current line
    for ch in line:
        if ch != ' ':
            obstacles.append([j, count])
        j += 1
    count += 1

In [None]:
# Notebook cell output: show the parsed [x, y] obstacle coordinates for inspection.
obstacles

<h3 align="center">
    Q - Table Equation
</h3>

![title](pics/ql.jpeg)

### Defining Agents

In [None]:
class Q_Agent:
    '''
        Tabular Q-learning agent with an epsilon-greedy behaviour policy.

        The Q-table maps a hashable state to a numpy array of ``nA`` action
        values; unseen states start at zero.

        Parameters
        ----------
        env       : environment object (stored only, never called by the agent)
        alpha     : learning rate used in the Q update
        nA        : number of discrete actions
        gamma     : discount factor
        eps_start, eps_decay, eps_min : exploration settings (stored only; the
                    caller passes the current epsilon to ``greedy_action``)
    '''

    def __init__(self, env, alpha, nA, gamma=1.0, eps_start=1.0, eps_decay=0.9999, eps_min=0.05):
        self.env = env
        self.eps_start = eps_start
        self.gamma = gamma
        self.alpha = alpha
        self.eps_decay = eps_decay
        self.eps_min = eps_min
        self.nA = nA
        # The Q-TABLE. The lambda factory makes this defaultdict un-picklable,
        # which is why save() converts the policy to a plain dict first.
        self.Q = defaultdict(lambda: np.zeros(self.nA))

############################################################################################################################
#                                               gets a Greedy action
############################################################################################################################

    def greedy_action(self, state, epsilon):
        '''
            Epsilon-greedy selection: returns the best known action for
            ``state`` with probability 1-epsilon, otherwise a uniformly
            random action index in [0, nA).
        '''
        if random.random() > epsilon:
            return np.argmax(self.Q[state])
        return np.random.choice(np.arange(self.nA))

############################################################################################################################
#                                               Learn from the environment
############################################################################################################################

    def learn(self, state, action, reward, next_state):
        '''
            One-step Q-learning update of Q[state][action]:
            Q <- Q + alpha * (reward + gamma * max_a Q[next_state][a] - Q)
        '''
        td_target = reward + self.gamma * np.max(self.Q[next_state])
        self.Q[state][action] += self.alpha * (td_target - self.Q[state][action])

############################################################################################################################
#                                  Call this after learning to set a concrete policy
############################################################################################################################

    def set_policy(self):
        '''
            Freezes the greedy policy: every state present in the Q-table is
            mapped to its arg-max action; unknown states default to action 0.
        '''
        policy = defaultdict(lambda: 0)
        for state, action_values in self.Q.items():
            policy[state] = np.argmax(action_values)
        self.policy = policy

############################################################################################################################
#                                                   Take action
############################################################################################################################

    def take_action(self, state):
        '''
            Returns the action stored in the frozen policy for ``state``.
            Requires set_policy() or change_policy() to have been called.
        '''
        return self.policy[state]

############################################################################################################################
#                                  Save the agents policy and Load them
############################################################################################################################

    def save(self, i):
        '''
            Pickles the policy (as a plain dict) to ``policy{i}.pickle``.

            Saving is best effort: a missing policy, an I/O error or a
            pickling error is reported on stdout instead of being raised.
            Other exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
        '''
        try:
            policy = dict(self.policy)
            with open(f'policy{i}.pickle', 'wb') as f:
                pickle.dump(policy, f)
        except (AttributeError, OSError, pickle.PicklingError) as err:
            print('not saved:', err)

    def change_policy(self, directory):
        '''
            Loads a policy previously written by save().

            ``directory`` is the path of the pickle file.
            WARNING: pickle.load can execute arbitrary code; only load files
            produced by this project.
        '''
        with open(directory, 'rb') as f:
            policy_new = pickle.load(f)
        # wrap in a defaultdict again so unknown states fall back to action 0
        self.policy = defaultdict(lambda: 0, policy_new)
        print('policy Loaded')

### Defining Police

In [None]:
class Police():
    """Drawable sprite for the police agent on the grid."""

    def __init__(self, gameDisplay, width, height):
        """Remember the target surface and cell size, and load the sprite scaled to one cell."""
        self.DISPLAY = gameDisplay
        self.WIDTH, self.HEIGHT = width, height

        sprite = pygame.image.load('pics/police.png')
        self.IMG = pygame.transform.scale(sprite, (self.WIDTH, self.HEIGHT))

    def draw(self, x, y):
        """Blit the sprite at grid cell (x, y), converted to pixel coordinates."""
        pixel_pos = (x*self.WIDTH, y*self.HEIGHT)
        self.DISPLAY.blit(self.IMG, pixel_pos)

### Defining Thief

In [None]:
class Thief(Q_Agent):
    """Drawable sprite for the thief agent on the grid.

    NOTE(review): the class derives from Q_Agent, but this constructor does
    not call Q_Agent.__init__, so none of the attributes that constructor
    assigns are set on a Thief instance.
    """

    def __init__(self, gameDisplay, width, height):
        """Remember the target surface and cell size, and load the sprite scaled to one cell."""
        self.DISPLAY = gameDisplay
        self.WIDTH, self.HEIGHT = width, height

        sprite = pygame.image.load('pics/thief.png')
        self.IMG = pygame.transform.scale(sprite, (self.WIDTH, self.HEIGHT))

    def draw(self, x, y):
        """Blit the sprite at grid cell (x, y), converted to pixel coordinates."""
        pixel_pos = (x*self.WIDTH, y*self.HEIGHT)
        self.DISPLAY.blit(self.IMG, pixel_pos)

### Defining Environment

In [None]:
class Game_Env():
    '''
        Grid-world environment with a thief, a police agent, a diamond and
        static obstacles, rendered with pygame.

        Actions are integers: 0 = left, 1 = right, 2 = up, 3 = down.
        Each step costs every agent -1; the thief gets +50 for reaching the
        diamond; when the police lands on the thief the police gets +50 and
        the thief -20.
    '''

    # Penalty for stepping into an obstacle. It must be finite: with
    # -float('inf') the Q-learning update first stores -inf and, on the next
    # visit, computes (-inf) - (-inf) = NaN, which poisons the Q-table.
    OBSTACLE_PENALTY = -100

    # (dx, dy) for every action index; any other action means "stay".
    _ACTION_DELTAS = {0: (-1, 0), 1: (1, 0), 2: (0, -1), 3: (0, 1)}

    def __init__(self, gameDisplay, game_matrix):
        '''
            gameDisplay : pygame surface to draw on
            game_matrix : object exposing ROWS, COLUMNS and OBSTACLES
        '''
        # ROWS is used as the horizontal cell count and COLUMNS as the vertical one
        self.HEIGHT = game_matrix.COLUMNS  #height of the environment
        self.WIDTH = game_matrix.ROWS   #width of the environment

        self.DISPLAY = gameDisplay   #will be used for rendering

        display_width, display_height = gameDisplay.get_size()
        display_height -= 100   #since we need some space to show important data.

        self.BLOCK_WIDTH = int(display_width/self.WIDTH)
        self.BLOCK_HEIGHT = int(display_height/self.HEIGHT)

        #defining agents
        self.POLICE = Police(self.DISPLAY, self.BLOCK_WIDTH, self.BLOCK_HEIGHT)
        self.THIEF = Thief(self.DISPLAY, self.BLOCK_WIDTH, self.BLOCK_HEIGHT)
        self.MOVES = {'thief':150,'police':150}

        self.OBSTACLES = game_matrix.OBSTACLES

        #and finally the golden diamond
        self.DIAMOND_IMG = pygame.transform.scale(pygame.image.load('pics/diamond.png'),
                                                 (self.BLOCK_WIDTH, self.BLOCK_HEIGHT))

############################################################################################################################
#                                         Returns the state of the environment
############################################################################################################################

    def get_state(self):
        '''
            Builds, stores in self.STATE and returns the observation of both agents.

            thief  : (thief x, thief y, diamond x, diamond y,
                      thief - police offset x, y, thief - diamond offset x, y)
            police : (police x, police y, thief x, thief y,
                      police - thief offset x, y)
        '''
        #later on give diamond state to police also

        self.STATE = {'thief':(self.THIEF_X,self.THIEF_Y,self.DIAMOND_X,self.DIAMOND_Y,
                               self.THIEF_X - self.POLICE_X, self.THIEF_Y - self.POLICE_Y,
                               self.THIEF_X - self.DIAMOND_X,self.THIEF_Y -  self.DIAMOND_Y),
                        'police':(self.POLICE_X,self.POLICE_Y,self.THIEF_X,self.THIEF_Y,
                                  self.POLICE_X - self.THIEF_X, self.POLICE_Y - self.THIEF_Y)}

        return self.STATE

############################################################################################################################
#                                                   Environment reset
############################################################################################################################

    def reset(self):
        '''
            Starts a new episode: thief, police and diamond are placed on three
            distinct, obstacle-free cells inside the grid and both move budgets
            are set to 1000. Returns the initial state.

            Raises ValueError when the grid has fewer than three free cells.
        '''
        blocked = {(obs[0], obs[1]) for obs in self.OBSTACLES}
        free_cells = [(x, y)
                      for x in range(self.WIDTH)
                      for y in range(self.HEIGHT)
                      if (x, y) not in blocked]
        if len(free_cells) < 3:
            raise ValueError('grid needs at least three obstacle-free cells')

        # sampling without replacement guarantees three different cells
        picks = np.random.choice(len(free_cells), size=3, replace=False)
        thief_pos, police_pos, dia_pos = (free_cells[int(k)] for k in picks)

        self.THIEF_X, self.THIEF_Y = thief_pos
        self.POLICE_X, self.POLICE_Y = police_pos
        self.DIAMOND_X, self.DIAMOND_Y = dia_pos

        self.MOVES['thief'] = 1000
        self.MOVES['police'] = 1000

        return self.get_state()

############################################################################################################################
#                                             Rendering the environment
############################################################################################################################

    def render(self, i_episode=-1):
        '''
            rendering the environment using pygame display;
            the episode caption is drawn only when i_episode >= 0
        '''

        #drawing our agents
        self.THIEF.draw(self.THIEF_X, self.THIEF_Y)
        self.POLICE.draw(self.POLICE_X, self.POLICE_Y)

        self.DISPLAY.blit(self.DIAMOND_IMG, (self.DIAMOND_X*self.BLOCK_WIDTH, self.DIAMOND_Y*self.BLOCK_HEIGHT))

        #drawing obstacles
        for pos in self.OBSTACLES:
            pygame.draw.rect(self.DISPLAY, GRAY, [pos[0]*self.BLOCK_WIDTH, pos[1]*self.BLOCK_HEIGHT, self.BLOCK_WIDTH, self.BLOCK_HEIGHT])

        if i_episode>=0:
            self.display_episode(i_episode)

############################################################################################################################
#                                  Agents takes step and the environment changes
############################################################################################################################

    def step(self,thief_action, police_action):
        '''
            Training step: agents may walk into obstacles; they are then put
            back on their previous cell and receive OBSTACLE_PENALTY.
            Returns (state, reward, done, info).
        '''
        return self._advance(thief_action, police_action, self.update_positions)

    #-----------------------------------------------------------------------------------------------

    def step2(self,thief_action, police_action):
        '''
            Playback step: a move that would enter an obstacle is replaced by a
            random non-obstacle move (see update_positions2).
            Returns (state, reward, done, info).
        '''
        return self._advance(thief_action, police_action, self.update_positions2)

    def _advance(self, thief_action, police_action, move):
        '''
            Shared body of step()/step2(); ``move`` applies the two actions
            to the agents' positions.
        '''
        reward = {'thief':-1, 'police':-1}
        done = False
        info = {
            'diamond_stolen': False,
            'thief_caught': False,
            'x': -1, 'y': -1,
            'width':self.BLOCK_WIDTH,
            'height':self.BLOCK_HEIGHT
        }

        #decreasing the no. of moves
        self.MOVES['police'] -= 1
        self.MOVES['thief'] -= 1
        #done if moves = 0
        if self.MOVES['police'] == 0 or self.MOVES['thief'] == 0:
            done = True

        # remembered so that an agent bumping into an obstacle can be put back
        prev_thief = (self.THIEF_X, self.THIEF_Y)
        prev_police = (self.POLICE_X, self.POLICE_Y)

        move(thief_action, police_action)

        if self._is_obstacle(self.THIEF_X, self.THIEF_Y):
            reward['thief'] = self.OBSTACLE_PENALTY
            self.THIEF_X, self.THIEF_Y = prev_thief

        if self._is_obstacle(self.POLICE_X, self.POLICE_Y):
            reward['police'] = self.OBSTACLE_PENALTY
            self.POLICE_X, self.POLICE_Y = prev_police

        #thief reached the diamond
        if self.THIEF_X == self.DIAMOND_X and self.THIEF_Y == self.DIAMOND_Y:
            done = True
            reward['thief'] = 50
            info['diamond_stolen'], info['x'], info['y'] = True,  self.THIEF_X, self.THIEF_Y

        #police caught the thief (checked last, so it overrides the diamond reward)
        if self.POLICE_X == self.THIEF_X and self.POLICE_Y == self.THIEF_Y:
            done = True
            reward['police'] = 50
            reward['thief'] = -20
            info['thief_caught'], info['x'], info['y'] = True,  self.THIEF_X, self.THIEF_Y

        return self.get_state(), reward, done, info

    def _is_obstacle(self, x, y):
        '''True when cell (x, y) is listed in self.OBSTACLES.'''
        return any(x == obs[0] and y == obs[1] for obs in self.OBSTACLES)

############################################################################################################################
#                               Function to show number of Episodes
############################################################################################################################

    def display_episode(self,epsiode):
        '''Draws the "Episode: N" caption at pixel position (50, 610).'''
        font = pygame.font.SysFont(None, 30)
        text = font.render("Episode: "+str(epsiode), True, N_BLUE)
        self.DISPLAY.blit(text,(50,610))

############################################################################################################################
#                                  Decide position changes based on action taken
############################################################################################################################

    def get_changes(self, action):
        '''
            Maps an action index to (x_change, y_change):
            0 left, 1 right, 2 up, 3 down; anything else is (0, 0).
        '''
        return self._ACTION_DELTAS.get(action, (0, 0))

############################################################################################################################
#                                            updating positions of agents
############################################################################################################################

    def update_positions(self, thief_action, police_action):
        '''
            Moves both agents by their chosen action and clamps them to the
            grid. Obstacles are NOT considered here; step() handles them.
        '''
        x_change_thief, y_change_thief = self.get_changes(thief_action)
        x_change_police, y_change_police = self.get_changes(police_action)

        self.THIEF_X, self.THIEF_Y = self.fix(self.THIEF_X + x_change_thief,
                                              self.THIEF_Y + y_change_thief)
        self.POLICE_X, self.POLICE_Y = self.fix(self.POLICE_X + x_change_police,
                                                self.POLICE_Y + y_change_police)

    def update_positions2(self, thief_action, police_action):
        '''
            Like update_positions, but a move that would enter an obstacle is
            replaced by a uniformly random move that does not. An agent with
            no such move stays where it is instead of looping forever.
        '''
        thief_x, thief_y = self._resolve_move(self.THIEF_X, self.THIEF_Y, thief_action)
        police_x, police_y = self._resolve_move(self.POLICE_X, self.POLICE_Y, police_action)

        self.THIEF_X, self.THIEF_Y = self.fix(thief_x, thief_y)
        self.POLICE_X, self.POLICE_Y = self.fix(police_x, police_y)

    def _resolve_move(self, x, y, action):
        '''
            Returns the cell reached from (x, y) by ``action``; if that cell is
            an obstacle, a random non-obstacle neighbour instead, or (x, y)
            itself when every neighbour is an obstacle. The result may lie
            outside the grid; the caller clamps it with fix().
        '''
        x_change, y_change = self.get_changes(action)
        target = (x + x_change, y + y_change)
        if not self._is_obstacle(*target):
            return target

        options = []
        for dx, dy in self._ACTION_DELTAS.values():
            if not self._is_obstacle(x + dx, y + dy):
                options.append((x + dx, y + dy))
        if not options:
            return x, y
        return options[int(np.random.choice(len(options)))]

############################################################################################################################
#                                  Reprocessing to put the agents back in the grid
############################################################################################################################

    def fix(self, x, y):
        '''Clamps (x, y) to the grid: 0 <= x <= WIDTH-1 and 0 <= y <= HEIGHT-1.'''
        # If agents out of bounds, fix!
        if x < 0:
            x = 0
        elif x > self.WIDTH-1:
            x = self.WIDTH-1
        if y < 0:
            y = 0
        elif y > self.HEIGHT -1:
            y = self.HEIGHT -1

        return x, y

############################################################################################################################
#                    Game_Matrix class that abstracts the specific features of the game from the environment
############################################################################################################################

class Game_Matrix:
    """Plain container for the grid description handed to Game_Env.

    The defaults are evaluated once, when the class is defined, from the
    module-level values ``j`` and ``count`` left behind by the grid parser;
    the obstacle list is always the module-level ``obstacles``.
    """

    def __init__(self, rows=j-1, columns=count):
        # Game_Env reads ROWS as the grid width and COLUMNS as the grid height.
        self.ROWS, self.COLUMNS = rows, columns
        self.OBSTACLES = obstacles

### Training the model

In [None]:
############################################################################################################################
#                                    Training the agents on the grid
############################################################################################################################

# Window size in pixels; the bottom 100 px are reserved by Game_Env for the info panel.
display_width, display_height = 600, 700

pygame.init()
pygame.display.set_caption('Grid environment')
gameDisplay = pygame.display.set_mode((display_width,display_height))
clock = pygame.time.Clock()

game_matrix = Game_Matrix()
env = Game_Env(gameDisplay, game_matrix)


def _load_q_table(path):
    """Return the pickled Q-table stored at *path*, or {} when the file does not exist yet.

    WARNING: pickle.load can execute arbitrary code - only load checkpoints
    written by this notebook.
    """
    try:
        with open(path, 'rb') as q_file:   # handle is closed even if unpickling fails
            return pickle.load(q_file)
    except FileNotFoundError:
        # first run: no checkpoint yet, start from an empty table
        return {}


#initialising the agents
police = Q_Agent(env, alpha = 0.1, nA = 4)
police_tmp = _load_q_table('police_q.p')
thief = Q_Agent(env, alpha = 0.1, nA = 4)
thief_tmp = _load_q_table('thief_q.p')

# warm-start the Q-tables from the last checkpoint (no-op when none was found)
for k in police_tmp.keys():
    police.Q[k] = police_tmp[k]
for k in thief_tmp.keys():
    thief.Q[k] = thief_tmp[k]


def show_info(diamond, thief):
    """Draw the info panel: a separator line plus the two running totals.

    diamond -- number of diamonds stolen so far
    thief   -- number of times the thief was caught so far
    """
    # separator between the grid and the info panel
    pygame.draw.rect(gameDisplay, TEAL, [0, 600, 600, 5])

    panel_font = pygame.font.SysFont(None, 30)
    captions = (
        ("Total Diamond Stolen: "+str(diamond), AQUA, (50,640)),
        ("Total Thief Caught: "+str(thief), RED, (50,670)),
    )
    for caption, colour, position in captions:
        gameDisplay.blit(panel_font.render(caption, True, colour), position)

#indicative rectangle to show diamond stolen or thief caught
def draw_rect(color, x, y, width, height):
    """Outline grid cell (x, y) in *color*, refresh the display and pause for 2 seconds.

    width / height are the pixel dimensions of one grid cell.
    """
    cell_outline = [x*width, y*height, width, height]
    border_px = 10
    pygame.draw.rect(gameDisplay, color, cell_outline, border_px)
    pygame.display.update()
    time.sleep(2)

# running totals shown in the info panel
total_thief_caught = 0
total_diamond_stolen = 0

# exploration schedule: epsilon is multiplied by eps_decay every episode, never below eps_min
epsilon, eps_decay, eps_min = 1.0, 0.99, 0.05

#number of episodes to train
num_episodes = 3000

# loop over episodes
for i_episode in range(1, num_episodes+1):
    # monitor progress and checkpoint both Q-tables every 100 episodes
    if i_episode % 100 == 0:
        print("\rEpisode {}/{}".format(i_episode, num_episodes), end="")
        # plain dict copies: the defaultdict Q-tables use a lambda factory and cannot be pickled
        police_tmp = dict(police.Q)
        thief_tmp = dict(thief.Q)
        # 'with' closes (and therefore flushes) each checkpoint file right away
        with open("thief_q.p","wb") as thief_file:
            pickle.dump(thief_tmp, thief_file)
        with open("police_q.p","wb") as police_file:
            pickle.dump(police_tmp, police_file)
        sys.stdout.flush() 

    epsilon = max(epsilon*eps_decay, eps_min)
    
    state = env.reset()
    action_thief = thief.greedy_action(state['thief'], epsilon)
    action_police = police.greedy_action(state['police'], epsilon)
    
    #render the environment         
    env.render(i_episode)

    while True:
        
        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()   #close the window
                quit() 

        
        next_state, reward, done, info = env.step(action_thief, action_police)
        
        # each agent learns from its own view of the transition
        thief.learn(state['thief'], action_thief, reward['thief'], next_state['thief'])
        police.learn(state['police'], action_police, reward['police'], next_state['police'])
        
        gameDisplay.fill(WHITE)         
        env.render(i_episode)
        show_info(total_diamond_stolen, total_thief_caught)

        #updating the display
        pygame.display.update()
        clock.tick(100)
        
        if done:
            if info['diamond_stolen']:
                total_diamond_stolen += 1
                draw_rect(GREEN, info['x'], info['y'], info['width'], info['height'])       
            
            if info['thief_caught']:
                total_thief_caught += 1
                draw_rect(RED, info['x'], info['y'], info['width'], info['height'])       
            break
       
        #update state and action
        state = next_state
        action_thief = thief.greedy_action(state['thief'], epsilon)
        action_police = police.greedy_action(state['police'], epsilon)

# freeze the greedy policies learned during training
police.set_policy()
thief.set_policy()

#saving the policies
police.save('_police')
thief.save('_thief')

### Testing the Model on the saved policy

In [None]:
############################################################################################################################
#                                             Testing the agents
############################################################################################################################

#loading the policy
police.change_policy('policy_police.pickle')
thief.change_policy('policy_thief.pickle')

total_thief_caught = 0
total_diamond_stolen = 0

num_episodes = 1

# play the saved policies for num_episodes episodes
for i_episode in range(1, num_episodes+1):

    state = env.reset()
    action_thief = thief.take_action(state['thief'])
    action_police = police.take_action(state['police'])
    # NOTE(review): these two are set once per episode and never refreshed
    # inside the loop below, so later actions are always compared with the
    # episode's FIRST action - confirm whether that is intended.
    pre_action_thief = action_thief
    pre_action_police = action_police

    #render the environment
    env.render(i_episode)

    while True:

        for event in pygame.event.get():
            if event.type == pygame.QUIT:
                pygame.quit()
                quit()

        next_state, reward, done, info = env.step2(action_thief, action_police)

        # redraw the frame at 5 frames per second
        gameDisplay.fill(WHITE)
        env.render(i_episode)
        show_info(total_diamond_stolen, total_thief_caught)
        pygame.display.update()
        clock.tick(5)

        if not done:
            #update state and action
            state = next_state
            action_thief = thief.take_action(state['thief'])
            action_police = police.take_action(state['police'])

            # if two actions have same reward choosing a random action from the policy
            # (triggered when the new action index differs from the remembered one by exactly 1)
            if abs(pre_action_thief - action_thief) == 1:
                action_thief = np.random.choice(4)
            if abs(pre_action_police - action_police) == 1:
                action_police = np.random.choice(4)
            continue

        # episode finished: update the totals and highlight the deciding cell
        if info['diamond_stolen']:
            total_diamond_stolen += 1
            draw_rect(GREEN, info['x'], info['y'], info['width'], info['height'])

        if info['thief_caught']:
            total_thief_caught += 1
            draw_rect(RED, info['x'], info['y'], info['width'], info['height'])

        break


time.sleep(2)
pygame.quit()