In [1]:
import pygame

import random
from collections import deque
import numpy as np
from helper import *

pygame 2.6.1 (SDL 2.28.4, Python 3.10.12)
Hello from the pygame community. https://www.pygame.org/contribute.html


In [2]:
# --- Tabular-state dimensions -------------------------------------------
grid_size = 5        # side of the local window around the player; local coordinates are 0..4
num_enemies = 4      # how many enemies are placed per game
visited_states = 2   # "visited" flag is binary (0 or 1)
wall_states = 2      # "wall" flag is binary (0 = no wall, 1 = wall)

# One (dx, dy) step per row; the direction labels are the author's naming.
movements = np.array([
    (0, 1),    # up
    (1, 0),    # right
    (-1, 0),   # left
    (0, -1),   # down
])

# Wall encoding used in the mazes: 0 for no wall, 1 for wall.

# Module-level log; starts out empty.
state_action_log = []


In [3]:
# Dense (array-backed) value/policy tables indexed by
# (visited flag, wall flag, enemy1_x, enemy1_y, ..., enemyN_x, enemyN_y).
# NOTE(review): a later cell rebinds `policy` and `estimated_value_grid` to
# empty dicts, so the arrays built here are discarded afterwards.
base_shape = [ visited_states, wall_states]
enemy_shape = [grid_size, grid_size] * num_enemies  # Each enemy has x and y positions
full_shape = base_shape + enemy_shape

# Value table: one float per state, all zeros to start with
estimated_value_grid = np.zeros(full_shape, dtype=float)

# Pick a random action index (0..3) for every state
policy_indices = np.random.randint(0, 4, size=full_shape)

# Turn each action index into its (dx, dy) row from `movements`
policy = movements[policy_indices]

# policy now has shape full_shape + (2,); the last axis stores (dx, dy)
print("Policy Shape:", policy.shape)  # For verification

# reward_states: grid_size x grid_size grid of random 1s and 2s
reward_states = np.random.choice([1, 2], size=(grid_size, grid_size))

# Output shapes for verification
print("Estimated Value Grid Shape:", estimated_value_grid.shape)
print("Reward States Shape:", reward_states.shape)

Policy Shape: (2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 2)
Estimated Value Grid Shape: (2, 2, 5, 5, 5, 5, 5, 5, 5, 5)
Reward States Shape: (5, 5)


In [4]:
# Switch to sparse, dict-backed tables keyed by state tuples; the Q-learning
# helpers below read them with `.get(...)` and write them by key.
policy = {}
estimated_value_grid={}

In [5]:
def partition_maze_optimized(maze, s, grid_size=5):
    """Cut out the square window of ``maze`` centred on cell ``s``.

    The maze is treated as a torus: indices that fall off one edge wrap
    around to the opposite edge.

    Args:
        maze: 2-D indexable grid (``maze[row][col]``) with at least one row.
        s: ``(x, y)`` centre cell, where ``x`` indexes rows and ``y`` columns.
        grid_size: Nominal side length of the window. The window spans
            ``grid_size // 2`` cells on each side of the centre, so its real
            side length is ``2 * (grid_size // 2) + 1``. Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        ``np.ndarray`` holding the window, with the centre cell in the middle.
    """
    x, y = s
    rows, cols = len(maze), len(maze[0])
    half_grid = grid_size // 2
    offsets = range(-half_grid, half_grid + 1)

    return np.array([
        [maze[(x + dx) % rows][(y + dy) % cols] for dy in offsets]
        for dx in offsets
    ])

In [6]:
def optimized_relative_positions(maze_size, player_pos, enemy_positions, grid_size=5):
    """Express nearby enemies in the player's local window coordinates.

    The maze wraps around in both directions, so the shortest signed offset
    between player and enemy is used. Following the rest of the notebook,
    ``x`` is the row index and ``y`` the column index; the x offset therefore
    wraps with the number of rows and the y offset with the number of columns.
    (They used to be swapped, which only worked for square mazes.)

    Args:
        maze_size: ``(rows, cols)`` of the maze.
        player_pos: ``(x, y)`` of the player.
        enemy_positions: Iterable of ``(x, y)`` enemy cells.
        grid_size: Side length of the local window centred on the player.

    Returns:
        List of ``(local_x, local_y)`` tuples, one per enemy that falls inside
        the window, in the same order as ``enemy_positions``. The player sits
        at ``(grid_size // 2, grid_size // 2)``.
    """
    x, y = player_pos
    rows, cols = maze_size  # Maze dimensions
    half_grid = grid_size // 2

    def wrapped_delta(target, origin, extent):
        # Shortest signed distance from origin to target on a ring of `extent` cells.
        delta = (target - origin) % extent
        if delta > extent // 2:
            delta -= extent
        return delta

    relative_positions = []
    for ex, ey in enemy_positions:
        dx = wrapped_delta(ex, x, rows)
        dy = wrapped_delta(ey, y, cols)

        # Keep only enemies that are inside the local window
        if -half_grid <= dx <= half_grid and -half_grid <= dy <= half_grid:
            # Shift so that window coordinates start at 0
            relative_positions.append((int(dx + half_grid), int(dy + half_grid)))
    return relative_positions

In [7]:
def check_collision(current_state, enemies):
    """Tell whether any enemy occupies the given cell.

    Args:
        current_state: ``(x, y)`` cell to test.
        enemies: Iterable of dicts, each with a two-element ``"pos"`` entry.

    Returns:
        ``True`` as soon as one enemy position equals the cell, else ``False``.
    """
    px, py = current_state
    enemy_cells = (foe["pos"] for foe in enemies)
    return any((px, py) == (ex, ey) for ex, ey in enemy_cells)


In [8]:
def is_wall_ahead(current_state, action, reward_maze):
    """Report whether taking ``action`` from ``current_state`` lands on a wall.

    The target cell is found with wrap-around on both axes, and a cell value
    of 1 counts as a wall.

    Args:
        current_state: ``(x, y)`` starting cell (row, column).
        action: ``(dx, dy)`` step.
        reward_maze: 2-D grid indexed as ``reward_maze[x][y]``.

    Returns:
        Result of comparing the target cell with 1.
    """
    row, col = current_state
    d_row, d_col = action
    n_rows = len(reward_maze)
    n_cols = len(reward_maze[0])
    target_cell = reward_maze[(row + d_row) % n_rows][(col + d_col) % n_cols]
    return target_cell == 1


In [9]:
def get_valid_actions(current_state, reward_maze):
    """List the steps from ``movements`` that do not end on a wall.

    Each candidate step is applied with wrap-around on both axes; a step is
    kept when the cell it reaches is not equal to 1.

    Args:
        current_state: ``(x, y)`` starting cell (row, column).
        reward_maze: 2-D grid indexed as ``reward_maze[x][y]``.

    Returns:
        List of ``(dx, dy)`` tuples, in the order they appear in ``movements``.
    """
    row, col = current_state
    n_rows = len(reward_maze)
    n_cols = len(reward_maze[0])

    def lands_on_open_cell(d_row, d_col):
        return reward_maze[(row + d_row) % n_rows][(col + d_col) % n_cols] != 1

    return [
        (d_row, d_col)
        for d_row, d_col in movements
        if lands_on_open_cell(d_row, d_col)
    ]


In [10]:
def _neighbor_state_key(cell, enemy_positions, reward_maze):
    """Build the value-table key describing the state as seen from ``cell``.

    The key is ``(food_flag, wall_flag, ex1, ey1, ex2, ey2, ...)`` where the
    enemy coordinates are the sorted local-window positions of the enemies
    visible from ``cell``. The window size comes from the notebook-level
    ``grid_size``.
    """
    maze_rows, maze_cols = len(reward_maze), len(reward_maze[0])
    cx, cy = cell
    local_enemies = sorted(optimized_relative_positions(
        (maze_rows, maze_cols), cell, enemy_positions, grid_size=grid_size
    ))
    food_flag = int(reward_maze[cx][cy] == 2)
    wall_flag = int(reward_maze[cx][cy] == 1)
    return (food_flag, wall_flag) + tuple(coord for pos in local_enemies for coord in pos)


def update_policy_estimated_vals(epsilon,old_indices_s,policy, prev_player_pos,estimated_value_grid, og_enemy_positions, reward_maze,pack):
    """Apply one Q-learning update and refresh the epsilon-greedy policy entry.

    The four neighbours of ``prev_player_pos`` (with wrap-around) are turned
    into state keys and looked up in ``estimated_value_grid`` (missing keys
    count as 0). Their maximum is used as the bootstrap value in

        Q <- Q + learning_rate * (R1 + discount * max_neighbour - Q)

    which is stored under ``old_indices_s``. The policy entry for
    ``old_indices_s`` is then set to a random move with probability
    ``epsilon``, otherwise to the move towards the best-valued neighbour.

    Args:
        epsilon: Exploration probability in [0, 1].
        old_indices_s: State key being updated.
        policy: Dict mapping state keys to ``(dx, dy)`` moves; mutated.
        prev_player_pos: ``(x, y)`` cell whose neighbours are evaluated.
        estimated_value_grid: Dict mapping state keys to values; mutated.
        og_enemy_positions: List of absolute ``(x, y)`` enemy cells.
        reward_maze: 2-D grid indexed as ``reward_maze[x][y]``.
        pack: ``(QSA, learning_rate, discount, R1)`` tuple.

    Returns:
        ``(policy, estimated_value_grid)`` - the same (mutated) dicts.
    """
    maze_rows, maze_cols = len(reward_maze), len(reward_maze[0])
    x, y = prev_player_pos

    # Order used when exploring; kept as-is so seeded runs pick the same move.
    move_values = [(1, 0), (-1, 0), (0, -1), (0, 1)]
    # Order used for the greedy choice; ties resolve to the earliest entry.
    greedy_order = [(1, 0), (-1, 0), (0, 1), (0, -1)]

    moves = []
    for dx, dy in greedy_order:
        # x is a row index and y a column index, so each wraps with its own
        # extent (the y neighbours used to wrap with the row count).
        neighbor = ((x + dx) % maze_rows, (y + dy) % maze_cols)
        key = _neighbor_state_key(neighbor, og_enemy_positions, reward_maze)
        moves.append((estimated_value_grid.get(key, 0), (dx, dy)))

    QSA, learning_rate, discount, R1 = pack
    # NOTE(review): the bootstrap term is taken over the neighbours of the
    # previous position, not of the position reached - confirm this is intended.
    max_future_Q = max(value for value, _ in moves)
    estimated_value_grid[old_indices_s] = QSA + learning_rate * (R1 + discount * max_future_Q - QSA)

    if random.random() < epsilon:
        # Exploration: choose a random move
        best_move = random.choice(move_values)
    else:
        # Exploitation: move towards the best-valued neighbour
        _, best_move = max(moves, key=lambda entry: entry[0])

    policy[old_indices_s] = best_move

    return policy, estimated_value_grid


In [11]:
def Q_Policy(policy, prev_player_pos,new_player_pos,estimated_value_grid, enemies, reward_maze, visited_empty_spaces,grid_size=5,learning_rate=0.1, discount=0.9,epsilon=0.1):
    """Score one player step and push the result into the Q-tables.

    State keys for the previous and the new player cell are built as
    ``(food_flag, wall_flag, ex1, ey1, ...)`` from the cell value and the
    sorted local positions of the enemies in view. The step is rewarded
    (-99999 on enemy collision, -10 on a wall cell, 20 on food, -10 on an
    empty cell), the values are printed, and
    ``update_policy_estimated_vals`` performs the actual table update.

    Args:
        policy: Dict of state key -> move.
        prev_player_pos: ``(x, y)`` cell before the step.
        new_player_pos: ``(x, y)`` cell after the step.
        estimated_value_grid: Dict of state key -> value.
        enemies: List of dicts with a ``"pos"`` entry.
        reward_maze: 2-D grid indexed as ``reward_maze[x][y]``.
        visited_empty_spaces: Running count of consecutive empty-cell steps.
        grid_size: Side of the local enemy window.
        learning_rate, discount, epsilon: Q-learning hyper-parameters.

    Returns:
        ``(estimated_value_grid, policy, visited_empty_spaces)``; the counter
        is reset to 0 unless the new cell was empty, in which case it grows by 1.
    """
    maze_dims = (len(reward_maze), len(reward_maze[0]))
    og_enemy_positions = [foe["pos"] for foe in enemies]

    def local_enemy_coords(cell):
        # Sorted local-window enemy positions, flattened to (x1, y1, x2, y2, ...).
        in_view = sorted(optimized_relative_positions(
            maze_dims, cell, og_enemy_positions, grid_size=grid_size
        ))
        return tuple(coord for pos in in_view for coord in pos)

    def cell_flags(cell):
        # (is_food, is_wall) for the maze cell.
        tile = reward_maze[cell[0]][cell[1]]
        return int(tile == 2), int(tile == 1)

    new_coords = local_enemy_coords(new_player_pos)
    old_coords = local_enemy_coords(prev_player_pos)
    old_food, old_wall = cell_flags(prev_player_pos)
    new_food, new_wall = cell_flags(new_player_pos)

    old_indices_s = (old_food, old_wall) + old_coords
    new_indices_s = (new_food, new_wall) + new_coords

    QS1A1 = estimated_value_grid.get(new_indices_s)

    if check_collision(new_player_pos, enemies):
        # Ran into an enemy
        R1 = -99999
        print("Collision Detected")
        visited_empty_spaces = 0
    elif new_wall:
        # Stepped onto a wall cell
        R1 = -10
        visited_empty_spaces = 0
    elif new_food:
        R1 = 20
        visited_empty_spaces = 0
    else:
        # Plain empty cell
        R1 = -10
        visited_empty_spaces += 1

    QSA = estimated_value_grid.get(old_indices_s)
    QSA = 0 if QSA is None else QSA
    QS1A1 = 0 if QS1A1 is None else QS1A1
    print(QSA,learning_rate,R1,discount,QS1A1)

    policy, estimated_value_grid = update_policy_estimated_vals(
        epsilon,
        old_indices_s,
        policy,
        prev_player_pos,
        estimated_value_grid,
        og_enemy_positions,
        reward_maze,
        (QSA, learning_rate, discount, R1),
    )
    return estimated_value_grid, policy, visited_empty_spaces
    

In [12]:
def initialize_positions(maze, num_enemies):
    """Pick random, distinct, non-wall start cells for the player and enemies.

    Args:
        maze: 2-D grid indexed as ``maze[x][y]``; cells equal to 1 are walls.
        num_enemies: How many enemies to place at most.

    Returns:
        ``(player_pos, enemies)`` where ``player_pos`` is an ``(x, y)`` tuple
        and ``enemies`` is a list of ``{"pos": (x, y), "target": None}`` dicts.
        Fewer than ``num_enemies`` are returned when the free cells run out.
    """
    n_rows, n_cols = len(maze), len(maze[0])

    # Every cell that is not a wall, in row-major order
    open_cells = []
    for r in range(n_rows):
        for c in range(n_cols):
            if maze[r][c] != 1:
                open_cells.append((r, c))

    # The player goes first and its cell is taken out of the pool
    player_pos = random.choice(open_cells)
    open_cells.remove(player_pos)

    enemies = []
    still_to_place = num_enemies
    while still_to_place > 0 and open_cells:
        spot = random.choice(open_cells)
        open_cells.remove(spot)
        enemies.append({"pos": spot, "target": None})
        still_to_place -= 1

    return player_pos, enemies


In [13]:
# Step budget ("timeout") for a game: used as the default `timeout` of
# Game.__init__ and re-read by Game.restart_game.
maxt=50

In [14]:
class Game:
    """Mutable state of a single maze game.

    Attributes:
        maze: Grid the game is played on.
        player_pos: Current ``(x, y)`` cell of the player.
        enemies: List of enemy dicts (each with a ``"pos"`` entry).
        score: Current score.
        timeout: Remaining step budget.
        state_action_log: Per-game log list, empty at start.
        running: ``True`` while the game is still in progress.
        visited_empty_spaces: Counter that starts at 0 and is reset on restart.
    """

    def __init__(self, maze, player_pos, enemies, score=0, timeout=maxt):
        # NOTE: the default `timeout=maxt` is evaluated once, when the class is
        # defined; later changes to `maxt` only reach restart_game().
        self.maze = maze
        self.player_pos = player_pos
        self.enemies = enemies
        self.score = score
        self.timeout = timeout
        self.state_action_log = []
        self.running = True  # Indicates if the game is still running
        # Previously only created by restart_game(), so a fresh Game had no
        # such attribute until its first restart.
        self.visited_empty_spaces = 0

    def restart_game(self):
        """Replace the maze and positions with new ones and reset all counters.

        Uses the notebook-level ``create_maze``, ``ROWS``, ``COLS``,
        ``num_enemies`` and the current value of ``maxt``.
        """
        self.maze = create_maze(ROWS, COLS)
        self.player_pos, self.enemies = initialize_positions(self.maze, num_enemies)
        self.score = 0
        self.timeout = maxt
        self.state_action_log = []
        self.running = True
        self.visited_empty_spaces = 0


In [52]:
def calculate_reward(maze, player_pos, action):
    """Evaluate a single step on the maze without wrap-around.

    Args:
        maze: 2-D array exposing ``.shape`` and ``maze[x, y]`` indexing.
        player_pos: ``(x, y)`` cell the player is on.
        action: ``(dx, dy)`` step to try.

    Returns:
        ``(reward, moved, position)``:
        * off the grid or onto a wall (1): ``(-4, False, <old position>)``
        * onto an enemy (4): ``(-100, False, <old position>)``
        * onto food (2): ``(1, True, <new position>)``
        * anything else: ``(0.3, True, <new position>)``
    """
    row, col = player_pos
    d_row, d_col = action
    stay_put = (row, col)
    target = (row + d_row, col + d_col)

    # Leaving the grid is treated like bumping into a wall
    if not (0 <= target[0] < maze.shape[0] and 0 <= target[1] < maze.shape[1]):
        return -4, False, stay_put

    tile = maze[target]

    if tile == 2:
        # Food: valid move
        return 1, True, target
    if tile == 1:
        # Wall: the player does not move
        return -4, False, stay_put
    if tile == 4:
        # Enemy: game-over penalty, the player does not move
        return -100, False, stay_put

    # Empty space: valid move with a small reward
    return 0.3, True, target


In [45]:
import torch
import torch.nn as nn

class SimpleModel(nn.Module):
    """Single linear layer mapping a flattened maze to one value per action.

    Args:
        input_size: Number of input features; must equal the number of cells
            in the flattened maze fed to ``forward``.
        num_actions: Number of outputs. Defaults to 10, the width that used
            to be hard-coded.
    """

    def __init__(self, input_size=64, num_actions=10):
        super().__init__()
        self.layer = nn.Linear(input_size, num_actions)

    def forward(self, x):
        """Return the layer output for ``x`` of shape ``(batch, input_size)``."""
        return self.layer(x)


In [40]:
# Build one game and stamp the player/enemy start cells into the maze.
# `create_maze`, `ROWS` and `COLS` are not defined in this notebook -
# presumably they come from `helper` (star import); verify.
maze = create_maze(ROWS, COLS)
player_pos, enemies = initialize_positions(maze, num_enemies)
game = Game(maze, player_pos, enemies, score=0, timeout=maxt)
# Cell value 3 marks the player's cell
x,y =player_pos
maze[x][y]=3
# Cell value 4 marks each enemy's cell (calculate_reward treats 4 as "enemy")
for enemy in enemies:
    x,y=(enemy['pos'])
    maze[x][y]=4   
# Rebind `maze` to an ndarray built from the grid; `game.maze` still refers
# to the object returned by create_maze.
maze=np.array(maze)

In [62]:
def compute_q_loss(model, target_model, maze, reward, action, next_maze, done, gamma=0.99):
    """Compute the one-step Q-learning (TD) loss for a single transition.

    The loss is ``MSE(Q(s)[a], reward + gamma * max_a' Q_target(s') * (1 - done))``.
    The target is built from a plain Python number, so no gradient flows
    through ``target_model``.

    Args:
        model: Network producing Q-values of shape ``[1, num_outputs]`` for a
            flattened state; needs at least 5 outputs.
        target_model: Network used for the bootstrap value of the next state.
        maze: Tensor holding the current state (flattened with ``view``).
        reward: Scalar reward received for the transition.
        action: ``(dx, dy)`` step as a tuple, list or array of two integers.
        next_maze: Tensor holding the next state.
        done: 1 if the episode ended with this transition, else 0.
        gamma: Discount factor.

    Returns:
        Scalar loss tensor attached to ``model``'s graph.

    Raises:
        KeyError: If ``action`` is not one of the known steps.
    """
    # Flatten current and next state maze tensors
    maze_flat = maze.view(1, -1)
    next_maze_flat = next_maze.view(1, -1)

    # Predict Q-values for the current state and the next state
    q_values = model(maze_flat)  # Shape: [1, num_outputs]
    with torch.no_grad():
        next_q_values = target_model(next_maze_flat)  # Shape: [1, num_outputs]

    # Map the (dx, dy) step to an output index. Indices 0-3 are unchanged so
    # existing checkpoints/callers keep their meaning; (-1, 0) was missing
    # and raised KeyError, so it is appended as index 4.
    action_map = {
        (0, 0): 0,   # no displacement
        (1, 0): 1,   # +1 along x
        (0, -1): 2,  # -1 along y
        (0, 1): 3,   # +1 along y
        (-1, 0): 4,  # -1 along x
    }
    # Normalise to a tuple of ints so lists / numpy rows are hashable too.
    action_key = tuple(int(step) for step in action)
    action_index = action_map[action_key]

    # Get the Q-value for the specific action (index tensor lives on the
    # same device as the Q-values).
    index = torch.tensor([[action_index]], dtype=torch.long, device=q_values.device)
    q_value = q_values.gather(1, index).squeeze(-1)  # Shape: [1]

    # Compute the target Q-value
    max_next_q_value = next_q_values.max(1)[0]  # Shape: [1]
    target_q_value = torch.tensor(
        [reward + gamma * max_next_q_value.item() * (1 - done)],
        dtype=torch.float32,
        device=q_values.device,
    )  # Shape: [1]

    # Compute Q-Loss
    loss_fn = nn.MSELoss()
    return loss_fn(q_value, target_q_value)


In [63]:
# Stand-in states for exercising compute_q_loss: two ROWS x COLS tensors of
# standard-normal noise. NOTE(review): this rebinds the global `maze`, so any
# maze grid built earlier under that name is no longer reachable through it.
maze = torch.randn(ROWS, COLS)  # Random example maze
next_maze = torch.randn(ROWS, COLS)  # Example next state maze


In [64]:
# Online network and a target network that starts as an exact copy of it.
model = SimpleModel(input_size=ROWS * COLS)
target_model = SimpleModel(input_size=ROWS * COLS)
target_model.load_state_dict(model.state_dict()) 
done=0
# (0, 0) is a zero step, so the player cell itself is evaluated.
action = (0,0)
# NOTE(review): `player_pos` is left over from an earlier cell, and `maze` is
# presumably the random tensor from the previous cell rather than a real
# maze - confirm this is intended.
reward, valid_move, new_player_pos = calculate_reward(maze, player_pos, action)

# Compute Q-Loss
loss = compute_q_loss(model, target_model, maze, reward, action, next_maze, done)


In [None]:
# Driver cell: builds `num_games` games and steps them in a pygame loop until
# the window is closed. `create_maze`, `move_entity`, `move_enemies`,
# `is_valid_move`, `draw_all_games`, `ROWS`, `COLS` and `FPS` are not defined
# in this notebook - presumably they come from `helper`; verify.
pygame.init()
print("RRUN")
# NOTE(review): this cell's code sits at module level, where the names are
# already global, so this `global` statement changes nothing.
global policy, maxt,estimated_value_grid
maxt = 50
num_games = 1  # Number of games to run
games = []
for _ in range(num_games):
    maze = create_maze(ROWS, COLS)
    player_pos, enemies = initialize_positions(maze, num_enemies)
    game = Game(maze, player_pos, enemies, score=0, timeout=maxt)
    # Stamp the start cells into the maze: 3 = player, 4 = enemy
    x,y =player_pos
    maze[x][y]=3
    for enemy in enemies:
        x,y=(enemy['pos'])
        maze[x][y]=4    
    games.append(game)

clock = pygame.time.Clock()
running = True
while running:
    # Handle events (e.g., quitting the game)
    for event in pygame.event.get():
        if event.type == pygame.QUIT:
            running = False

    # For each game instance, update its state
    all_games_ready_for_update = True
    for game in games:
        if not game.running:
            continue  # Skip if game is not running
        
        if game.timeout == 0:
            # Out of steps: nothing to do here, the restart happens in the
            # synchronisation step after this loop
            pass  # We'll handle synchronization after this loop
        else:
            all_games_ready_for_update = False
            current_state = game.player_pos
            # NOTE(review): the right-hand side of the next assignment is
            # missing, so this cell does not parse as written. The intended
            # action selection must be restored.
            action = 
            game.timeout -= 1
            new_player_pos = move_entity(game.player_pos, action)
            move_enemies(game.enemies, game.maze)
            # The player only moves when the target cell is accepted
            if is_valid_move(new_player_pos, game.maze):
                if game.maze[new_player_pos[0]][new_player_pos[1]] == 2:  # Collect food
                    game.score += 1
                    game.maze[new_player_pos[0]][new_player_pos[1]] = 0
                game.player_pos = new_player_pos
            
            # Check for collisions
            # NOTE(review): check_collision already scans every enemy, so on a
            # collision this loop prints the message once per enemy.
            for enemy in game.enemies:
                if enemy["pos"] == game.player_pos or check_collision(game.player_pos, game.enemies):
                    print("Game Over! Restarting...")
                    game.running = False  # Mark the game for update

            # Check if all food is collected
            if all(game.maze[row][col] != 2 for row in range(ROWS) for col in range(COLS)):
                print("You Win! Restarting...")
                game.running = False  # Mark the game for update

            

    # Synchronize updates: restart everything once every game has either
    # timed out or stopped running
    if all_games_ready_for_update or all(not game.running for game in games):
        # Run Next_Cycle for each game
        for game in games:
            game.restart_game()
            game.running = True  # Reset the running flag
        # Grow the step budget by 10 per restart cycle until it reaches 10000
        if maxt < 10000:
            maxt += 10

    # Draw all games
    draw_all_games(games)

    pygame.display.flip()
    clock.tick(FPS)
pygame.quit()


RRUN


TypeError: 'int' object is not subscriptable