<a href="https://colab.research.google.com/github/rzil-1/PixelMon/blob/main/Pixelmon1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import random
import json
from gym import Env
from gym.spaces import Discrete, Box

# ===============================
# Environment with Pokémon Switching
# ===============================
class EnhancedPokemonBattleEnv(Env):
    """Gym environment for a turn-based 4-vs-4 Pokémon battle with switching.

    The agent and a random-move opponent each receive four Pokémon sampled
    from the loaded data set; every Pokémon starts at ``MAX_HP``.

    Actions (``Discrete(8)``):
        0-3: use the active Pokémon's move at that index.
        4-7: switch to bench slot ``action - 4``.

    Observation (float32 vector of length 5):
        [agent_active_hp, opponent_active_hp, last_agent_damage,
         last_opponent_damage, agent_remaining]

    The data file is expected to map each Pokémon name to a dict with a
    ``"type"`` entry and a ``"moves"`` dict whose values each carry
    ``"damage"`` and ``"type"`` entries (these are the keys read below).
    """

    MAX_HP = 500      # starting HP of every Pokémon
    TEAM_SIZE = 4     # Pokémon per side; also the number of switch actions

    def __init__(self, pokemon_data_path="/content/pokemon_trial.json"):
        """Load the Pokémon data, define the spaces and start a first battle.

        Args:
            pokemon_data_path: Path of the JSON file with the Pokémon data.
        """
        super(EnhancedPokemonBattleEnv, self).__init__()

        # Load Pokémon data (each with type and moves)
        self.pokemon_data = self.load_pokemon_data(pokemon_data_path)

        # Type effectiveness chart (customize as needed):
        # attacking move type -> {defending Pokémon type: multiplier}.
        self.type_chart = {
            "Fire": {"Grass": 2.0, "Water": 0.5, "Fire": 0.5},
            "Water": {"Fire": 2.0, "Grass": 0.5, "Water": 0.5},
            "Grass": {"Water": 2.0, "Fire": 0.5, "Grass": 0.5},
            "Electric": {"Water": 2.0, "Grass": 0.5, "Electric": 0.5},
            "Normal": {},
            "Dark": {"Psychic": 2.0, "Fighting": 0.5},
            "Dragon": {"Dragon": 2.0}
            # Add other types as needed.
        }

        # Action space: 0-3 use a move; 4-7 switch to bench slot (0-indexed)
        self.action_space = Discrete(8)
        # Observation: [agent_active_hp, opponent_active_hp, last_agent_damage,
        #               last_opponent_damage, agent_remaining]
        # Bounds are created as float32 so Box does not have to down-cast
        # them (which logs a precision warning). agent_remaining starts at 0
        # because that is what _get_state reports once the agent has lost.
        # NOTE(review): the damage upper bound of 100 assumes
        # base damage * type multiplier never exceeds 100 - confirm against
        # the data file.
        self.observation_space = Box(
            low=np.array([0, 0, 0, 0, 0], dtype=np.float32),
            high=np.array(
                [self.MAX_HP, self.MAX_HP, 100, 100, self.TEAM_SIZE],
                dtype=np.float32
            ),
            dtype=np.float32
        )

        self.max_steps = 30
        self.reset()

    def load_pokemon_data(self, path):
        """Return the parsed JSON content of ``path``."""
        # Explicit encoding: the names may contain non-ASCII characters and
        # the platform default encoding is not guaranteed to be UTF-8.
        with open(path, 'r', encoding='utf-8') as f:
            return json.load(f)

    def get_type_multiplier(self, move_type, target_type):
        """Return the damage multiplier of ``move_type`` against ``target_type``.

        Any pairing that is not listed in ``self.type_chart`` is neutral (1.0).
        """
        return self.type_chart.get(move_type, {}).get(target_type, 1.0)

    def _build_team(self):
        """Sample ``TEAM_SIZE`` distinct Pokémon and return them at full HP."""
        keys = random.sample(list(self.pokemon_data.keys()), self.TEAM_SIZE)
        return [
            {
                "name": key,
                "type": self.pokemon_data[key]["type"],
                "moves": list(self.pokemon_data[key]["moves"].values()),
                "hp": self.MAX_HP
            }
            for key in keys
        ]

    def reset(self):
        """Start a new battle with fresh random teams.

        Returns:
            The initial observation (see the class docstring for its layout).
        """
        self.steps = 0
        self.last_agent_damage = 0
        self.last_opponent_damage = 0

        # Agent team, then its randomly chosen active Pokémon.
        self.agent_bench = self._build_team()
        self.agent_active_idx = random.randint(0, self.TEAM_SIZE - 1)

        # Opponent team, then its randomly chosen active Pokémon.
        self.opponent_bench = self._build_team()
        self.opponent_active_idx = random.randint(0, self.TEAM_SIZE - 1)

        return self._get_state()

    def _get_state(self):
        """Build the float32 observation vector for the current battle state."""
        # Count how many agent Pokémon are still alive (hp > 0)
        agent_remaining = sum(1 for p in self.agent_bench if p["hp"] > 0)
        return np.array([
            self.agent_bench[self.agent_active_idx]["hp"],
            self.opponent_bench[self.opponent_active_idx]["hp"],
            self.last_agent_damage,
            self.last_opponent_damage,
            agent_remaining
        ], dtype=np.float32)

    def step(self, action):
        """Play one full turn: the agent acts, then the opponent answers.

        Args:
            action: Integer in [0, 7]; 0-3 selects a move, 4-7 a bench slot.

        Returns:
            Tuple ``(observation, reward, done, info)`` where ``info`` is an
            empty dict. ``done`` is True when either side has no Pokémon
            left or ``max_steps`` turns have been played.
        """
        self.steps += 1
        reward = 0

        # --- Agent Turn ---
        if action >= 4:
            # Switching action; no attack is performed this turn.
            new_idx = action - 4
            if new_idx == self.agent_active_idx:
                reward -= 2  # penalty for wasted switch
            elif self.agent_bench[new_idx]["hp"] > 0:
                self.agent_active_idx = new_idx
                reward -= 1  # small penalty to discourage unnecessary switches
            else:
                # The chosen Pokémon is fainted: no effect (and penalty)
                reward -= 5
            agent_damage = 0
        else:
            # Move action: use selected move from active Pokémon.
            active_agent = self.agent_bench[self.agent_active_idx]
            if active_agent["hp"] <= 0:
                # A fainted Pokémon cannot attack.
                agent_damage = 0
            else:
                # Fall back to the first move if the index is out of range.
                move_idx = action if action < len(active_agent["moves"]) else 0
                move = active_agent["moves"][move_idx]
                target = self.opponent_bench[self.opponent_active_idx]
                agent_damage = move["damage"] * self.get_type_multiplier(
                    move["type"], target["type"])
                # Clamp at 0 so HP stays inside the observation bounds; the
                # reward below still uses the full damage value.
                target["hp"] = max(0, target["hp"] - agent_damage)

        self.last_agent_damage = agent_damage

        # --- Opponent Turn ---
        # If the opponent's active Pokémon is fainted, force a switch.
        if self.opponent_bench[self.opponent_active_idx]["hp"] <= 0:
            available = [i for i, p in enumerate(self.opponent_bench) if p["hp"] > 0]
            if available:
                self.opponent_active_idx = random.choice(available)
        # Now, if an opponent is available, it attacks with a random move.
        opp_active = self.opponent_bench[self.opponent_active_idx]
        if opp_active["hp"] > 0:
            opp_move = random.choice(opp_active["moves"])
            defender = self.agent_bench[self.agent_active_idx]
            opponent_damage = opp_move["damage"] * self.get_type_multiplier(
                opp_move["type"], defender["type"])
            defender["hp"] = max(0, defender["hp"] - opponent_damage)
        else:
            opponent_damage = 0

        self.last_opponent_damage = opponent_damage

        # If the agent's active Pokémon fainted due to the opponent's attack,
        # force a switch to a random survivor.
        if self.agent_bench[self.agent_active_idx]["hp"] <= 0:
            available = [i for i, p in enumerate(self.agent_bench) if p["hp"] > 0]
            if available:
                self.agent_active_idx = random.choice(available)

        agent_out = all(p["hp"] <= 0 for p in self.agent_bench)
        opponent_out = all(p["hp"] <= 0 for p in self.opponent_bench)
        timed_out = self.steps >= self.max_steps

        # --- Reward Calculation ---
        # Base reward: reward damage dealt
        reward += 5 * agent_damage
        # Bonus if agent's active HP is higher than opponent's
        hp_diff = (self.agent_bench[self.agent_active_idx]["hp"]
                   - self.opponent_bench[self.opponent_active_idx]["hp"])
        reward += 0.1 * (hp_diff / self.MAX_HP)
        # Penalize if no damage is done when an attack was attempted.
        if action < 4 and agent_damage == 0:
            reward -= 5

        # Outcome rewards:
        # If all opponent Pokémon have fainted, win bonus.
        if opponent_out:
            hp_bonus = self.agent_bench[self.agent_active_idx]["hp"] / self.MAX_HP
            reward += 50 + (20 * hp_bonus)
        # If all agent Pokémon fainted, lose penalty.
        if agent_out:
            reward -= 50
        # If max steps reached, reward or penalty based on HP advantage.
        if timed_out:
            agent_total = sum(p["hp"] for p in self.agent_bench if p["hp"] > 0)
            opp_total = sum(p["hp"] for p in self.opponent_bench if p["hp"] > 0)
            if agent_total > opp_total:
                reward += 10 * (agent_total - opp_total) / self.MAX_HP
            else:
                reward -= 10

        # Determine if battle is over.
        done = agent_out or opponent_out or timed_out

        return self._get_state(), reward, done, {}

    def render(self, mode='human'):
        """Print the step counter and both active Pokémon with their HP."""
        print(f"Step: {self.steps}")
        print(f"Agent Active: {self.agent_bench[self.agent_active_idx]['name']} [{self.agent_bench[self.agent_active_idx]['hp']:.1f} HP]")
        print(f"Opponent Active: {self.opponent_bench[self.opponent_active_idx]['name']} [{self.opponent_bench[self.opponent_active_idx]['hp']:.1f} HP]")
        print("-" * 40)


# ===============================
# State Discretization Function (for Q-Learning)
# ===============================
def discretize_state(state):
    """Map a continuous observation to a tuple of Q-table indices.

    State vector:
      [agent_active_hp, opponent_active_hp, last_agent_damage,
       last_opponent_damage, agent_remaining]

    HP and damage are binned with fixed edges and every resulting index is
    clamped to the range 0..3, so the top bin collects all HP values >= 300
    and all damage values >= 60. agent_remaining (an integer count) is
    shifted down by one and clamped to 0..3 as well.

    Args:
        state: Indexable sequence of five numbers in the order above.

    Returns:
        Tuple of five ints ``(hp1, hp2, dmg1, dmg2, remain)``, each in 0..3.
    """
    # Define bins for HP and damage
    hp_bins = np.array([0, 100, 200, 300, 400, 500])
    damage_bins = np.array([0, 20, 40, 60, 80, 100])

    def _bin_index(value, bins):
        # np.digitize returns 0 for values below the first edge, which would
        # become index -1 and silently wrap to the *last* table entry when
        # used for array indexing; clamp it to the first bin instead.
        raw = int(np.digitize(value, bins)) - 1
        return min(max(raw, 0), len(bins) - 3)

    hp1_idx = _bin_index(state[0], hp_bins)        # agent active HP
    hp2_idx = _bin_index(state[1], hp_bins)        # opponent active HP
    dmg1_idx = _bin_index(state[2], damage_bins)   # last agent damage
    dmg2_idx = _bin_index(state[3], damage_bins)   # last opponent damage

    # Agent remaining: subtract 1 to get an index, then clamp to 0..3.
    remain_idx = min(max(int(state[4]) - 1, 0), 3)

    return (hp1_idx, hp2_idx, dmg1_idx, dmg2_idx, remain_idx)


# ===============================
# Q-Learning Training Parameters
# ===============================
# Sizes of the discretized state dimensions; each one has valid indices 0 to 3.
hp_bins_count = 4
damage_bins_count = 4
remain_bins_count = 4   # agent_remaining (1-4 mapped to 0-3)

# Action count: 4 moves followed by 4 switching actions.
num_actions = 8

# Exploration schedule: start value, floor, and per-episode decay factor.
epsilon = 1.0
epsilon_min = 0.05
epsilon_decay = 0.9997

# Q-learning step size and discount factor.
alpha = 0.1
gamma = 0.95

# Zero-initialised Q-table indexed as (hp1, hp2, dmg1, dmg2, remain, action).
q_table = np.zeros(
    (hp_bins_count,) * 2
    + (damage_bins_count,) * 2
    + (remain_bins_count, num_actions)
)

# Train a tabular Q-learning agent with an epsilon-greedy behaviour policy.
env = EnhancedPokemonBattleEnv("/content/pokemon_trial.json")
num_episodes = 25000
recent_rewards = []  # episode returns of (at most) the last 100 episodes

for episode in range(num_episodes):
    state = env.reset()
    total_reward = 0
    done = False

    while not done:
        state_idx = discretize_state(state)

        # Epsilon-greedy action selection.
        if random.uniform(0, 1) < epsilon:
            action = env.action_space.sample()
        else:
            action = np.argmax(q_table[state_idx])

        next_state, reward, done, _ = env.step(action)
        next_state_idx = discretize_state(next_state)

        # Q-learning update.
        old_value = q_table[state_idx][action]
        # No future return exists after the final transition of an episode,
        # so the target must not bootstrap from the next state's values.
        next_max = 0.0 if done else np.max(q_table[next_state_idx])
        new_value = (1 - alpha) * old_value + alpha * (reward + gamma * next_max)
        q_table[state_idx][action] = new_value

        state = next_state
        total_reward += reward

    # Keep a sliding window of the most recent 100 episode returns.
    recent_rewards.append(total_reward)
    if len(recent_rewards) > 100:
        recent_rewards.pop(0)

    # Decay exploration once per episode, never going below the floor.
    epsilon = max(epsilon_min, epsilon * epsilon_decay)

    if (episode + 1) % 1000 == 0:
        avg_reward = np.mean(recent_rewards)
        print(f"Episode: {episode + 1}, Avg Reward: {avg_reward:.2f}, Epsilon: {epsilon:.3f}")

# ===============================
# Test the Trained Agent
# ===============================
# Evaluate the greedy policy (no exploration) and report the win rate.
test_episodes = 10
test_wins = 0

for episode in range(test_episodes):
    state = env.reset()
    done = False
    total_reward = 0

    while not done:
        state_idx = discretize_state(state)
        action = np.argmax(q_table[state_idx])
        state, reward, done, _ = env.step(action)
        total_reward += reward
        env.render()

    # A win means every opponent Pokémon has fainted. The sign of the last
    # step's reward is not a usable signal: it includes the per-step damage
    # reward, so it is positive after almost any attacking move.
    if all(p["hp"] <= 0 for p in env.opponent_bench):
        test_wins += 1
    print(f"Test Episode {episode + 1} finished with total reward: {total_reward:.2f}")

print(f"Test Win Rate: {(test_wins/test_episodes)*100:.2f}%")

  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


Episode: 1000, Avg Reward: 4309.16, Epsilon: 0.741
Episode: 2000, Avg Reward: 5038.09, Epsilon: 0.549
Episode: 3000, Avg Reward: 5561.64, Epsilon: 0.407
Episode: 4000, Avg Reward: 6014.54, Epsilon: 0.301
Episode: 5000, Avg Reward: 6219.21, Epsilon: 0.223
Episode: 6000, Avg Reward: 6537.82, Epsilon: 0.165
Episode: 7000, Avg Reward: 6689.78, Epsilon: 0.122
Episode: 8000, Avg Reward: 6953.07, Epsilon: 0.091
Episode: 9000, Avg Reward: 6773.59, Epsilon: 0.067
Episode: 10000, Avg Reward: 6998.76, Epsilon: 0.050
Episode: 11000, Avg Reward: 6799.52, Epsilon: 0.050
Episode: 12000, Avg Reward: 6799.23, Epsilon: 0.050
Episode: 13000, Avg Reward: 7293.25, Epsilon: 0.050
Episode: 14000, Avg Reward: 6850.44, Epsilon: 0.050
Episode: 15000, Avg Reward: 6812.73, Epsilon: 0.050
Episode: 16000, Avg Reward: 6813.41, Epsilon: 0.050
Episode: 17000, Avg Reward: 7019.67, Epsilon: 0.050
Episode: 18000, Avg Reward: 6933.10, Epsilon: 0.050
Episode: 19000, Avg Reward: 6729.47, Epsilon: 0.050
Episode: 20000, Avg R

NameError: name 'e' is not defined