In [9]:
import os
import shutil
import numpy as np
from typing import Any, List, Dict

# --- Copy collision meshes ---
# Stage the mesh directory in the working directory. An existing destination is
# left untouched, so re-running this cell does not copy again.
source_path = "/kaggle/input/collmesh/collision_meshes"
destination_path = "./collision_meshes"

meshes_already_present = os.path.exists(destination_path)
if meshes_already_present:
    print("Collision meshes already exist in the correct location.")
else:
    print(f"Copying collision meshes from '{source_path}' to '{destination_path}'...")
    shutil.copytree(source_path, destination_path)
    print("Collision meshes copied successfully.")

# --- modern_coyote.py content ---
modern_coyote_code = r"""
import numpy as np
from typing import Any, List, Tuple
from rlgym_sim.utils import common_values
from rlgym_sim.utils.gamestates import PlayerData, GameState
from rlgym_sim.utils.obs_builders import ObsBuilder
from rlgym_sim.utils.action_parsers import ActionParser
from gym.spaces import Discrete, Box

# -------------------- ACTION PARSER --------------------
class ModernCoyoteAction(ActionParser):
    '''Discrete action parser backed by a fixed table of 8-value controller rows.

    Every policy output is an integer index into the table, so the row order
    produced by make_lookup_table defines the meaning of each action id.
    Row layout: [throttle, steer, pitch, yaw, roll, jump, boost, handbrake].
    '''

    def __init__(self, version=None):
        super().__init__()
        self._lookup_table = self.make_lookup_table(version)

    def get_action_space(self) -> Discrete:
        '''Return a Discrete space with one entry per lookup-table row.'''
        table_size = len(self._lookup_table)
        return Discrete(table_size)

    def parse_actions(self, actions: Any, state: GameState) -> np.ndarray:
        '''Translate per-agent action indices into lookup-table rows.

        An element that is a non-scalar ndarray contributes its first entry as
        the index; anything else is converted with int() directly.
        '''
        controls = []
        for entry in actions:
            if isinstance(entry, np.ndarray) and entry.ndim > 0:
                entry = entry[0]
            controls.append(self._lookup_table[int(entry)])
        return np.array(controls)

    @staticmethod
    def make_lookup_table(version=None):
        '''Build the float32 action table for the given version.

        Only None and "Normal" are supported; any other value raises
        NotImplementedError.
        '''
        if version is not None and version != "Normal":
            raise NotImplementedError(f"Version '{version}' not implemented.")

        table = []

        # Ground rows: steer is mirrored into the yaw slot, and boost is only
        # offered together with full throttle.
        for throttle in (-1, 0, 0.5, 1):
            for steer in (-1, -0.5, 0, 0.5, 1):
                for boost in (0, 1):
                    if boost == 1 and throttle != 1:
                        continue
                    for handbrake in (0, 1):
                        table.append([throttle or boost, steer, 0, steer, 0, 0, boost, handbrake])

        # Aerial rows: boost doubles as throttle, yaw doubles as steer.
        tilt_levels = (-1, -0.75, -0.5, 0, 0.5, 0.75, 1)
        for pitch in tilt_levels:
            for yaw in tilt_levels:
                for roll in (-1, 0, 1):
                    for jump in (0, 1):
                        if jump == 1 and yaw != 0:
                            continue
                        if pitch == roll == jump == 0:
                            continue
                        # Handbrake accompanies every jump that has a direction input.
                        handbrake = jump == 1 and (pitch != 0 or yaw != 0 or roll != 0)
                        for boost in (0, 1):
                            table.append([boost, yaw, pitch, yaw, roll, jump, boost, handbrake])

        # One extra hand-written row: steer +1 with roll -1 and jump held.
        table.append([0, 1, 0, 0, -1, 1, 0, 0])
        return np.array(table, dtype=np.float32)

class ModernCoyoteObsBuilder(ObsBuilder):
    '''Flat observation builder: own car, ball, other cars, boost-pad timers.

    Layout of the returned vector:
      * own-car block   (SELF_OBS_SIZE values, previous action included)
      * ball block      (BALL_OBS_SIZE values)
      * team_size - 1 ally blocks, then team_size opponent blocks
        (OTHER_OBS_SIZE values each, real cars sorted by distance, zero padding last)
      * one respawn timer per boost pad, divided by 10

    Orange players are observed through the inverted car/ball data and a
    reversed pad-timer array.
    '''

    # Feature counts of the blocks assembled in build_obs; keep in sync with it.
    SELF_OBS_SIZE = 36   # 28 car features + 8 previous-action values
    BALL_OBS_SIZE = 11
    OTHER_OBS_SIZE = 35

    def __init__(self, team_size=1, tick_skip=8):
        super().__init__()
        self.team_size = team_size
        self.tick_skip = tick_skip
        self.POS_STD, self.VEL_STD, self.ANG_STD = 2300, 2300, 5.5
        self.boost_pad_timers = None
        self.demo_timers = None
        self.prev_boost_pads = None
        self.is_big_pad = np.array([loc[2] > 72 for loc in common_values.BOOST_LOCATIONS])
        self.car_id_to_idx = {}
        # Padding must be exactly as long as a real other-car block. It used to
        # be 36 long, which produced a mis-sized observation whenever it was used.
        self.dummy_player_obs = [0] * self.OTHER_OBS_SIZE

    def get_obs_space(self) -> Box:
        '''Return the observation space sized for the configured team_size.

        For team_size=1 this is the original 116-value shape.
        '''
        other_slots = 2 * self.team_size - 1
        obs_size = (self.SELF_OBS_SIZE + self.BALL_OBS_SIZE
                    + self.OTHER_OBS_SIZE * other_slots
                    + len(common_values.BOOST_LOCATIONS))
        return Box(low=-np.inf, high=np.inf, shape=(obs_size,), dtype=np.float32)

    def reset(self, initial_state: GameState):
        '''Clear the pad and demo timers and rebuild the car_id -> index map.'''
        self.prev_boost_pads = np.copy(initial_state.boost_pads)
        self.boost_pad_timers = np.zeros(len(common_values.BOOST_LOCATIONS))
        self.demo_timers = np.zeros(len(initial_state.players))
        self.car_id_to_idx = {p.car_id: i for i, p in enumerate(initial_state.players)}

    def pre_step(self, state: GameState):
        '''Advance the pad and demo timers by one environment step.'''
        time_delta = self.tick_skip / 120.0
        # A pad that was active last step and is inactive now was just picked up.
        newly_collected = (self.prev_boost_pads == 1) & (state.boost_pads == 0)
        self.boost_pad_timers[newly_collected & self.is_big_pad] = 10.0
        self.boost_pad_timers[newly_collected & ~self.is_big_pad] = 4.0
        self.boost_pad_timers = np.maximum(0, self.boost_pad_timers - time_delta)
        self.prev_boost_pads = np.copy(state.boost_pads)
        self.car_id_to_idx = {p.car_id: i for i, p in enumerate(state.players)}
        for i, player in enumerate(state.players):
            self.demo_timers[i] = 3.0 if player.is_demoed else max(0.0, self.demo_timers[i] - time_delta)

    def build_obs(self, player: PlayerData, state: GameState, previous_action: np.ndarray) -> np.ndarray:
        '''Assemble the float32 observation vector for one player.'''
        inverted = player.team_num == common_values.ORANGE_TEAM
        if inverted:
            own_car_physics, ball, boost_timers = player.inverted_car_data, state.inverted_ball, self.boost_pad_timers[::-1]
        else:
            own_car_physics, ball, boost_timers = player.car_data, state.ball, self.boost_pad_timers

        player_idx = self.car_id_to_idx.get(player.car_id)
        own_demo_timer = self.demo_timers[player_idx] / 3.0 if player_idx is not None else 0.0
        pos_diff = ball.position - own_car_physics.position
        vel_diff = ball.linear_velocity - own_car_physics.linear_velocity

        player_obs = [
            *own_car_physics.position / self.POS_STD,
            *own_car_physics.linear_velocity / self.VEL_STD,
            *own_car_physics.angular_velocity / self.ANG_STD,
            *pos_diff / self.POS_STD,
            *vel_diff / self.VEL_STD,
            *own_car_physics.forward(),
            *own_car_physics.up(),
            np.linalg.norm(own_car_physics.linear_velocity) / self.VEL_STD,
            player.boost_amount,
            float(player.on_ground),
            float(player.has_flip),
            float(player.is_demoed),
            float(player.on_ground or player.has_jump),
            own_demo_timer,
            *previous_action,
        ]
        ball_obs = [
            *ball.position / self.POS_STD,
            *ball.linear_velocity / self.VEL_STD,
            *ball.angular_velocity / self.ANG_STD,
            np.linalg.norm(ball.linear_velocity) / self.VEL_STD,
            float(ball.position[2] <= 100),
        ]

        allies, opponents = [], []
        for other_player in state.players:
            if other_player.car_id == player.car_id:
                continue
            target_list, is_teammate = (allies, 1) if other_player.team_num == player.team_num else (opponents, 0)
            other_car_physics = other_player.inverted_car_data if inverted else other_player.car_data
            other_idx = self.car_id_to_idx.get(other_player.car_id)
            other_demo_timer = self.demo_timers[other_idx] / 3.0 if other_idx is not None else 0.0
            diff = other_car_physics.position - own_car_physics.position
            target_list.append([
                *other_car_physics.position / self.POS_STD,
                *other_car_physics.linear_velocity / self.VEL_STD,
                *other_car_physics.angular_velocity / self.ANG_STD,
                *diff / self.POS_STD,
                *(other_car_physics.linear_velocity - own_car_physics.linear_velocity) / self.VEL_STD,
                *(ball.position - other_car_physics.position) / self.POS_STD,
                *(ball.linear_velocity - other_car_physics.linear_velocity) / self.VEL_STD,
                *other_car_physics.forward(),
                *other_car_physics.up(),
                other_player.boost_amount,
                float(other_player.on_ground),
                float(other_player.has_flip),
                float(other_player.is_demoed),
                float(other_player.on_ground or other_player.has_jump),
                np.linalg.norm(diff) / self.POS_STD,   # distance: slot [-3], used as the sort key
                is_teammate,
                other_demo_timer,
            ])

        # Sort the real cars nearest-first BEFORE padding. Padding has a distance
        # of 0 and would otherwise be ordered in front of every real car.
        allies.sort(key=lambda car_obs: car_obs[-3])
        opponents.sort(key=lambda car_obs: car_obs[-3])

        max_allies, max_opps = self.team_size - 1, self.team_size
        while len(allies) < max_allies:
            allies.append(self.dummy_player_obs)
        while len(opponents) < max_opps:
            opponents.append(self.dummy_player_obs)

        final_obs = player_obs + ball_obs
        for car_obs in (allies[:max_allies] + opponents[:max_opps]):
            final_obs.extend(car_obs)
        final_obs.extend(boost_timers / 10.0)
        return np.asarray(final_obs, dtype=np.float32)



"""

# Write the module source defined above to disk so later cells can import it.
with open("modern_coyote.py", "w") as module_file:
    module_file.write(modern_coyote_code)

print("Updated 'modern_coyote.py' with proper flip support written successfully!")


Copying collision meshes from '/kaggle/input/collmesh/collision_meshes' to './collision_meshes'...
Collision meshes copied successfully.
Updated 'modern_coyote.py' with proper flip support written successfully!


In [58]:
agent_env_code = r"""
import os
import numpy as np
from rlgym_sim import make
from rlgym_sim.utils.gamestates import GameState, PlayerData
from rlgym_sim.utils.reward_functions import RewardFunction
from rlgym_sim.utils.state_setters import StateSetter, DefaultState, StateWrapper
from rlgym_sim.utils.terminal_conditions.common_conditions import GoalScoredCondition, TimeoutCondition
from rlgym_sim.utils import common_values, math as rlgym_math

# --- Import from the file we just created ---
from modern_coyote import ModernCoyoteObsBuilder, ModernCoyoteAction

import random

# ==================================================================
# === REWARD FUNCTIONS ===
# ==================================================================

class CombinedReward(RewardFunction):
    '''Weighted sum of several reward functions.

    reward_functions: sequence of RewardFunction instances.
    reward_weights:   one weight per function; None (or an empty sequence)
                      means every function gets weight 1.
    Raises ValueError when the two sequences differ in length.
    '''

    def __init__(self, reward_functions, reward_weights=None):
        super().__init__()
        self.reward_functions = reward_functions
        # The old form was `reward_weights or ...`, which raises on a numpy
        # array of weights (ambiguous truth value). Check explicitly instead.
        if reward_weights is None or len(reward_weights) == 0:
            reward_weights = np.ones(len(reward_functions))
        self.reward_weights = reward_weights
        if len(self.reward_functions) != len(self.reward_weights):
            raise ValueError("Reward functions and weights must have the same length.")

    def reset(self, initial_state: GameState):
        '''Reset every wrapped reward function.'''
        for func in self.reward_functions:
            func.reset(initial_state)

    def pre_step(self, state: GameState):
        '''Forward the per-step hook to every wrapped reward function.'''
        for func in self.reward_functions:
            func.pre_step(state)

    def get_reward(self, player: PlayerData, state: GameState, previous_action: np.ndarray) -> float:
        '''Return the weighted sum of the wrapped rewards for this player.'''
        rewards = [func.get_reward(player, state, previous_action) for func in self.reward_functions]
        return float(np.dot(self.reward_weights, rewards))

    def get_final_reward(self, player: PlayerData, state: GameState, previous_action: np.ndarray) -> float:
        '''Weighted sum of the wrapped final-step rewards.

        Forwarded explicitly so a child that overrides get_final_reward is
        honoured. Children that do not override it return their normal reward.
        '''
        rewards = [func.get_final_reward(player, state, previous_action) for func in self.reward_functions]
        return float(np.dot(self.reward_weights, rewards))

# ----------------------------
# AdvancedDribbleReward
# ----------------------------
class AdvancedDribbleReward(RewardFunction):
    '''Shaped reward for dribbling and flicks, plus event bonuses and penalties.

    pre_step runs before get_reward within a step, so pre_step must not
    overwrite last_state with the current state: every delta term (goals,
    demos, saves, distance change, boost use) would then compare the state
    with itself and never fire. The previous state is therefore rotated
    through _current_state.

    NOTE(review): with the delta terms now live, dist_to_ball_exp multiplies a
    raw distance change in game units and may dominate the other terms.
    Re-check the weights before training.
    '''

    def __init__(self):
        super().__init__()
        self.last_player_touch = {}
        self.flick_in_progress = {}
        self.last_state: GameState = None   # state of the previous step
        self._current_state = None          # state handed to the latest pre_step
        self.weights = {
            "goal_reward": 15.0, "demo_reward": 15.0, "save_reward": 3.0,
            "touch_reward": 0.01, "dribble_proximity_reward": 0.01,
            "dribble_sustain_reward": 0.1, "flick_initiation_reward": 4.0,
            "dist_to_ball_exp": 0.15, "closer_to_ball_reward": 0.3,
            "turtling_penalty": -2.0, "wrong_way_penalty": -0.1,
            "boost_waste_penalty": -0.4
        }

    def reset(self, initial_state: GameState):
        '''Start a new episode: both state slots point at the initial state.'''
        self.last_state = initial_state
        self._current_state = initial_state
        for player in initial_state.players:
            self.last_player_touch[player.car_id] = False
            self.flick_in_progress[player.car_id] = False

    def pre_step(self, state: GameState):
        '''Rotate states so last_state is the previous step during get_reward.'''
        self.last_state = self._current_state
        self._current_state = state

    def get_reward(self, player: PlayerData, state: GameState, previous_action: np.ndarray) -> float:
        '''Return this step's reward for the player (0.0 when no opponent or no history).'''
        if self.last_state is None:
            return 0.0
        reward = 0.0
        opponent = next((p for p in state.players if p.team_num != player.team_num), None)
        if opponent is None:
            return 0.0

        last_player = next((p for p in self.last_state.players if p.car_id == player.car_id), None)
        if last_player is None:
            return 0.0

        # --- Event bonuses (counter increased since the previous step) ---
        if player.match_goals > last_player.match_goals:
            reward += self.weights["goal_reward"]
        if player.match_demolishes > last_player.match_demolishes:
            reward += self.weights["demo_reward"]
        if player.match_saves > last_player.match_saves:
            reward += self.weights["save_reward"]
        if player.ball_touched and not self.last_player_touch.get(player.car_id, False):
            reward += self.weights["touch_reward"]
        self.last_player_touch[player.car_id] = player.ball_touched

        # --- Dribble / flick shaping ---
        player_to_ball = state.ball.position - player.car_data.position
        dist_to_ball = np.linalg.norm(player_to_ball)
        is_on_hood = 100 < state.ball.position[2] < 350
        is_close = dist_to_ball < 150
        is_dribbling = is_on_hood and is_close
        if is_dribbling:
            reward += self.weights["dribble_proximity_reward"] * (1 - (dist_to_ball / 150))
            reward += self.weights["dribble_sustain_reward"]
            ball_vel = state.ball.linear_velocity
            if ball_vel[2] > 400 and np.linalg.norm(ball_vel[:2]) > 300 and not self.flick_in_progress.get(player.car_id, False):
                reward += self.weights["flick_initiation_reward"]
                self.flick_in_progress[player.car_id] = True
        if not is_dribbling or state.ball.linear_velocity[2] < 0:
            self.flick_in_progress[player.car_id] = False

        # --- Approach shaping: positive when the car got closer to the ball ---
        last_dist_to_ball = np.linalg.norm(self.last_state.ball.position - last_player.car_data.position)
        reward += self.weights["dist_to_ball_exp"] * (last_dist_to_ball - dist_to_ball)

        opp_dist_to_ball = np.linalg.norm(state.ball.position - opponent.car_data.position)
        if dist_to_ball < opp_dist_to_ball:
            reward += self.weights["closer_to_ball_reward"]
        if player.on_ground and player.car_data.up()[2] < -0.8:
            reward += self.weights["turtling_penalty"]

        # --- Wrong-way penalty ---
        orange_goal_back_np = np.array(common_values.ORANGE_GOAL_BACK)
        blue_goal_back_np = np.array(common_values.BLUE_GOAL_BACK)
        own_goal_dir = -orange_goal_back_np if player.team_num == common_values.BLUE_TEAM else -blue_goal_back_np
        # Normalise so the 0.5 threshold is a cosine. Against the raw vector,
        # almost any heading with a slight own-goal component passed the test.
        own_goal_dir = own_goal_dir / (np.linalg.norm(own_goal_dir) + 1e-6)
        car_forward = player.car_data.forward()
        if is_dribbling and np.dot(car_forward, own_goal_dir) > 0.5:
            reward += self.weights["wrong_way_penalty"]

        # Any drop in boost since the previous step is penalised.
        if player.boost_amount < last_player.boost_amount:
            reward += self.weights["boost_waste_penalty"]
        return reward

# ----------------------------
# ZeroSumReward
# ----------------------------
class ZeroSumReward(RewardFunction):
    '''Per-step team reward made zero-sum against the opposing team's mean.

    All rewards are computed once per step in pre_step and looked up by car_id
    in get_reward. Several constructor weights (acel_ball_w, boost_gain_w,
    jump_touch_w, cons_air_touches_w, velocity_bg_w, team_spirit) are stored
    but not used by the code in this class.
    '''

    def __init__(self, goal_w=10, velocity_bg_w=0.09, acel_ball_w=0.1,
                 boost_gain_w=1, jump_touch_w=3, cons_air_touches_w=8,
                 demo_w=6, kickoff_w=0.1, tick_skip=8, team_spirit=1):
        super().__init__()
        self.goal_w = goal_w
        self.velocity_bg_w = velocity_bg_w * (tick_skip / 8)
        self.acel_ball_w = acel_ball_w
        self.boost_gain_w = boost_gain_w
        self.boost_spend_w = 1.5 * self.boost_gain_w * ((33.3334 / (120 / tick_skip)) * 0.01)
        self.jump_touch_w = jump_touch_w
        self.cons_air_touches_w = cons_air_touches_w
        self.demo_w = demo_w
        self.kickoff_w = kickoff_w * (tick_skip / 8)
        self.rewards = None
        self.last_state: GameState = None
        # Timeouts are expressed in environment steps (8 s and 5 s of game time).
        self.touch_timeout = 8 * 120 // tick_skip
        self.kickoff_timeout = 5 * 120 // tick_skip
        self.kickoff_timer = 0
        self.closest_reset_blue = -1
        self.closest_reset_orange = -1
        self.blue_touch_timer = self.touch_timeout + 1
        self.orange_touch_timer = self.touch_timeout + 1
        self.blue_toucher = None
        self.orange_toucher = None
        self.team_spirit = team_spirit
        self.n = 0
        self.cons_touches = 0

    def pre_step(self, state: GameState):
        '''Compute every player's reward for this step and cache it in self.rewards.'''
        if self.last_state is None:
            self.last_state = state

        self.n = 0
        self.blue_touch_timer += 1
        self.orange_touch_timer += 1
        self.kickoff_timer += 1

        player_rewards = np.zeros(len(state.players))

        # Match previous-step players by car_id, as the sibling rewards do,
        # rather than by list position.
        last_by_id = {p.car_id: p for p in self.last_state.players}
        # Magnitude of the ball's velocity change this step; identical for all players.
        vel_difference = np.linalg.norm(self.last_state.ball.linear_velocity - state.ball.linear_velocity)

        for i, player in enumerate(state.players):
            last = last_by_id.get(player.car_id, player)

            # Ball touches
            if player.ball_touched:
                if player.team_num == common_values.BLUE_TEAM:
                    self.blue_toucher = i
                    self.blue_touch_timer = 0
                else:
                    self.orange_toucher = i
                    self.orange_touch_timer = 0

                # Small reward for touching
                player_rewards[i] += 0.01

                # Reward for ball speed change
                player_rewards[i] += vel_difference / 4600.0

            # Reward for ball moving toward opponent goal
            enemy_goal = np.asarray(common_values.ORANGE_GOAL_BACK if player.team_num == common_values.BLUE_TEAM
                                    else common_values.BLUE_GOAL_BACK)
            ball_dir = enemy_goal - state.ball.position
            ball_speed_toward_goal = max(0, np.dot(state.ball.linear_velocity, ball_dir) / (np.linalg.norm(ball_dir) + 1e-6))
            player_rewards[i] += 0.01 * ball_speed_toward_goal / common_values.BALL_MAX_SPEED

            # Demo reward
            if player.match_demolishes > last.match_demolishes:
                player_rewards[i] += self.demo_w

            # Goal reward with ball speed factor
            if player.match_goals > last.match_goals:
                goal_speed = np.linalg.norm(self.last_state.ball.linear_velocity)
                player_rewards[i] += self.goal_w * (goal_speed / common_values.CAR_MAX_SPEED)

            # Kickoff reward: granted to every player while the kickoff timer is running.
            if self.kickoff_timer < self.kickoff_timeout:
                player_rewards[i] += self.kickoff_w

        # ------------------------
        # ZERO-SUM ADJUSTMENT
        # ------------------------
        # Split by team_num instead of assuming the first half of the list is
        # blue, and treat an empty team as mean 0 so it cannot inject NaN.
        is_blue = np.array([p.team_num == common_values.BLUE_TEAM for p in state.players], dtype=bool)
        is_orange = ~is_blue
        blue_mean = player_rewards[is_blue].mean() if is_blue.any() else 0.0
        orange_mean = player_rewards[is_orange].mean() if is_orange.any() else 0.0
        player_rewards[is_blue] -= orange_mean
        player_rewards[is_orange] -= blue_mean

        self.rewards = player_rewards
        self.last_state = state

    def get_reward(self, player: PlayerData, state: GameState, previous_action: np.ndarray) -> float:
        '''Return the cached reward for this player, or 0.0 when none is available.'''
        if self.rewards is None:
            return 0.0
        player_idx = next((i for i, p in enumerate(state.players) if p.car_id == player.car_id), None)
        if player_idx is None:
            return 0.0
        return float(self.rewards[player_idx])

    def reset(self, initial_state: GameState):
        '''Reset the cached rewards, timers and touch bookkeeping for a new episode.'''
        self.last_state = initial_state
        self.rewards = np.zeros(len(initial_state.players))
        self.kickoff_timer = 0
        self.blue_touch_timer = self.orange_touch_timer = self.touch_timeout + 1
        self.blue_toucher = self.orange_toucher = None
        self.cons_touches = 0


# ----------------------------
# LowBoostVelocityReward
# ----------------------------
class LowBoostVelocityReward(RewardFunction):
    '''Reward driving toward a stationary ball while almost out of boost.

    Pays the car's speed along the car-to-ball direction, as a fraction of
    CAR_MAX_SPEED and clamped at zero. It is non-zero only while the ball speed
    is below 1 and the player's boost_amount is below 0.02.
    '''

    def __init__(self):
        super().__init__()

    def reset(self, initial_state: GameState):
        '''Nothing to reset: this reward keeps no state.'''
        return None

    def get_reward(self, player: PlayerData, state: GameState, previous_action: np.ndarray) -> float:
        ball_speed = np.linalg.norm(state.ball.linear_velocity)
        if not (ball_speed < 1 and player.boost_amount < 0.02):
            return 0.0

        offset = state.ball.position - player.car_data.position
        gap = np.linalg.norm(offset)
        # Too close to define a direction; also avoids dividing by ~0.
        if gap < 1.0:
            return 0.0

        approach_speed = np.dot(player.car_data.linear_velocity, offset / gap)
        return max(0, approach_speed / common_values.CAR_MAX_SPEED)


# ----------------------------
# DodgeReward (ignores normal jumps)
# ----------------------------
class DodgeReward(RewardFunction):
    '''Pay `weight` on the step where an airborne player's has_flip turns False.

    The previous PlayerData object of each car is kept by reference (not
    copied) and compared with the current one.
    NOTE(review): this assumes the environment hands out fresh PlayerData
    objects every step - confirm.
    '''

    def __init__(self, weight=1.0):
        super().__init__()
        self.weight = weight
        self.last_player_states = {}

    def reset(self, initial_state: GameState):
        '''Seed the per-car history with the initial players.'''
        self.last_player_states = {p.car_id: p for p in initial_state.players}

    def get_reward(self, player: PlayerData, state: GameState, previous_action: np.ndarray) -> float:
        previous = self.last_player_states.get(player.car_id)
        # Record the current data for the next step before any early return.
        self.last_player_states[player.car_id] = player

        if previous is None:
            return 0.0

        # has_flip went True -> False while the car is off the ground.
        flip_consumed = previous.has_flip and not player.has_flip and not player.on_ground
        if flip_consumed:
            return self.weight
        return 0.0


# ----------------------------
# FastShotReward
# ----------------------------
class FastShotReward(RewardFunction):
    '''Reward a fresh ball touch by the ball's resulting speed toward the enemy goal.

    pre_step runs before get_reward within a step, so pre_step must not
    overwrite last_state with the current state: the "touched now but not last
    step" test would then compare the state with itself and never fire. The
    previous state is therefore rotated through _current_state.
    '''

    def __init__(self, weight=1.0):
        super().__init__()
        self.weight = weight
        self.last_state: GameState = None   # state of the previous step
        self._current_state = None          # state handed to the latest pre_step

    def reset(self, initial_state: GameState):
        '''Start a new episode: both state slots point at the initial state.'''
        self.last_state = initial_state
        self._current_state = initial_state

    def pre_step(self, state: GameState):
        '''Rotate states so last_state is the previous step during get_reward.'''
        self.last_state = self._current_state
        self._current_state = state

    def get_reward(self, player: PlayerData, state: GameState, previous_action: np.ndarray) -> float:
        '''Return weight * (ball speed toward enemy goal / BALL_MAX_SPEED) on a new touch, else 0.0.'''
        if self.last_state is None:
            return 0.0

        last_player = next((p for p in self.last_state.players if p.car_id == player.car_id), None)
        if last_player is None:
            return 0.0

        # Only the first step of a touch counts.
        if not (player.ball_touched and not last_player.ball_touched):
            return 0.0

        enemy_goal = np.array(common_values.ORANGE_GOAL_BACK if player.team_num == common_values.BLUE_TEAM
                              else common_values.BLUE_GOAL_BACK)
        ball_dir = enemy_goal - state.ball.position
        ball_vel = state.ball.linear_velocity
        # Component of the ball velocity along the ball-to-goal direction, clamped at 0.
        ball_speed_toward_goal = max(0, np.dot(ball_vel, ball_dir) / (np.linalg.norm(ball_dir) + 1e-6))

        return 0.0 + self.weight * ball_speed_toward_goal / common_values.BALL_MAX_SPEED

# ----------------------------
# FastGoalReward
# ----------------------------
class FastGoalReward(RewardFunction):
    '''Goal bonus that shrinks linearly with the number of steps played.

    max_reward: bonus for a goal scored at step 0.
    match_time: number of environment steps (pre_step calls) after which the
                bonus reaches 0.

    pre_step runs before get_reward within a step, so pre_step must not
    overwrite last_state with the current state: the goal-count comparison
    would then compare the state with itself and never fire. The previous
    state is therefore rotated through _current_state.
    '''

    def __init__(self, max_reward=20.0, match_time=900):  # match_time counted in environment steps
        super().__init__()
        self.max_reward = max_reward
        self.match_time = match_time
        self.start_tick = None              # steps elapsed since reset (name kept for compatibility)
        self.last_state: GameState = None   # state of the previous step
        self._current_state = None          # state handed to the latest pre_step

    def reset(self, initial_state: GameState):
        '''Start a new episode: zero the step counter and seed both state slots.'''
        self.start_tick = 0
        self.last_state = initial_state
        self._current_state = initial_state

    def pre_step(self, state: GameState):
        '''Advance the step counter and rotate the stored states.'''
        if self.start_tick is None:
            self.start_tick = 0
        else:
            self.start_tick += 1  # one increment per environment step
        self.last_state = self._current_state
        self._current_state = state

    def get_reward(self, player: PlayerData, state: GameState, previous_action: np.ndarray) -> float:
        '''Return the time-scaled bonus on the step this player's goal count rises, else 0.0.'''
        if self.last_state is None:
            return 0.0

        last_player = next((p for p in self.last_state.players if p.car_id == player.car_id), None)
        if last_player is None:
            return 0.0

        reward = 0.0
        # Reward only when a new goal is credited to this player.
        if player.match_goals > last_player.match_goals:
            # Earlier goals keep more of the bonus; never negative.
            time_factor = max(0, 1.0 - self.start_tick / self.match_time)
            reward += self.max_reward * time_factor

        return reward


# ==================================================================
# === STATE SETTERS ===
# ==================================================================

class ReplayStateSetter(StateSetter):
    '''State setter that samples starting states from .npy replay dumps.

    states_dir: directory scanned (non-recursively) for .npy files.
    verbose:    when True, log the sampled file on every reset. Off by default
                because resets happen once per episode in every worker.

    Each file is expected to hold a sequence of dict-like states with a 'ball'
    entry and a 'cars' mapping, as read in reset.
    When no states are available the setter falls back to DefaultState instead
    of leaving the state wrapper untouched.

    SECURITY: files are read with allow_pickle=True, which can execute
    arbitrary code. Only point this at trusted data.
    '''

    def __init__(self, states_dir: str, verbose: bool = False):
        super().__init__()
        self.states_dir = states_dir
        self.verbose = verbose
        self.file_paths = []
        self.file_lengths = []
        self.total_states = 0
        self._fallback = DefaultState()
        self._warned_empty = False
        if not os.path.isdir(self.states_dir):
            print(f"[ReplayStateSetter] Directory {self.states_dir} does not exist!")
            return
        print(f"[ReplayStateSetter] Loading .npy files from {self.states_dir}...")
        # Sorted so the file order (and thus seeded sampling) is reproducible.
        for filename in sorted(os.listdir(self.states_dir)):
            if filename.endswith(".npy"):
                filepath = os.path.join(self.states_dir, filename)
                states = np.load(filepath, allow_pickle=True)
                if len(states) > 0:
                    self.file_paths.append(filepath)
                    self.file_lengths.append(len(states))
                    self.total_states += len(states)
                    print(f"[ReplayStateSetter] Loaded {len(states)} states from {filename}")
        print(f"[ReplayStateSetter] Finished loading. Total states: {self.total_states}")

    def reset(self, state_wrapper: StateWrapper):
        '''Write one randomly sampled replay state into state_wrapper.'''
        if self.total_states == 0:
            if not self._warned_empty:
                print("[ReplayStateSetter] No states available to set.")
                self._warned_empty = True
            self._fallback.reset(state_wrapper)
            return

        # Pick a file proportionally to its size, then a state uniformly inside
        # it, so every stored state is equally likely overall.
        chosen_path = random.choices(self.file_paths, weights=self.file_lengths, k=1)[0]
        states_array = np.load(chosen_path, allow_pickle=True)
        if self.verbose:
            print(f"[ReplayStateSetter] Sampling from file: {os.path.basename(chosen_path)} ({len(states_array)} states)")

        # A single uniform draw. The earlier "sample 300, then choose one" had
        # the same distribution but copied the whole array into a list first.
        chosen_state = states_array[random.randrange(len(states_array))]
        if self.verbose:
            print(f"[ReplayStateSetter] Chosen state index set in environment.")

        ball_state = chosen_state['ball']
        state_wrapper.ball.set_pos(*ball_state['position'])
        state_wrapper.ball.set_lin_vel(*ball_state['linear_velocity'])
        state_wrapper.ball.set_ang_vel(*ball_state['angular_velocity'])

        # Cars are paired with wrappers purely by position.
        # NOTE(review): nothing here checks that teams line up - confirm against the data.
        car_states = list(chosen_state['cars'].values())
        for i in range(min(len(car_states), len(state_wrapper.cars))):
            car_wrapper, car_state = state_wrapper.cars[i], car_states[i]
            car_wrapper.set_pos(*car_state['position'])
            euler_angles = rlgym_math.quat_to_euler(car_state['quaternion'])
            # Pitch and roll are negated when handed to set_rot.
            # NOTE(review): sign convention not verifiable from this file.
            car_wrapper.set_rot(pitch=-euler_angles[0], yaw=euler_angles[1], roll=-euler_angles[2])
            car_wrapper.set_lin_vel(*car_state['linear_velocity'])
            car_wrapper.set_ang_vel(*car_state['angular_velocity'])
            # Stored boost is divided by 100 before being assigned.
            car_wrapper.boost = car_state['boost_amount'] / 100.0




# ==================================================================
# === ENVIRONMENT BUILDER ===
# ==================================================================
def build_env():
    '''Create the 1v1 rlgym_sim environment used for training.

    Wires together the combined reward, the terminal conditions, the
    ModernCoyote observation builder and action parser, and the replay-based
    state setter, then returns the environment.
    '''
    TICK_SKIP = 8

    reward_fn = CombinedReward(
        reward_functions=(
            AdvancedDribbleReward(),
            ZeroSumReward(tick_skip=TICK_SKIP),
            LowBoostVelocityReward(),
            DodgeReward(),
            FastShotReward(weight=1.5),
            FastGoalReward(),
        ),
        reward_weights=(1.0, 1.0, 1.0, 1.0, 1.0, 1.0)
    )

    terminal_conditions = [TimeoutCondition(900), GoalScoredCondition()]
    obs_builder = ModernCoyoteObsBuilder(tick_skip=TICK_SKIP, team_size=1)
    action_parser = ModernCoyoteAction()

    replay_states_path = "/kaggle/input/npy123"
    state_setter = ReplayStateSetter(replay_states_path)

    env = make(
        team_size=1, spawn_opponents=True, tick_skip=TICK_SKIP,
        reward_fn=reward_fn, terminal_conditions=terminal_conditions,
        obs_builder=obs_builder, action_parser=action_parser, state_setter=state_setter
    )

    # Optional visualiser hook. The render override used to reference `rsv`
    # while its import was commented out, so calling render() raised NameError.
    # Install the override only when the client package can be imported.
    try:
        import rocketsimvis_rlgym_sim_client as rsv
    except ImportError:
        rsv = None
    if rsv is not None:
        type(env).render = lambda self: rsv.send_state_to_rocketsimvis(self._prev_state)
    return env

"""

# Write the environment module source defined above to disk so the training cell can import it.
with open("agent_env.py", "w") as module_file:
    module_file.write(agent_env_code)

print("Updated 'agent_env.py' with your custom rewards and state setters successfully!")


Updated 'agent_env.py' with your custom rewards and state setters successfully!


In [59]:

#
# CELL 3: Run the Main Training Script
#
import json
import os
import random
import shutil
import time
import sys
from typing import Union, Tuple
from collections import OrderedDict
import numpy as np
import torch
from rlgym_ppo.batched_agents import BatchedAgentManager
from rlgym_ppo.ppo import ExperienceBuffer, PPOLearner
from rlgym_ppo.util import WelfordRunningStat, reporting, torch_functions

# --- Add current directory to path to find our custom files ---
sys.path.insert(0, os.path.abspath('.'))
from agent_env import build_env

# --- FULLY MODIFIED LEARNER FOR KAGGLE & PRE-TRAINING ---
class KaggleLearner(object):  # Renamed to avoid conflicts
    """PPO training driver for notebook use, with optional pre-trained weights.

    The learner owns a ``BatchedAgentManager`` (environment worker processes),
    a ``PPOLearner`` (policy and value networks) and an ``ExperienceBuffer``.
    ``learn()`` runs the collect/update loop, writes periodic checkpoints and
    always attempts a final checkpoint plus cleanup on the way out.

    Per the notes in this notebook, most of the logic is copied from the
    upstream rlgym-ppo ``learner.py``; the additions are the pre-trained
    policy loading and the notebook-oriented learning loop.
    """

    def __init__(
            # fmt: off
            self, env_create_function, metrics_logger=None, n_proc: int = 8, min_inference_size: int = 80,
            render: bool = False, render_delay: float = 0, timestep_limit: int = 5_000_000_000,
            exp_buffer_size: int = 100000, ts_per_iteration: int = 50000, standardize_returns: bool = True,
            standardize_obs: bool = True, max_returns_per_stats_increment: int = 150,
            steps_per_obs_stats_increment: int = 5, policy_layer_sizes: Tuple[int, ...] = (256, 256, 256),
            critic_layer_sizes: Tuple[int, ...] = (256, 256, 256), continuous_var_range: Tuple[float, ...] = (0.1, 1.0),
            ppo_epochs: int = 10, ppo_batch_size: int = 50000, ppo_minibatch_size: Union[int, None] = None,
            ppo_ent_coef: float = 0.005, ppo_clip_range: float = 0.2, gae_lambda: float = 0.95,
            gae_gamma: float = 0.99, policy_lr: float = 3e-4, critic_lr: float = 3e-4, log_to_wandb: bool = False,
            load_wandb: bool = True, wandb_run = None, wandb_project_name: Union[str, None] = None,
            wandb_group_name: Union[str, None] = None, wandb_run_name: Union[str, None] = None,
            checkpoints_save_folder: Union[str, None] = None, add_unix_timestamp: bool = True,
            checkpoint_load_folder: Union[str, None] = "latest", pretrained_policy_path: Union[str, None] = None,
            save_every_ts: int = 1_000_000, instance_launch_delay: Union[float, None] = None,
            random_seed: int = 123, n_checkpoints_to_keep: int = 5, shm_buffer_size: int = 8192, device: str = "auto"):
        """Spawn the worker processes, build the PPO learner and restore state.

        Args:
            env_create_function: Zero-argument callable handed to the agent
                manager as ``build_env_fn``; must not be None.
            metrics_logger: Optional object providing ``collect_metrics`` and
                ``report_metrics``.
            checkpoints_save_folder: Base folder for checkpoints. When
                ``add_unix_timestamp`` is true, ``-<time_ns>`` is appended.
            checkpoint_load_folder: Checkpoint folder to restore, ``"latest"``
                to auto-discover one, or None to skip restoring.
            pretrained_policy_path: Optional file with policy weights that are
                loaded into the policy before any checkpoint is restored.
            standardize_obs: Only recorded in ``self.config``; the agent
                manager is always created with ``standardize_obs=False``.
            log_to_wandb, wandb_*: Accepted for interface compatibility; this
                version only prints that wandb is unsupported.

        The remaining arguments are forwarded unchanged to
        ``BatchedAgentManager``, ``ExperienceBuffer`` and ``PPOLearner`` or
        stored on the instance for use by the training loop.
        """
        assert (env_create_function is not None), "MUST PROVIDE A FUNCTION TO CREATE RLGYM FUNCTIONS"

        if checkpoints_save_folder is None:
            checkpoints_save_folder = os.path.join("data", "checkpoints", "rlgym-ppo-run")
        self.add_unix_timestamp = add_unix_timestamp
        if add_unix_timestamp:
            checkpoints_save_folder = f"{checkpoints_save_folder}-{time.time_ns()}"

        # Seed every RNG source used by the training stack.
        torch.manual_seed(random_seed)
        np.random.seed(random_seed)
        random.seed(random_seed)

        self.n_checkpoints_to_keep = n_checkpoints_to_keep
        self.checkpoints_save_folder = checkpoints_save_folder
        self.max_returns_per_stats_increment = max_returns_per_stats_increment
        self.metrics_logger = metrics_logger
        self.standardize_returns = standardize_returns
        self.save_every_ts = save_every_ts
        self.ts_since_last_save = 0

        if device in {"auto", "gpu"} and torch.cuda.is_available():
            self.device = "cuda:0"
            torch.backends.cudnn.benchmark = True
        elif device == "auto" and not torch.cuda.is_available():
            self.device = "cpu"
        else:
            self.device = device
        print(f"Using device {self.device}")

        self.exp_buffer_size = exp_buffer_size
        self.timestep_limit = timestep_limit
        self.ts_per_epoch = ts_per_iteration
        self.gae_lambda = gae_lambda
        self.gae_gamma = gae_gamma
        self.return_stats = WelfordRunningStat(1)
        self.epoch = 0
        self.experience_buffer = ExperienceBuffer(self.exp_buffer_size, seed=random_seed, device="cpu")

        print("Initializing processes...")
        collect_metrics_fn = None if metrics_logger is None else self.metrics_logger.collect_metrics
        # NOTE(review): standardize_obs is hard-coded to False here regardless
        # of the constructor argument - presumably to match the pre-trained
        # policy's inputs; confirm before changing.
        self.agent = BatchedAgentManager(
            None,
            min_inference_size=min_inference_size,
            seed=random_seed,
            standardize_obs=False,
            steps_per_obs_stats_increment=steps_per_obs_stats_increment,
        )
        obs_space_size, act_space_size, action_space_type = self.agent.init_processes(
            n_processes=n_proc,
            build_env_fn=env_create_function,
            collect_metrics_fn=collect_metrics_fn,
            spawn_delay=instance_launch_delay,
            render=render,
            render_delay=render_delay,
            shm_buffer_size=shm_buffer_size,
        )
        obs_space_size = np.prod(obs_space_size)

        print("Initializing PPO...")
        if ppo_minibatch_size is None:
            ppo_minibatch_size = ppo_batch_size
        self.ppo_learner = PPOLearner(
            obs_space_size,
            act_space_size,
            device=self.device,
            batch_size=ppo_batch_size,
            mini_batch_size=ppo_minibatch_size,
            n_epochs=ppo_epochs,
            continuous_var_range=continuous_var_range,
            policy_type=action_space_type,
            policy_layer_sizes=policy_layer_sizes,
            critic_layer_sizes=critic_layer_sizes,
            policy_lr=policy_lr,
            critic_lr=critic_lr,
            clip_range=ppo_clip_range,
            ent_coef=ppo_ent_coef,
        )

        if pretrained_policy_path is not None and os.path.exists(pretrained_policy_path):
            print(f"Loading pre-trained model weights from: {pretrained_policy_path}")
            # SECURITY: torch.load unpickles the file; only point this at
            # weight files from a trusted source.
            checkpoint = torch.load(pretrained_policy_path, map_location=self.device)
            if 'model_state_dict' in checkpoint:
                pretrained_dict = checkpoint['model_state_dict']
                print("Checkpoint dictionary found, extracting 'model_state_dict'.")
            else:
                pretrained_dict = checkpoint
                print("Raw state_dict found.")
            pretrained_dict = self._normalize_state_dict_keys(pretrained_dict)
            self.ppo_learner.policy.load_state_dict(pretrained_dict)
            print("Successfully loaded pre-trained policy weights.")

        self.agent.policy = self.ppo_learner.policy
        self.config = {
            "n_proc": n_proc,
            "min_inference_size": min_inference_size,
            "timestep_limit": timestep_limit,
            "exp_buffer_size": exp_buffer_size,
            "ts_per_iteration": ts_per_iteration,
            "standardize_returns": standardize_returns,
            "standardize_obs": standardize_obs,
            "policy_layer_sizes": policy_layer_sizes,
            "critic_layer_sizes": critic_layer_sizes,
            "ppo_epochs": ppo_epochs,
            "ppo_batch_size": ppo_batch_size,
            "ppo_minibatch_size": ppo_minibatch_size,
            "ppo_ent_coef": ppo_ent_coef,
            "ppo_clip_range": ppo_clip_range,
            "gae_lambda": gae_lambda,
            "gae_gamma": gae_gamma,
            "policy_lr": policy_lr,
            "critic_lr": critic_lr,
            "shm_buffer_size": shm_buffer_size,
        }
        self.wandb_run = wandb_run

        # A restored checkpoint is loaded after (and therefore on top of) any
        # pre-trained policy weights loaded above.
        wandb_loaded = checkpoint_load_folder is not None and self.load(checkpoint_load_folder, load_wandb, policy_lr, critic_lr)
        if log_to_wandb and self.wandb_run is None and not wandb_loaded:
            print("Wandb not supported in this Kaggle version.")
        print("Learner successfully initialized!")

    @staticmethod
    def _normalize_state_dict_keys(state_dict):
        """Return ``state_dict`` with keys renamed to match the policy network.

        A leading ``module.`` (DataParallel wrapper) is removed and a leading
        ``layers.`` is replaced by ``model.``. Only keys that actually carry a
        prefix are rewritten; all other keys pass through untouched.
        """
        module_prefix = "module."
        if any(key.startswith(module_prefix) for key in state_dict.keys()):
            print("DataParallel 'module.' prefix detected. Stripping prefix...")
            state_dict = OrderedDict(
                (key[len(module_prefix):] if key.startswith(module_prefix) else key, value)
                for key, value in state_dict.items()
            )

        layers_prefix = "layers."
        if any(key.startswith(layers_prefix) for key in state_dict.keys()):
            print("Renaming 'layers.' keys to 'model.' for compatibility...")
            state_dict = OrderedDict(
                ("model." + key[len(layers_prefix):] if key.startswith(layers_prefix) else key, value)
                for key, value in state_dict.items()
            )
        return state_dict

    # Notebook-safe learning loop
    def _learn(self):
        """Alternate experience collection and PPO updates until the limit.

        Each iteration collects ``ts_per_epoch`` timesteps, pushes them through
        GAE into the experience buffer, runs one PPO learning pass, reports
        timing/throughput metrics and saves a checkpoint once at least
        ``save_every_ts`` timesteps were gathered since the last save.
        """
        print("Starting training loop. This will run until the timestep limit is reached or the notebook is stopped.\n")
        while self.agent.cumulative_timesteps < self.timestep_limit:
            epoch_start = time.perf_counter()
            report = {}

            experience, collected_metrics, steps_collected, collection_time = self.agent.collect_timesteps(self.ts_per_epoch)
            if self.metrics_logger is not None:
                self.metrics_logger.report_metrics(collected_metrics, self.wandb_run, self.agent.cumulative_timesteps)

            self.add_new_experience(experience)
            ppo_report = self.ppo_learner.learn(self.experience_buffer)

            epoch_stop = time.perf_counter()
            epoch_time = epoch_stop - epoch_start

            report.update(ppo_report)
            if self.epoch < 1:
                report["Value Function Loss"] = np.nan
            report["Cumulative Timesteps"] = self.agent.cumulative_timesteps
            report["Total Iteration Time"] = epoch_time
            report["Timesteps Collected"] = steps_collected
            report["Timestep Collection Time"] = collection_time
            report["Timestep Consumption Time"] = epoch_time - collection_time
            report["Collected Steps per Second"] = steps_collected / collection_time
            report["Overall Steps per Second"] = steps_collected / epoch_time
            self.ts_since_last_save += steps_collected

            if self.agent.average_reward is not None:
                report["Policy Reward"] = self.agent.average_reward
            else:
                report["Policy Reward"] = np.nan

            reporting.report_metrics(loggable_metrics=report, debug_metrics=None, wandb_run=self.wandb_run)
            report.clear()
            ppo_report.clear()

            if self.ts_since_last_save >= self.save_every_ts:
                self.save(self.agent.cumulative_timesteps)
                self.ts_since_last_save = 0
            self.epoch += 1

    def learn(self):
        """Run the training loop, then always save a checkpoint and clean up.

        Ordinary exceptions from the loop are printed and swallowed. A
        KeyboardInterrupt (e.g. stopping the notebook cell) is not swallowed,
        but the final checkpoint and ``cleanup()`` still run before it
        propagates, so an interrupted run does not lose its progress or leave
        the learner's resources unreleased.
        """
        try:
            self._learn()
        except Exception:
            import traceback
            print("\n\nLEARNING LOOP ENCOUNTERED AN ERROR\n")
            traceback.print_exc()
        finally:
            try:
                self.save(self.agent.cumulative_timesteps)
            except Exception:
                print("FAILED TO SAVE ON EXIT")
            finally:
                self.cleanup()

    @torch.no_grad()
    def add_new_experience(self, experience):
        """Compute value targets/advantages for a batch and store it.

        Args:
            experience: 7-tuple ``(states, actions, log_probs, rewards,
                next_states, dones, truncated)`` as returned by the agent
                manager's ``collect_timesteps``.
        """
        states, actions, log_probs, rewards, next_states, dones, truncated = experience

        # Evaluate the critic on every state plus the final next-state so GAE
        # has a bootstrap value for the last transition.
        val_inp = np.zeros(shape=(states.shape[0] + 1, states.shape[1]))
        val_inp[:-1] = states
        val_inp[-1] = next_states[-1]
        val_preds = self.ppo_learner.value_net(val_inp).cpu().flatten().tolist()
        torch.cuda.empty_cache()

        ret_std = self.return_stats.std[0] if self.standardize_returns else None
        value_targets, advantages, returns = torch_functions.compute_gae(
            rewards, dones, truncated, val_preds,
            gamma=self.gae_gamma, lmbda=self.gae_lambda, return_std=ret_std,
        )

        if self.standardize_returns:
            n_to_increment = min(self.max_returns_per_stats_increment, len(returns))
            self.return_stats.increment(returns[:n_to_increment], n_to_increment)

        self.experience_buffer.submit_experience(
            states, actions, log_probs, rewards, next_states, dones, truncated, value_targets, advantages
        )

    def save(self, cumulative_timesteps):
        """Write a checkpoint folder named after ``cumulative_timesteps``.

        Older numerically named checkpoint folders beyond
        ``n_checkpoints_to_keep`` are deleted first. Entries in the save
        folder whose names are not plain digits are ignored rather than
        aborting the save.
        """
        folder_path = os.path.join(self.checkpoints_save_folder, str(cumulative_timesteps))
        os.makedirs(folder_path, exist_ok=True)
        print(f"Saving checkpoint {cumulative_timesteps}...")

        # Only digit-named entries are checkpoints; sort them oldest-first.
        checkpoint_names = sorted(
            (name for name in os.listdir(self.checkpoints_save_folder) if name.isdigit()),
            key=int,
        )
        if len(checkpoint_names) > self.n_checkpoints_to_keep:
            for stale_name in checkpoint_names[:-self.n_checkpoints_to_keep]:
                shutil.rmtree(os.path.join(self.checkpoints_save_folder, stale_name))

        # Re-create the target in case pruning removed it (it can be among the
        # oldest when training resumed from an earlier timestep count).
        os.makedirs(folder_path, exist_ok=True)
        self.ppo_learner.save_to(folder_path)

        book_keeping_vars = {
            "cumulative_timesteps": self.agent.cumulative_timesteps,
            "cumulative_model_updates": self.ppo_learner.cumulative_model_updates,
            "policy_average_reward": self.agent.average_reward,
            "epoch": self.epoch,
            "ts_since_last_save": self.ts_since_last_save,
            "reward_running_stats": self.return_stats.to_json(),
        }
        if self.agent.standardize_obs:
            book_keeping_vars["obs_running_stats"] = self.agent.obs_stats.to_json()
        if self.wandb_run is not None:
            book_keeping_vars["wandb_run_id"] = self.wandb_run.id

        with open(os.path.join(folder_path, "BOOK_KEEPING_VARS.json"), "w") as f:
            json.dump(book_keeping_vars, f, indent=4)
        print(f"Checkpoint {cumulative_timesteps} saved!\n")

    def _find_latest_checkpoint(self):
        """Locate the newest checkpoint folder for ``load("latest")``.

        With timestamped save folders, the sibling run folder with the highest
        numeric ``-<timestamp>`` suffix is chosen; otherwise the save folder
        itself is used. Within it, the digit-named subfolder with the highest
        value wins.

        Returns:
            Path of the checkpoint folder, or None when nothing suitable exists.
        """
        save_folder = self.checkpoints_save_folder
        if save_folder is None:
            return None

        if self.add_unix_timestamp:
            base_save_folder = save_folder[:save_folder.rfind('-')]
            save_path = os.path.dirname(base_save_folder)
            if not os.path.exists(save_path):
                return None

            highest_timestamp = -1
            load_base_path = None
            for filename in os.listdir(save_path):
                full_path = os.path.join(save_path, filename)
                if not os.path.isdir(full_path):
                    continue
                if not full_path.startswith(base_save_folder):
                    continue
                # The timestamp is the part of the folder NAME after its last
                # dash; the index must come from `filename`, not the full path.
                dash_idx = filename.rfind('-')
                if dash_idx < 0:
                    continue
                unix_time_str = filename[dash_idx + 1:]
                if unix_time_str.isdigit() and int(unix_time_str) > highest_timestamp:
                    highest_timestamp = int(unix_time_str)
                    load_base_path = full_path
            if load_base_path is None:
                return None
        else:
            if not os.path.exists(save_folder):
                return None
            load_base_path = save_folder

        highest_ts = -1
        for filename in os.listdir(load_base_path):
            if not os.path.isdir(os.path.join(load_base_path, filename)):
                continue
            if not filename.isdigit():
                continue
            highest_ts = max(highest_ts, int(filename))
        if highest_ts == -1:
            return None

        folder_path = os.path.join(load_base_path, str(highest_ts))
        print(f"Auto-load path: {folder_path}")
        return folder_path

    def load(self, folder_path, load_wandb, new_policy_lr=None, new_critic_lr=None):
        """Restore networks and bookkeeping state from a checkpoint folder.

        Args:
            folder_path: Checkpoint folder, or ``"latest"`` to auto-discover.
            load_wandb: Unused here; kept for interface compatibility.
            new_policy_lr: Unused here; kept for interface compatibility.
            new_critic_lr: Unused here; kept for interface compatibility.

        Returns:
            Always False (no wandb run is ever resumed by this version).

        Raises:
            AssertionError: If an explicit ``folder_path`` does not exist.
        """
        if folder_path == "latest":
            folder_path = self._find_latest_checkpoint()
            if folder_path is None:
                return False

        assert os.path.exists(folder_path), f"UNABLE TO LOCATE FOLDER {folder_path}"
        print(f"Loading from checkpoint at {folder_path}")
        self.ppo_learner.load_from(folder_path)

        with open(os.path.join(folder_path, "BOOK_KEEPING_VARS.json"), "r") as f:
            book_keeping_vars = dict(json.load(f))

        self.agent.cumulative_timesteps = book_keeping_vars["cumulative_timesteps"]
        self.agent.average_reward = book_keeping_vars["policy_average_reward"]
        self.ppo_learner.cumulative_model_updates = book_keeping_vars["cumulative_model_updates"]
        # Restored exactly once (it used to be loaded twice back-to-back).
        self.return_stats.from_json(book_keeping_vars["reward_running_stats"])
        if self.agent.standardize_obs and "obs_running_stats" in book_keeping_vars.keys():
            self.agent.obs_stats = WelfordRunningStat(1)
            self.agent.obs_stats.from_json(book_keeping_vars["obs_running_stats"])
        self.epoch = book_keeping_vars["epoch"]

        print("Checkpoint loaded!")
        return False

    def cleanup(self):
        """Finish the wandb run (if any), stop the agents and clear the buffer."""
        if self.wandb_run is not None:
            self.wandb_run.finish()
        if isinstance(self.agent, BatchedAgentManager):
            self.agent.cleanup()
        self.experience_buffer.clear()


if __name__ == '__main__':
    # --- HYPERPARAMETERS ---
    # Every entry is forwarded as a keyword argument to KaggleLearner.
    HPARAMS = dict(
        n_proc=4,                      # number of environment worker processes
        timestep_limit=500_000_000,    # effectively "run until stopped"
        ts_per_iteration=40_000,
        exp_buffer_size=80_000,
        ppo_batch_size=10_000,
        ppo_epochs=1,
        policy_lr=1e-5,                # deliberately low policy learning rate
        critic_lr=1e-6,                # critic rate, one order of magnitude below the policy's
        gae_gamma=0.995,
        ppo_ent_coef=0.01,
        ppo_clip_range=0.1,
        policy_layer_sizes=(512, 512, 512, 512, 512, 512),
        critic_layer_sizes=(256, 256, 256),
        checkpoints_save_folder="/kaggle/working/models/coyote_long_run/",
        checkpoint_load_folder="/kaggle/input/dribblegp2",
        save_every_ts=500_000,         # checkpoint every half-million timesteps
        pretrained_policy_path="/kaggle/input/botmodel/best_coyote_bot.pt",
    )

    # --- START TRAINING ---
    # Make sure the checkpoint root exists before the learner is created.
    model_dir = HPARAMS["checkpoints_save_folder"]
    os.makedirs(model_dir, exist_ok=True)

    agent = KaggleLearner(env_create_function=build_env, **HPARAMS)

    print(f"Starting long-duration training for {os.path.basename(os.path.normpath(model_dir))}...")
    print(f"Using {HPARAMS['n_proc']} CPUs.")

    agent.learn()



Using device cpu
Initializing processes...


100%|██████████| 4/4 [00:00<00:00, 248.92it/s]


Initializing PPO...
Trainable Parameters:
Component  Count     
--------------------
Policy     1564533   
Critic     161793    
--------------------
Total      1726326   
Current Policy Learning Rate: 1e-05
Current Critic Learning Rate: 1e-06
Loading pre-trained model weights from: /kaggle/input/botmodel/best_coyote_bot.pt
Checkpoint dictionary found, extracting 'model_state_dict'.
Renaming 'layers.' keys to 'model.' for compatibility...
Successfully loaded pre-trained policy weights.
Loading from checkpoint at /kaggle/input/dribblegp2
LOADED RUNNING STATS FROM JSON | Mean: [12.31835126] | Variance: [80634574.8673468] | Count: 337350
LOADED RUNNING STATS FROM JSON | Mean: [12.31835126] | Variance: [80634574.8673468] | Count: 337350
Checkpoint loaded!
Learner successfully initialized!
Starting long-duration training for coyote_long_run...
Using 4 CPUs.
Starting training loop. This will run until the timestep limit is reached or the notebook is stopped.

--------BEGIN ITERATION REPORT--

Process ForkServerProcess-75:
Process ForkServerProcess-74:
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/kaggle/working/rlgym-ppo/rlgym_ppo/batched_agents/batched_agent.py", line 101, in batched_agent_process
    message_bytes = pipe.recv(4096)
                    ^^^^^^^^^^^^^^^
KeyboardInterrupt
Process ForkServerProcess-73:
Traceback (most recent call last):
  File "/usr/lib/python3.11/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/lib/python3.11/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/kaggle/working/rlgym-ppo/rlgym_ppo/batched_agents/batched_agent.py", line 101, in batched_agent_process
    message_bytes = pipe.recv(4096)
                    ^^^^^^^^^^^^^^^
KeyboardInterrupt
Tr

KeyboardInterrupt: 

In [60]:
import os
import shutil

# Checkpoint artefact from the training run, and the folder to export it to.
src = "/kaggle/working/models/coyote_long_run/-1758662684247525730/118563098/PPO_POLICY_OPTIMIZER.pt"
dst_dir = "/kaggle/output"

# Create the export folder up front so the copy cannot fail on a missing parent.
os.makedirs(dst_dir, exist_ok=True)

# copy2 copies the file contents together with its metadata.
dst = os.path.join(dst_dir, "PPO_POLICY_OPTIMIZER.pt")
shutil.copy2(src, dst)

print(f"Copied {src} -> {dst}")


Copied /kaggle/working/models/coyote_long_run/-1758662684247525730/118563098/PPO_POLICY_OPTIMIZER.pt -> /kaggle/output/PPO_POLICY_OPTIMIZER.pt
