In [None]:
#load libraries
import pandas as pd
import numpy as np
import copy
from collections import deque, defaultdict
import joblib
import h3
import matplotlib.pyplot as plt
import os
import gc

import q_utils.abmQ as abm3
import q_utils.repositioning as repositioning2
from q_utils.RLSystemMetrics import SystemMonitor
from q_utils.Q_agent import QLearningAgent, EnhancedRewardCalculator, EnhancedStateHandler
from q_utils.RLAgentVisualizer import RLAgentVisualizer

In [None]:
# Read the training orders from disk; `data_4` is the master order frame
# that the following cells filter, bin and replay.
train_csv_path = "Data/TrainData.csv"
data_4 = pd.read_csv(train_csv_path)

In [None]:
# Build the realised ("actual") demand for the training window: number of
# orders per 15-minute bin and hex cell, exported as a nested dict
# {time_bin -> {hex_id -> count}} in `pre_binned_demand`.

# Unix time for one week before test data as training data
initial_timestart = 1666591200-86400*7

# Window start on the order-timestamp axis: simulation start shifted by +8 h.
# NOTE(review): presumably the offset between the unix clock and the
# timezone of `platform_order_date` — confirm.
start_time = pd.to_datetime(initial_timestart, unit='s') + pd.Timedelta(hours=8)

# 3 hours
end_time = start_time + pd.Timedelta(hours=3)

# Parse the order timestamps. This intentionally converts the column on
# `data_4` itself (the original code did so through an alias), so frames
# copied from `data_4` later also see datetime values.
data_4['platform_order_date'] = pd.to_datetime(data_4['platform_order_date'])

# Filter for time (both bounds are exclusive)
in_window = (
    (data_4['platform_order_date'] > start_time) &
    (data_4['platform_order_date'] < end_time)
)
# Take an explicit copy: adding a column to a filtered slice is what raised
# the SettingWithCopyWarning in the previous version of this cell.
orders = data_4.loc[in_window].copy()

# Create bins
orders['time_bin'] = orders['platform_order_date'].dt.floor('15min')

# Number of orders per bin
actual_demand = (
    orders.groupby(['time_bin', 'hex_id'])
    .size()
    .reset_index(name='actual_order_count')
)

# Same format as for prediction: one row per time bin, one column per hex,
# bin/hex combinations without orders filled with 0
wide_actual = actual_demand.pivot(
    index='time_bin',
    columns='hex_id',
    values='actual_order_count'
).fillna(0)
pre_binned_demand = wide_actual.to_dict(orient='index')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  orders['time_bin'] = orders['platform_order_date'].dt.floor('15min')


In [None]:
# Training configuration: RL components, simulation constants, time horizon
# and fleet-sizing parameters used by the training loop.

# State handler, reward calculator, Q-agent and, for visuals, the system monitor and visualizer
state_handler = EnhancedStateHandler(n_distance_bins=10, n_courier_bins=10)
reward_calculator = EnhancedRewardCalculator(base_reward=1.0)
agent = QLearningAgent(learning_rate=0.2, discount_factor=0.95, epsilon=1.0)
monitor = SystemMonitor()
visualizer = RLAgentVisualizer(agent, state_handler, grid=None, monitor=monitor)

# Amount by which the simulation clock advances per loop iteration
steps = 30

# The constants as for the ABM
constants = {
    # Same start timestamp that was used to bin the demand
    'initial_timestart': initial_timestart,
    'SPEED_HEX_PER_STEP': 8,
    'simulation_duration_hours': 3,
    'steps': steps,
    'repositioning_interval': 15 * 60,
    'MAX_ACCEPTABLE_DELAY_SECONDS': 5 * 60,
    # Queue retries after which an order is handed to the rule-based assignment
    'MAX_QUEUE_ATTEMPTS': 20,
    'pre_binned_demand': pre_binned_demand,
    'MACRO_RESOLUTION': 8,
    'WORK_RESOLUTION': 13
}

NUM_EPISODES = 500
# Factor passed to agent.decrease_epsilon() after every episode
epsilon_decay = 0.995
# NOTE(review): no random seed is set anywhere in this notebook, so runs are
# not reproducible — consider seeding once the RNGs used by q_utils are known.

# Order data replayed in every episode (a copy, so data_4 stays untouched)
sim_data_master = data_4.copy()
rejection_model = joblib.load('Data/rejection_model.joblib')

# Simulated time horizon. The end is data-driven: 15 minutes after the last
# order in the data. (An earlier value derived from
# 'simulation_duration_hours' was always overwritten by this one and has
# been removed.)
# NOTE(review): assumes 'platform_order_time' is on the same unix clock as
# `timestart` — confirm.
timestart = constants['initial_timestart']
last_order_time = sim_data_master['platform_order_time'].max()
TIME_BUFFER_SECONDS = 15 * 60
timeend = last_order_time + TIME_BUFFER_SECONDS

# Per-episode results collected by the training loop
episode_metrics = defaultdict(list)

# Fleet sizing: each episode draws a coverage of
# base_coverage +/- variation_range and scales the courier count with it
base_coverage = 0.8
variation_range = 0.3
target_utilization = 0.25

# Until this much simulated time has passed, the training loop always uses
# action 0 and does not update the agent
WARMUP_DURATION_SECONDS = 30*60

learning_start_time = timestart + WARMUP_DURATION_SECONDS

In [None]:
# ---------------------------------------------------------------------------
# Q-learning training loop
#
# Every episode replays the orders in `sim_data_master` with a freshly created
# courier fleet of randomised size and runs three phases:
#   1. pre-warmup: couriers are moved / repositioned for the 15 minutes
#      before the simulation clock starts,
#   2. warmup: until `learning_start_time` every order gets action 0 and the
#      agent is not updated,
#   3. learning: the agent chooses the action and learns from the reward.
# Orders that could not be assigned are re-queued; once their retry count
# exceeds MAX_QUEUE_ATTEMPTS they go to the rule-based standard assignment.
# ---------------------------------------------------------------------------

# --- Episode-invariant inputs (computed once instead of once per episode) ---

# Logic for calculating dynamic courier numbers
avg_order_duration = 1800
orders_in_window = len(sim_data_master) / constants.get('simulation_duration_hours', 3)
# NOTE(review): orders_in_window is already divided by the window length and
# is divided by 3 h again here — confirm this is the intended sizing formula.
required_courier_hours = (orders_in_window * avg_order_duration) / (3600*3)

# Pre-warmup window and the demand of the first 15-minute bin, which drives
# repositioning during the pre-warmup
warmup_seconds = 15 * 60
warmup_start_time_pre = constants['initial_timestart'] - warmup_seconds
first_bin_key = pd.to_datetime(constants['initial_timestart'], unit='s').floor('15min') + pd.Timedelta(hours=8)
dynamic_demand = pre_binned_demand.get(first_bin_key, {})

# Output folder for the checkpoint plots
plot_dir = "training_plots"
os.makedirs(plot_dir, exist_ok=True)

for episode in range(NUM_EPISODES):

    # Fleet size: required couriers scaled by a randomly perturbed coverage
    # (clamped to [0.3, 0.9]), never fewer than 5
    coverage_adjustment = np.random.uniform(-variation_range, variation_range)
    adjusted_coverage = max(0.3, min(0.9, base_coverage + coverage_adjustment))
    num_couriers_for_episode = max(int(required_courier_hours / target_utilization * adjusted_coverage), 5)

    couriers = abm3.initiate_couriers(
        total_couriers_to_create=num_couriers_for_episode,
        data_source_df=sim_data_master
    )

    # Initialising the episode
    order_queue = []
    episode_reward_sum = 0
    decision_counter = 0
    delivered_orders = set()
    current_time = timestart

    print(f"--- Starte Episode {episode + 1}/{NUM_EPISODES} | Epsilon: {agent.epsilon:.3f} | Kuriere: {len(couriers)} ---")

    # Pre warmup for courier repositioning
    for t in range(warmup_start_time_pre, constants['initial_timestart'], constants['steps']):
        # NOTE(review): this passes `timestart`, not the loop time `t` — confirm intended.
        couriers, _, _ = abm3.move_couriers_new(couriers, timestart, (0,0,0,0), delivered_orders, constants['SPEED_HEX_PER_STEP'], constants['steps'])
        if dynamic_demand:
            repositioning2.run_repositioning_strategy(couriers, dynamic_demand, t, [], constants['SPEED_HEX_PER_STEP'], constants['steps'], constants['MACRO_RESOLUTION'], constants['WORK_RESOLUTION'])

    while current_time < timeend:

        # Update the system
        couriers, delivered_orders = abm3.update_couriers_and_system(
            current_time, constants['steps'], couriers, delivered_orders, constants, pre_binned_demand, order_queue
        )

        # Prepare orders: queued orders keep their retry count, new ones start at 0
        new_orders = abm3.get_new_orders(current_time, constants['steps'], sim_data_master)
        orders_to_process = order_queue + [(order, 0) for order in new_orders]
        next_order_queue = []

        # Actions actually taken on the NEW orders of this step (for the monitor).
        # Recording them here replaces the former re-query of the agent after
        # the step, which could log a different (re-sampled) action than the
        # one that was executed.
        decisions_in_step = []

        # Decide and learn
        for order, attempts in orders_to_process:
            # Needed to decide whether the job remains in the queue
            success = False

            # When an order waits too long: rule-based assignment, no training
            if attempts > constants.get('MAX_QUEUE_ATTEMPTS', 5):
                processed_ids_set = set()
                success, _ = abm3.handle_standard_assignment(
                    order, attempts, couriers, current_time, constants, rejection_model, processed_ids_set
                )

            # Follow up of agent
            else:
                state_features = state_handler.get_state_features(order, couriers)

                # 30 minutes warmup for courier
                if current_time < learning_start_time:
                    action = 0 #then always direct delivery

                    # Reward is ignored and no training takes place
                    _reward, _done, success = abm3.execute_decision_for_order(
                        order, action, couriers, current_time, constants,
                        reward_calculator, state_handler, rejection_model, state_features
                    )

                # After 30 minutes
                else:
                    state = state_handler.discretize_state(state_features)

                    # get action of agent from table
                    action = agent.get_action(state)

                    # execute decision and receive reward
                    reward, _done, success = abm3.execute_decision_for_order(
                        order, action, couriers, current_time, constants,
                        reward_calculator, state_handler, rejection_model, state_features
                    )

                    # Get next state: same order, courier situation after the decision
                    next_state_features = state_handler.get_state_features(order, couriers)
                    next_state = state_handler.discretize_state(next_state_features)

                    # Agent learns
                    agent.learn(state, action, reward, next_state)

                    # Update metrics
                    episode_reward_sum += reward
                    decision_counter += 1
                    visualizer.record_decision(order, state_features, state, action, reward, couriers)

                if attempts == 0: # This was a new order
                    decisions_in_step.append(action)

            # Update Queue
            if not success:
                next_order_queue.append((order, attempts + 1))

        order_queue = next_order_queue

        # Feed the monitor with aggregated data of this timestep
        # (it is only updated while at least one courier is not INACTIVE)
        active_couriers = [c for c in couriers if c.state != 'INACTIVE']
        if active_couriers:
            available_count = sum(1 for c in active_couriers if c.state == 'IDLE')
            busy_count = len(active_couriers) - available_count
            inactive_count = len(couriers) - len(active_couriers)

            # Record the state of this timestep in the monitor; the last
            # argument is the running average reward per decision so far
            monitor.record_step(
                available_count,
                busy_count,
                inactive_count,
                new_orders,
                decisions_in_step,
                [episode_reward_sum / decision_counter if decision_counter > 0 else 0]
            )

        current_time += constants['steps']


    # End of episode
    monitor.end_episode() #monitor episode
    agent.decrease_epsilon(decay=epsilon_decay) #update epsilon

    avg_reward_per_decision = episode_reward_sum / decision_counter if decision_counter > 0 else 0 #calculate average reward per decision
    episode_metrics['rewards'].append(avg_reward_per_decision) #append it for metrics in a list

    print(
        f"Episode {episode + 1} done. "
        f"Decisions: {decision_counter} | "
        f"Avg. Reward/Order: {avg_reward_per_decision:.3f}"
    )

    # --- Visualisation: every 50 episodes save policy and learning-progress plots ---
    if (episode + 1) % 50 == 0:
        print("\n--- Evaluations-Checkpoint ---")
        try:
            fig_policy = visualizer.visualize_policy()
            policy_plot_filename = os.path.join(plot_dir, f"policy_episode_{episode + 1}.svg")
            plt.savefig(policy_plot_filename, format='svg', bbox_inches='tight')
            plt.close()

            fig_per_decision = visualizer.visualize_learning_progress(reward_metric='per_decision')
            learning_plot_filename = os.path.join(plot_dir, f"learning_episode_{episode + 1}.svg")
            plt.savefig(learning_plot_filename, format='svg', bbox_inches='tight')
            plt.close()

        except Exception as e:
            # Plotting is best-effort: report the failure and keep training
            print(f"Checkpoint plotting failed ({type(e).__name__}): {e}")
        finally:
            print("---------------------------\n")

--- Starte Episode 1/1000 | Epsilon: 1.000 | Kuriere: 743 ---


KeyboardInterrupt: 

In [None]:
# Persist the learned Q-table. The path is held in one variable so the
# confirmation message always names the file that was actually written
# (the message previously referred to a different file name).
q_table_path = 'q_learning_agent_nbins_00_1000.joblib'
joblib.dump(agent.q_table, q_table_path)

print(f"Training abgeschlossen und Q-Tabelle in '{q_table_path}' gespeichert!")