In [None]:
#load libraries
import pandas as pd
import numpy as np
import joblib
from collections import defaultdict
import matplotlib.pyplot as plt
import gc
import os


from rl_utils.ppo_agent import PPOAgent
from rl_utils.ppo_agent import EnhancedRewardCalculator, EnhancedStateHandler

import rl_utils.abm3 as abm3
import rl_utils.repositioning as repositioning

from rl_utils.PPOSystemMetrics import SystemMonitor
from rl_utils.PPOAgentVisualizer import RLAgentVisualizer

In [21]:
# Raw order records; later cells filter them to the training window and feed them to the simulation.
orders = pd.read_csv(filepath_or_buffer="Data/TrainData.csv")
# Model object handed to the abm3 assignment helpers in the training loop.
# joblib.load unpickles the file, so it must only ever be pointed at a trusted artifact.
rejection_model = joblib.load(filename='Data/rejection_model.joblib')


In [None]:
# Training uses the day exactly one week (7 * 86400 s) before the reference epoch.
initial_timestart = 1666591200-86400*7

# Window start: the epoch converted to a timestamp and shifted by 8 hours.
# NOTE(review): the 8 h shift looks like a timezone offset of `platform_order_date` — confirm.
start_time = pd.Timestamp(initial_timestart, unit='s') + pd.Timedelta(8, unit='h')

# The window covers three hours from start_time.
end_time = start_time + pd.Timedelta(3, unit='h')


In [None]:
# Parse the order timestamps and keep only orders strictly inside (start_time, end_time).
orders['platform_order_date'] = pd.to_datetime(orders['platform_order_date'])
in_training_window = (
    (orders['platform_order_date'] > start_time) &
    (orders['platform_order_date'] < end_time)
)
# .copy() detaches the filtered rows from the original frame, so adding `time_bin`
# below writes to an independent DataFrame instead of a slice
# (avoids pandas' SettingWithCopyWarning / ambiguous chained assignment).
orders = orders.loc[in_training_window].copy()

# Create bins for "demand prediction"; here the actual demand is used for repositioning.
orders['time_bin'] = orders['platform_order_date'].dt.floor('15min')

# Number of orders per (15-minute bin, hex cell).
actual_demand = (
    orders.groupby(['time_bin', 'hex_id'])
    .size()
    .reset_index(name='actual_order_count')
)

# Same layout as the predicted values used in ABM.ipynb: one row per time bin,
# one column per hex_id, with 0 where a hex had no orders in that bin.
wide_actual = actual_demand.pivot(
    index='time_bin',
    columns='hex_id',
    values='actual_order_count'
).fillna(0)

# {time_bin: {hex_id: order_count}} lookup consumed by the simulation.
pre_binned_demand = wide_actual.to_dict(orient='index')

In [None]:
# --- Agent, reward and monitoring components ---------------------------------
state_handler = EnhancedStateHandler(grid=None)  # builds the state features
reward_calculator = EnhancedRewardCalculator(base_reward=1.0)  # reward handler with base values

# The training loop builds each state as [order-specific feature] + global features,
# so state_dim must equal 1 + the number of global features.
# NOTE(review): 4 implies three global features — confirm against EnhancedStateHandler.
state_dim = 4
action_dim = 2  # [0: direct, 1: split]
agent = PPOAgent(state_dim=state_dim, action_dim=action_dim)
monitor = SystemMonitor()
visualizer = RLAgentVisualizer(agent, state_handler, grid=None, monitor=monitor)

# --- Training schedule ---------------------------------------------------------
NUM_EPISODES = 200
TRAIN_INTERVAL = 512  # agent decisions between two PPO updates
WARMUP_DURATION_SECONDS = 30 * 60  # before this, action 0 is forced and nothing is learned

# --- Simulation constants ------------------------------------------------------
constants = {
    # Reuse the epoch defined with the training window so both cannot drift apart.
    'initial_timestart': initial_timestart,
    'SPEED_HEX_PER_STEP': 8,
    'simulation_duration_hours': 3,
    'steps': 30,  # step size used to advance current_time in the training loop
    'repositioning_interval': 15 * 60,
    'MAX_ACCEPTABLE_DELAY_SECONDS': 5 * 60,
    'MAX_QUEUE_ATTEMPTS': 20,
    'pre_binned_demand': pre_binned_demand,
    'MACRO_RESOLUTION': 8,
    'WORK_RESOLUTION': 13
}

# Work on a copy so the simulation never touches the filtered `orders` frame.
sim_data_master = orders.copy()
timestart = constants['initial_timestart']

# Keep simulating for a buffer after the last order arrives.
# NOTE(review): assumes `platform_order_time` is in the same unit as `timestart` (epoch seconds) — confirm.
last_order_time = sim_data_master['platform_order_time'].max()
TIME_BUFFER_SECONDS = 15 * 60
timeend = last_order_time + TIME_BUFFER_SECONDS

# `rejection_model` is already loaded in the data-loading cell; it is not read from disk again here.

# Per-episode rewards plus per-update losses/entropy, keyed by metric name.
episode_metrics = defaultdict(list)

# Fleet-size randomisation parameters used at the start of every episode.
base_coverage = 0.8
variation_range = 0.3
target_utilization = 0.25

learning_start_time = timestart + WARMUP_DURATION_SECONDS

In [25]:
# ===================================================================
# MAIN TRAINING LOOP
# ===================================================================
# Each episode replays the same order window with a randomly sized fleet:
#   1. a fleet warm-up before the window opens (repositioning only),
#   2. a step-wise simulation in which every pending order is either force-assigned
#      (too many queue attempts) or decided by the PPO agent,
#   3. a final PPO update, metric bookkeeping, periodic plots and a model save.
experienced_states_for_plotting = []

# Output directory for checkpoint plots; it never changes, so create it once.
plot_dir = "newplots"
os.makedirs(plot_dir, exist_ok=True)

for episode in range(NUM_EPISODES):

    current_time = timestart
    order_queue = []
    availability_history = []

    # Fleet sizing for this episode.
    # NOTE(review): orders_in_window is already divided by the duration in hours and the
    # denominator below contains another factor 3 — confirm the double division is intended.
    avg_order_duration = 1800
    orders_in_window = len(sim_data_master) / constants.get('simulation_duration_hours', 3)
    required_courier_hours = (orders_in_window * avg_order_duration) / (3600*3)

    # Randomise coverage per episode (clamped to [0.3, 0.9]); never fewer than 5 couriers.
    coverage_adjustment = np.random.uniform(-variation_range, variation_range)
    adjusted_coverage = max(0.3, min(0.9, base_coverage + coverage_adjustment))
    num_couriers_for_episode = max(int(required_courier_hours / target_utilization * adjusted_coverage), 5)

    couriers = abm3.initiate_couriers(
        total_couriers_to_create=num_couriers_for_episode,
        data_source_df=sim_data_master
    )

    episode_reward_sum = 0
    decision_counter = 0
    delivered_orders = set()
    # Loss/entropy entries appended from this index on belong to the current episode.
    episode_loss_start = len(episode_metrics['actor_loss'])

    print(f"--- Starte Episode {episode + 1}/{NUM_EPISODES} | Kuriere: {num_couriers_for_episode} ---")

    # --- Fleet warm-up: we assume the fleet would not start from a cold state ---
    warmup_seconds = 15 * 60
    warmup_start_time = constants['initial_timestart'] - warmup_seconds

    # Use the pre-binned demand of the first time slot as the warm-up target.
    # It does not depend on the warm-up step, so it is looked up once per episode.
    first_bin_key = pd.to_datetime(constants['initial_timestart'], unit='s').floor('15min') + pd.Timedelta(hours=8)
    dynamic_demand = pre_binned_demand.get(first_bin_key, {})

    for t in range(warmup_start_time, constants['initial_timestart'], constants['steps']):
        # Move couriers that are on a repositioning task.
        # NOTE(review): `timestart` is passed here although the loop time is `t` — confirm
        # whether move_couriers_new should receive `t` instead.
        couriers, _, delivered_order_ids = abm3.move_couriers_new(
            couriers,
            timestart,
            (0, 0, 0, 0),  # metrics are tracked externally now
            delivered_orders,
            constants['SPEED_HEX_PER_STEP'],
            constants['steps']
        )

        # Assign repositioning tasks.
        if dynamic_demand:
            repositioning.run_repositioning_strategy(
                couriers, dynamic_demand, t, [],
                constants['SPEED_HEX_PER_STEP'], constants['steps'],
                constants['MACRO_RESOLUTION'], constants['WORK_RESOLUTION']
            )

    while current_time < timeend:

        # 1. SYSTEM UPDATE: move couriers and process completed orders.
        couriers, delivered_orders = abm3.update_couriers_and_system(
            current_time, constants['steps'], couriers, delivered_orders, constants, pre_binned_demand, order_queue
        )

        # Track the share of idle couriers among the active ones once learning has started.
        if current_time >= learning_start_time:
            active_couriers = [c for c in couriers if c.state != 'INACTIVE']
            if active_couriers:
                idle_couriers = [c for c in active_couriers if c.state == 'IDLE']
                current_availability = len(idle_couriers) / len(active_couriers)
                availability_history.append(current_availability)

        # 2. DATA FETCH: get the new orders for this time step.
        new_orders = abm3.get_new_orders(current_time, constants['steps'], sim_data_master)

        # 3. QUEUE LOGIC: carried-over orders keep their attempt count, new orders start at 0.
        all_pending_orders = order_queue + [(order, 0) for order in new_orders]
        next_order_queue = []  # queue for the next time step
        # Global features are computed once per step and shared by all orders of that step.
        global_features = state_handler.get_global_state_features(couriers, all_pending_orders)

        # 4. DECISION MAKING: process every pending order.
        for order, attempts in all_pending_orders:

            success = False  # decides whether the order stays in the queue

            # 4a. FALLBACK: the order has been waiting for too many attempts.
            if attempts > constants.get('MAX_QUEUE_ATTEMPTS', 5):
                # Force a standard assignment without consulting the agent.
                processed_ids_set = set()  # placeholder
                success, _ = abm3.handle_standard_assignment(
                    order, attempts, couriers, current_time, constants, rejection_model, processed_ids_set
                )
                # No agent training here because this was a rule-based decision.

            # 4b. AGENT LOGIC: regular processing by the PPO agent.
            else:
                # State = [order-specific feature] followed by the shared global features.
                order_distance_feature = state_handler.get_order_specific_feature(order)
                state_features = np.concatenate(([order_distance_feature], global_features))
                experienced_states_for_plotting.append(state_features)

                # --- WARM-UP PHASE: always take the default action, nothing is stored ---
                if current_time < learning_start_time:
                    action = 0  # default action
                    _, _, success = abm3.execute_decision_for_order(
                        order, action, couriers, current_time, constants,
                        reward_calculator, state_handler, rejection_model, state_features
                    )
                # --- LEARNING PHASE ---
                else:
                    action, log_prob, action_probs_tensor = agent.select_action(state_features)
                    action_probs = action_probs_tensor.numpy()[0]  # convert to a NumPy array
                    # Execute the action and collect the outcome (including `success`).
                    reward, done, success = abm3.execute_decision_for_order(
                        order, action, couriers, current_time, constants,
                        reward_calculator, state_handler, rejection_model, state_features=state_features
                    )
                    visualizer.record_decision(state_features, action, reward, action_probs)

                    # Store the experience and train every TRAIN_INTERVAL decisions.
                    agent.store_transition(state_features, action, reward, log_prob.numpy(), done)
                    episode_reward_sum += reward
                    decision_counter += 1
                    if decision_counter % TRAIN_INTERVAL == 0:
                        actor_loss, critic_loss, entropy = agent.train()
                        episode_metrics['actor_loss'].append(actor_loss)
                        episode_metrics['critic_loss'].append(critic_loss)
                        episode_metrics['entropy'].append(entropy)

            # 5. QUEUE LOGIC: unassigned orders are retried in the next step.
            if not success:
                next_order_queue.append((order, attempts + 1))

        # Adopt the updated queue for the next time step.
        order_queue = next_order_queue

        # 6. Advance time.
        current_time += constants['steps']

    # --- END OF EPISODE ---
    # Train on whatever experience is still in the agent's buffer.
    if len(agent.states) > 0:
        print("  -> Finales Training am Ende der Episode...")
        agent.train()

    del couriers
    del availability_history
    gc.collect()

    avg_reward_per_decision = episode_reward_sum / decision_counter if decision_counter > 0 else 0
    episode_metrics['rewards'].append(avg_reward_per_decision)

    # Average only the updates recorded during THIS episode. Losses are appended once per
    # TRAIN_INTERVAL decisions, so slicing by decision_counter would reach into earlier
    # episodes; an episode without any update reports 0 instead of NaN.
    episode_actor_losses = episode_metrics['actor_loss'][episode_loss_start:]
    episode_critic_losses = episode_metrics['critic_loss'][episode_loss_start:]
    episode_entropies = episode_metrics['entropy'][episode_loss_start:]
    avg_actor_loss = np.mean(episode_actor_losses) if episode_actor_losses else 0
    avg_critic_loss = np.mean(episode_critic_losses) if episode_critic_losses else 0
    avg_entropy = np.mean(episode_entropies) if episode_entropies else 0

    print(f"Episode {episode + 1} beendet. Avg Reward: {avg_reward_per_decision:.3f} | "
        f"Actor Loss: {avg_actor_loss:.3f} | Critic Loss: {avg_critic_loss:.3f} | "
        f"Entropy: {avg_entropy:.3f}\n")

    # NOTE(review): this clears the decision history on episodes 1, 6, 11, ..., so a checkpoint
    # at episode 50 only sees decisions from episodes 47-50 — confirm this is intended.
    if (episode + 1) % 5 == 1:
        visualizer.clear_history()

    if (episode + 1) % 50 == 0:
        print("\n--- Evaluierungs-Checkpoint ---")

        if experienced_states_for_plotting:  # only plot if data has been collected
            fig, ax = plt.subplots(figsize=(10, 8))
            fig.suptitle('Learned Policy with Experienced States Overlay', fontsize=16)

            # Draw the policy heatmap.
            im = visualizer.visualize_unified_policy(ax)
            fig.colorbar(im, ax=ax, label="Probability of 'Split' (Action 1)")

            plt.tight_layout(rect=[0, 0.03, 1, 0.95])
            policy_plot_filename = os.path.join(plot_dir, f"policy_episode_{episode + 1}_experience.svg")
            fig.savefig(policy_plot_filename, format='svg', bbox_inches='tight')
            # pyplot.show() takes no figure argument; close explicitly so figures do not pile up.
            plt.show()
            plt.close(fig)

        print("  -> Erstelle Plot zur Entscheidungsverteilung...")
        fig_dist = visualizer.visualize_decision_distribution()
        if fig_dist:
            dist_plot_filename = os.path.join(plot_dir, f"decision_dist_episode_{episode + 1}.svg")
            fig_dist.savefig(dist_plot_filename, format='svg', bbox_inches='tight')
            plt.show()
            plt.close(fig_dist)

        print(" -> Erstelle Plot zur Belohnung vs. Zuversicht...")
        fig_confidence = visualizer.visualize_reward_confidence()
        if fig_confidence:
            confidence_plot_filename = os.path.join(plot_dir, f"confidence_episode_{episode + 1}.svg")
            fig_confidence.savefig(confidence_plot_filename, format='svg', bbox_inches='tight')
            plt.show()
            plt.close(fig_confidence)

        print(" -> Erstelle Plot zur Aktions-Wahrscheinlichkeit...")
        fig_probs = visualizer.visualize_action_probabilities()
        if fig_probs:
            probs_plot_filename = os.path.join(plot_dir, f"probability_episode_{episode + 1}.svg")
            fig_probs.savefig(probs_plot_filename, format='svg', bbox_inches='tight')
            plt.show()
            plt.close(fig_probs)

        # Clear the history so the next analysis starts fresh.
        visualizer.clear_history()

        # Reward curve over all episodes so far, with a moving average.
        rewards = episode_metrics['rewards']
        df = pd.DataFrame({'rewards': rewards})

        window_size = 30
        df['moving_average'] = df['rewards'].rolling(window=window_size).mean()

        fig_reward = plt.figure(figsize=(12, 6))
        plt.plot(df.index, df['rewards'], alpha=0.3, label='Belohnung pro Episode')
        plt.plot(df.index, df['moving_average'], color='red', linewidth=2, label=f'Gleitender Durchschnitt ({window_size} Episoden)')

        plt.title('Trainingsfortschritt: Belohnung über Episoden', fontsize=16)
        plt.xlabel('Episode', fontsize=12)
        plt.ylabel('Avg Reward per Decision', fontsize=12)
        plt.legend()
        plt.grid(True)
        reward_plot_filename = os.path.join(plot_dir, f"reward_episode_{episode + 1}.svg")
        plt.savefig(reward_plot_filename, format='svg', bbox_inches='tight')
        plt.show()
        plt.close(fig_reward)

    # Save a model snapshot after every episode (interval of 1).
    if (episode + 1) % 1 == 0:
        agent.save_models(f"final_ppo_agent_{episode + 1}")

print("Training abgeschlossen und finale Modelle gespeichert!")

--- Starte Episode 1/300 | Kuriere: 1142 ---


KeyboardInterrupt: 