In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from collections import deque, defaultdict
import random
import matplotlib.pyplot as plt
from tqdm import tqdm

from IPython import get_ipython
from IPython import display
from dyna_env_drifttype import TaskEnv_driftype

In [21]:
class DQN(nn.Module):
    """Fully connected Q-network: two 16-unit ReLU hidden layers and a linear output head."""

    def __init__(self, state_size, action_size):
        super().__init__()
        hidden_units = 16
        # Layers are created in fc1, fc2, fc3 order so seeded initialisation stays reproducible.
        self.fc1 = nn.Linear(state_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, action_size)

    def forward(self, x):
        """Return one Q-value per action for the input tensor ``x``."""
        hidden = self.fc1(x).relu()
        hidden = self.fc2(hidden).relu()
        return self.fc3(hidden)

class MetaDQNAgent:
    def __init__(self, env):
        """Build encoders, replay memory, networks and optimisers for ``env``.

        Args:
            env: Environment object. This constructor reads ``env.states`` and
                ``env.motions`` to size the encoders and the networks.
        """
        self.env = env

        # One-hot encoders over the environment's states and actions
        self.state_encoder = StateActionEncoder(env.states)
        self.action_encoder = StateActionEncoder(env.motions)

        self.state_size = len(env.states)
        self.action_size = len(env.motions)

        # Online setting: keep only a small window of recent transitions
        self.memory = deque(maxlen=100)
        self.gamma = 0.95    # discount rate
        self.epsilon = 0.2   # exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.batch_size = 4
        self.update_target_every = 50

        # Main network
        self.model = DQN(self.state_size, self.action_size)
        # Target network (for stability), started as a copy of the main network
        self.target_model = DQN(self.state_size, self.action_size)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)

        # Meta-learning components. Note that this is a second Adam instance
        # over the SAME parameters as ``self.optimizer``, with its own state.
        self.meta_optimizer = optim.Adam(self.model.parameters(), lr=0.0001)
        self.fast_weights = None
        self.meta_batch_size = 5
        self.meta_loss = None

        # Tracking (initialised once; the original repeated this block twice)
        self.rewards_history = []
        self.epsilons_history = []
        self.loss_history = []
        self.adaptation_scores = []

        # For action masking
        self.valid_actions_cache = {}

    def remember(self, state, action, reward, next_state, done):
        state_enc = self.state_encoder.encode(state)
        action_idx = self.action_encoder.encode(action)
        next_state_enc = self.state_encoder.encode(next_state)
        self.memory.append((state_enc, action_idx, reward, next_state_enc, done))
        

    
    def get_valid_actions(self, state):
        """Cache valid actions for each state to handle action masking"""
        if state not in self.valid_actions_cache:
            # Get all possible actions from environment
            self.valid_actions_cache[state] = list(range(len(self.env.motions)))
        return self.valid_actions_cache[state]
    
    def act(self, state, training=True):
        valid_actions = self.get_valid_actions(state)
        
        if training and np.random.rand() <= self.epsilon:
            return random.choice(valid_actions)
        
        state_enc = self.state_encoder.encode(state)
        state_enc = torch.FloatTensor(state_enc).unsqueeze(0)
        
        with torch.no_grad():
            action_values = self.model(state_enc)
        
        # Convert to numpy and mask invalid actions
        action_values = action_values.squeeze().numpy()
        masked_values = -np.inf * np.ones_like(action_values)
        masked_values[valid_actions] = action_values[valid_actions]
        
        return np.argmax(masked_values)
    

    def compute_loss(self, batch, model):
            # 解包批次数据
        states, actions, rewards, next_states, dones = zip(*batch)  
    # 转为张量并确保正确形状
        states = torch.FloatTensor(np.array(states))  # shape: [batch_size, state_dim]
        next_states = torch.FloatTensor(np.array(next_states))  # shape: [batch_size, state_dim] 
    # 关键修正：处理actions维度
        actions = torch.LongTensor(actions)  # 先转为1D张量
        if actions.dim() == 1:
            actions = actions.unsqueeze(1)  # 转为[batch_size, 1]
        elif actions.dim() > 2:
            actions = actions.squeeze()  # 去除多余维度
        if actions.dim() == 1:
            actions = actions.unsqueeze(1)
    
        rewards = torch.FloatTensor(rewards).unsqueeze(1)  # [batch_size, 1]
        dones = torch.FloatTensor(dones).unsqueeze(1)      # [batch_size, 1]
        self._validate_shapes(states, actions, rewards, next_states, dones)
        # 获取Q值
        q_values = model(states)  # [batch_size, action_size]
        
        # 收集实际采取动作的Q值
        current_q = q_values.gather(1, actions)  # [batch_size, 1]
        
        # 计算目标Q值
        with torch.no_grad():
            next_q = self.target_model(next_states).max(1)[0].unsqueeze(1)
            target_q = rewards + (1 - dones) * self.gamma * next_q
        
        return F.mse_loss(current_q, target_q)

    
    def _validate_shapes(self, states, actions, rewards, next_states, dones):
        """验证所有张量的形状是否正确"""
        assert states.dim() == 2, f"States should be 2D, got {states.dim()}"
        assert actions.dim() == 2, f"Actions should be 2D, got {actions.dim()}"
        assert rewards.dim() == 2, f"Rewards should be 2D, got {rewards.dim()}"
        assert next_states.dim() == 2, f"Next states should be 2D, got {next_states.dim()}"
        assert dones.dim() == 2, f"Dones should be 2D, got {dones.dim()}"
        
    def replay(self):
        if len(self.memory) < self.batch_size:
            return 0
        
        # Sample batch from memory
        batch = random.sample(self.memory, self.batch_size)
        loss = self.compute_loss(batch, self.model)
        
        # Backpropagation
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        return loss.item()
    
    def meta_update(self):
        if len(self.memory) < self.meta_batch_size * self.batch_size:
            return 0
        
        # 保存原始权重
        original_weights = {k: v.clone() for k, v in self.model.named_parameters()}
        
        task_losses = []
        for _ in range(self.meta_batch_size):
            # 采样一个任务批次
            batch = random.sample(self.memory, self.batch_size)
            
            # 内循环：计算梯度并创建快速权重
            with torch.enable_grad():  # 确保计算梯度
                loss = self.compute_loss(batch, self.model)
                gradients = torch.autograd.grad(loss, self.model.parameters(), create_graph=True)
                
                # 创建快速权重（保持计算图连接）
                fast_weights = {
                    name: param - self.learning_rate * grad
                    for (name, param), grad in zip(self.model.named_parameters(), gradients)
                }
            
            # 计算快速权重下的损失（保持计算图）
            with torch.enable_grad():
                # 临时使用快速权重进行前向传播
                def fast_forward(x):
                    x = F.relu(F.linear(x, fast_weights['fc1.weight'], fast_weights['fc1.bias']))
                    x = F.relu(F.linear(x, fast_weights['fc2.weight'], fast_weights['fc2.bias']))
                    return F.linear(x, fast_weights['fc3.weight'], fast_weights['fc3.bias'])
                
                task_loss = self.compute_loss(batch, fast_forward)
                task_losses.append(task_loss)
        
        # 外循环：元更新
        meta_loss = torch.mean(torch.stack(task_losses))
        self.meta_optimizer.zero_grad()
        meta_loss.backward()
        self.meta_optimizer.step()
        
        return meta_loss.item()
    
    def train(self, episodes, drift_episode=None):
        """Run the online training loop for ``episodes`` episodes.

        Each step: choose an action, step the environment, store the
        transition, and run one ``replay`` update. Per episode, the total
        reward and the current epsilon are appended to ``rewards_history`` and
        ``epsilons_history``.

        Args:
            episodes: Number of episodes to run.
            drift_episode: Episode index at which ``apply_drift`` is called
                before the episode starts, or ``None`` for no drift.
        """
        for e in tqdm(range(episodes)):
            # Apply drift at specified episode
            if drift_episode is not None and e == drift_episode:
                self.apply_drift()
            
            state = self.env.reset()
            total_reward = 0
            done = False
            
            while not done:
                action_idx = self.act(state)
                # Look up the env.motions entry for the chosen index; the env
                # itself is stepped with the index, the entry is what gets stored.
                action = self.env.motions[action_idx]  # Convert index back to action string
                next_state, reward, done, _ = self.env.step(action_idx)
                
                self.remember(state, action, reward, next_state, done)
                state = next_state
                total_reward += reward
                
                # Standard replay
                loss = self.replay()
                # replay() returns 0 when it skipped training; a falsy loss is not recorded.
                if loss:
                    self.loss_history.append(loss)
                
                # Meta-update less frequently
                # NOTE(review): this sits inside the step loop, so it runs on EVERY
                # step of each episode with e % 10 == 0 -- confirm that is intended.
                if e % 10 == 0:
                    self.meta_loss= self.meta_update()
            
            # Update target network periodically
            if e % self.update_target_every == 0:
                self.target_model.load_state_dict(self.model.state_dict())
            
            # Decay epsilon
            if self.epsilon > self.epsilon_min:
                self.epsilon *= self.epsilon_decay
            
            # Track progress
            self.rewards_history.append(total_reward)
            self.epsilons_history.append(self.epsilon)
            
            # Evaluate adaptation every 50 episodes
            # evaluate_adaptation() resets and steps the same self.env.
            if e % 50 == 0:
                adaptation_score = self.evaluate_adaptation()
                self.adaptation_scores.append(adaptation_score)
                print(f"Episode: {e}, Reward: {total_reward:.2f}, Adapt Score: {adaptation_score:.2f}, Epsilon: {self.epsilon:.2f}")


                
    def apply_drift(self):
        """Trigger a drift in the environment and rebuild the agent around it.

        Steps: call ``env.set_flag()`` and ``env.drift(...)``, rebuild both
        encoders from the environment's current states/actions, create new
        online and target networks of the new sizes, transfer the old weights
        into the online network, and recreate both optimisers.

        Fixes over the original:
          * The new target network is synchronised with the transferred online
            weights instead of keeping its random initialisation until the
            next periodic sync.
          * If either vocabulary changed, the replay memory is cleared, since
            transitions encoded with the old vocabulary no longer match the
            new network's input/output layout.
        """
        print("\nApplying environment drift...")
        self.env.set_flag()
        # Only the transition matrix changes (no actions added)
        self.env.drift(add_actions=0,change_at_states=['pp'],drift_dis_type='random')

        # Update encoders with new states/actions
        old_state_encoder = self.state_encoder
        old_action_encoder = self.action_encoder

        self.state_encoder = StateActionEncoder(self.env.states)
        self.action_encoder = StateActionEncoder(self.env.motions)

        # Create new networks with updated sizes
        old_model = self.model
        self.state_size = len(self.env.states)
        self.action_size = len(self.env.motions)

        self.model = DQN(self.state_size, self.action_size)
        self.target_model = DQN(self.state_size, self.action_size)

        # Initialize new layers with old weights where possible
        self._transfer_weights(old_model, old_state_encoder, old_action_encoder)
        # Keep the target network consistent with the transferred weights
        self.target_model.load_state_dict(self.model.state_dict())

        self.optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        self.meta_optimizer = optim.Adam(self.model.parameters(), lr=0.0001)

        # Drop transitions whose encoding no longer matches the new layout
        vocab_changed = (
            old_state_encoder.vocab != self.state_encoder.vocab
            or old_action_encoder.vocab != self.action_encoder.vocab
        )
        if vocab_changed:
            self.memory.clear()

        # Reset valid actions cache
        self.valid_actions_cache = {}

        print(f"Updated to {self.state_size} states and {self.action_size} actions")
    
    def _transfer_weights(self, old_model, old_state_encoder, old_action_encoder):
        """Transfer weights from old model to new model, handling dimension changes"""
        # Create mapping from old to new indices
        state_mapping = self._create_mapping(old_state_encoder.vocab, self.state_encoder.vocab)
        action_mapping = self._create_mapping(old_action_encoder.vocab, self.action_encoder.vocab)
        
        # Transfer weights layer by layer
        with torch.no_grad():
            # FC1 layer (input is state)
            self._transfer_fc_layer(old_model.fc1, self.model.fc1, state_mapping, None)
            
            # FC2 layer (no dimension changes)
            if old_model.fc2.weight.shape == self.model.fc2.weight.shape:
                self.model.fc2.load_state_dict(old_model.fc2.state_dict())
            
            # FC3 layer (output is action)
            self._transfer_fc_layer(old_model.fc3, self.model.fc3, None, action_mapping)
    
    def _transfer_fc_layer(self, old_layer, new_layer, input_mapping, output_mapping):
        """Helper function to transfer weights for a fully connected layer"""
        old_weights = old_layer.weight.data
        old_bias = old_layer.bias.data if old_layer.bias is not None else None
        
        new_weights = torch.zeros_like(new_layer.weight.data)
        new_bias = torch.zeros_like(new_layer.bias.data) if new_layer.bias is not None else None
        
        # Handle input dimension mapping
        if input_mapping is not None:
            for new_in, old_in in input_mapping.items():
                new_weights[:, new_in] = old_weights[:, old_in]
        else:
            min_in = min(old_weights.shape[1], new_weights.shape[1])
            new_weights[:, :min_in] = old_weights[:, :min_in]
        
        # Handle output dimension mapping
        if output_mapping is not None:
            for new_out, old_out in output_mapping.items():
                new_weights[new_out, :] = old_weights[old_out, :]
                if new_bias is not None:
                    new_bias[new_out] = old_bias[old_out]
        else:
            min_out = min(old_weights.shape[0], new_weights.shape[0])
            new_weights[:min_out, :] = old_weights[:min_out, :]
            if new_bias is not None:
                new_bias[:min_out] = old_bias[:min_out]
        
        new_layer.weight.data = new_weights
        if new_bias is not None:
            new_layer.bias.data = new_bias
    
    def _create_mapping(self, old_vocab, new_vocab):
        """Create mapping from old to new indices for common elements"""
        mapping = {}
        for new_idx, item in enumerate(new_vocab):
            if item in old_vocab:
                old_idx = old_vocab.index(item)
                mapping[new_idx] = old_idx
        return mapping
    
    def evaluate_adaptation(self, num_episodes=10):
        """Evaluate agent's current performance"""
        total_rewards = 0
        for _ in range(num_episodes):
            state = self.env.reset()
            episode_reward = 0
            done = False
            
            while not done:
                action_idx = self.act(state, training=False)
                next_state, reward, done, _ = self.env.step(action_idx)
                state = next_state
                episode_reward += reward
            
            total_rewards += episode_reward
        
        return total_rewards / num_episodes
    
    def plot_training(self, show_window=50):
        """Draw a 1x3 matplotlib figure: rewards, epsilon and adaptation scores.

        Args:
            show_window: Window length of the moving average drawn over the
                per-episode rewards (only drawn once at least that many
                episodes have been recorded).
        """
        plt.figure(figsize=(15, 5))
        
        # 1. Plot rewards
        plt.subplot(1, 3, 1)
        
        # Ensure we have enough episodes for moving average
        if len(self.rewards_history) >= show_window:
            moving_avg = np.convolve(self.rewards_history, np.ones(show_window)/show_window, mode='valid')
            # Correct x-axis coordinates for moving average
            # ('valid' mode yields len - show_window + 1 points, aligned to each window's last episode)
            x_ma = np.arange(show_window-1, len(self.rewards_history))
            plt.plot(x_ma, moving_avg, 'r-', linewidth=2, label=f'MA({show_window})')
        
        # Plot raw rewards
        plt.plot(self.rewards_history, alpha=0.3, label='Raw')
        plt.title('Rewards per Episode')
        plt.xlabel('Episode')
        plt.ylabel('Total Reward')
        plt.legend()
        
        # 2. Plot epsilon decay
        plt.subplot(1, 3, 2)
        plt.plot(self.epsilons_history)
        plt.title('Epsilon Decay')
        plt.xlabel('Episode')
        plt.ylabel('Epsilon')
        
        # 3. Plot adaptation scores
        plt.subplot(1, 3, 3)
        if len(self.adaptation_scores) > 0:
            # Calculate correct x-axis positions for adaptation scores
            # NOTE(review): the spacing is inferred as episodes / scores rather than
            # taken from the evaluation interval used in train() -- positions are
            # approximate when the episode count is not a multiple of that interval.
            eval_every = len(self.rewards_history) / len(self.adaptation_scores)
            x = np.arange(0, len(self.rewards_history), eval_every)
            # Ensure we don't have more points than scores
            x = x[:len(self.adaptation_scores)]
            plt.plot(x, self.adaptation_scores, 'g-', marker='o')
            plt.title('Adaptation Scores')
            plt.xlabel('Episode')
            plt.ylabel('Avg Reward (eval)')
        
        plt.tight_layout()
        plt.show()

class StateActionEncoder:
    """One-hot encoder/decoder over a sorted vocabulary of states or actions."""

    def __init__(self, items):
        # Sorting makes the index assignment independent of input order.
        self.vocab = sorted(items)
        self.item_to_idx = dict(zip(self.vocab, range(len(self.vocab))))

    def encode(self, item):
        """Return the one-hot vector for ``item``; unknown items give all zeros."""
        one_hot = np.zeros(len(self.vocab))
        position = self.item_to_idx.get(item)
        if position is not None:
            one_hot[position] = 1
        return one_hot

    def decode(self, encoding):
        """Return the vocabulary item at the argmax position of ``encoding``."""
        return self.vocab[np.argmax(encoding)]
    



In [4]:
import numpy as np
import pandas as pd
import plotly.express as px
from collections import defaultdict

def run_multiple_training(env_class, agent_class, episodes=2000, drift_episode=None, runs=2):
    """Train ``runs`` fresh agents and return their element-wise mean reward curve.

    Args:
        env_class: Zero-argument callable producing a new environment.
        agent_class: Callable taking the environment and returning an agent
            with ``train(episodes=..., drift_episode=...)`` and ``rewards_history``.
        episodes: Episodes per run.
        drift_episode: Forwarded to ``agent.train``.
        runs: Number of independent runs to average.

    Returns:
        ``np.mean`` over runs (axis 0) of the per-episode reward histories.
    """
    def _single_run():
        agent = agent_class(env_class())
        agent.train(episodes=episodes, drift_episode=drift_episode)
        return agent.rewards_history

    per_run_rewards = [_single_run() for _ in range(runs)]
    # Average across runs
    return np.mean(per_run_rewards, axis=0)

def smooth_rewards(rewards, window=20):
    """Return the trailing rolling mean of ``rewards`` as a list.

    The first ``window - 1`` entries are NaN because their window is incomplete.
    """
    series = pd.Series(rewards)
    rolling_mean = series.rolling(window=window).mean()
    return rolling_mean.tolist()

def compare_reward(r, rdrif):
    """Plot the smoothed reward curves of the no-drift and drift runs together.

    Args:
        r: Per-episode rewards without drift.
        rdrif: Per-episode rewards with drift.
    """
    smoothed = pd.DataFrame(data={
        'Without Drift': smooth_rewards(r),
        'With Drift': smooth_rewards(rdrif),
    })

    figure = px.line(
        smoothed,
        y=['Without Drift', 'With Drift'],
        title='Meta-DQN with Sudden Drift (Avg of 5 runs)',
    )
    figure.update_layout(xaxis_title='Episodes', yaxis_title='Average Episodic Reward')
    figure.show()

# === Run and Compare ===
# NOTE(review): this cell needs MetaDQNAgent and TaskEnv_driftype to be defined
# first; the recorded output below it is a NameError for MetaDQNAgent, which
# presumably comes from running the cell before the class-definition cell.
# Each call trains 5 fresh agents for 2000 episodes, so the cell is expensive.
# 1. No Drift
r = run_multiple_training(TaskEnv_driftype, MetaDQNAgent, episodes=2000, drift_episode=None, runs=5)

# 2. With Drift at Episode 400
rdrif = run_multiple_training(TaskEnv_driftype, MetaDQNAgent, episodes=2000, drift_episode=400, runs=5)

# 3. Compare and Plot
compare_reward(r, rdrif)


NameError: name 'MetaDQNAgent' is not defined

In [6]:
import plotly.express as px
def compare_reward(r,rdrif):
    """Plot the raw (unsmoothed) reward curves with and without drift.

    Args:
        r: Per-episode rewards without drift.
        rdrif: Per-episode rewards with drift (same length as ``r``).

    Changes over the original: the DataFrame that is built is now the object
    actually passed to plotly, and the debugging dump of the complete raw
    reward dict (one value per episode, per curve) is no longer printed.
    """
    df_eps = pd.DataFrame(data={'without': r, 'drift': rdrif})
    # Shape only: (number of episodes, 2)
    print(df_eps.shape)
    fig = px.line(df_eps, y=['without','drift'],title='Meta-DQN with sudden drift')
    fig.update_layout(xaxis_title = 'Episodes', yaxis_title = 'Average sum of reward')
    fig.show()

In [5]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
from collections import deque

class MetaMaskedDQN(nn.Module):
    """Three-layer Q-network with action masking and a functional forward pass.

    ``forward`` can run with an explicit parameter dict, which lets callers
    evaluate "fast weights" produced by ``adapt`` without touching the
    module's own parameters.
    """

    def __init__(self, state_dim, action_dim, hidden_dim=32):
        super().__init__()
        # Main network
        self.fc1 = nn.Linear(state_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, action_dim)

        # Meta-learning state
        self.fast_weights = None  # adapted weights, filled by clone_state()/adapt()
        self.meta_lr = 0.1        # step size of one adapt() call

    def forward(self, x, mask=None, params=None):
        """Return Q-values for ``x``; entries where ``mask`` is False become -inf.

        Args:
            x: Input batch.
            mask: Optional boolean tensor broadcastable to the output.
            params: Optional ``{parameter_name: tensor}`` dict to use instead
                of the module's own parameters.
        """
        weights = dict(self.named_parameters()) if params is None else params

        hidden = x
        for layer in ('fc1', 'fc2'):
            hidden = F.relu(F.linear(hidden, weights[f'{layer}.weight'], weights[f'{layer}.bias']))
        q_values = F.linear(hidden, weights['fc3.weight'], weights['fc3.bias'])

        if mask is None:
            return q_values
        return q_values.masked_fill(~mask, float('-inf'))

    def clone_state(self):
        """Copy the current parameters into ``fast_weights`` (start of an inner loop)."""
        self.fast_weights = {name: tensor.clone() for name, tensor in self.named_parameters()}

    def adapt(self, loss):
        """Take one differentiable gradient-descent step on ``fast_weights``."""
        names = list(self.fast_weights.keys())
        current = [self.fast_weights[name] for name in names]
        grads = torch.autograd.grad(loss, current, create_graph=True)
        self.fast_weights = {
            name: weight - self.meta_lr * grad
            for name, weight, grad in zip(names, current, grads)
        }

class MetaMaskedDQNAgent:
    def __init__(self, env, state_dim=5, gamma=0.99, lr=1e-3, 
                 epsilon_start=1.0, epsilon_end=0.01, epsilon_decay=0.995,
                 meta_batch_size=5, num_meta_updates=5):
        """Create the agent's networks, optimiser and replay memory.

        Args:
            env: Environment; ``env.action_space.n`` gives the number of actions.
            state_dim: Length of the state vector fed to the networks.
            gamma: Discount factor.
            lr: Learning rate of the Adam optimiser over ``policy_net``.
            epsilon_start: Initial exploration rate.
            epsilon_end: Lower bound of the exploration rate.
            epsilon_decay: Multiplicative epsilon decay factor.
            meta_batch_size: Number of stored samples drawn per ``meta_update``.
            num_meta_updates: Inner-loop gradient steps per sample.
        """
        self.env = env
        self.state_dim, self.action_dim = state_dim, env.action_space.n
        self.gamma = gamma
        self.epsilon, self.epsilon_end, self.epsilon_decay = epsilon_start, epsilon_end, epsilon_decay

        # Meta-learning settings
        self.meta_batch_size = meta_batch_size
        self.num_meta_updates = num_meta_updates

        # Three networks, created in this order: meta (fast adaptation),
        # policy (regular learning) and target.
        self.meta_net, self.policy_net, self.target_net = (
            MetaMaskedDQN(state_dim, self.action_dim) for _ in range(3)
        )
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.optimizer = torch.optim.Adam(self.policy_net.parameters(), lr=lr)

        # Replay / bookkeeping
        self.memory = deque(maxlen=10000)
        self.batch_size = 1
        self.steps_done = 0
        self.update_target_every = 100
        
    def get_state_representation(self, state):
        """将离散状态转换为5维one-hot编码"""
        state_index = ['va','sib','pp','po','Tau'].index(state)
        one_hot = np.zeros(5)
        one_hot[state_index] = 1
        return torch.FloatTensor(one_hot)
    
    def remember(self, state, action, reward, next_state, done, mask):
        self.memory.append((
            self.get_state_representation(state),
            action,
            reward,
            self.get_state_representation(next_state),
            done,
            torch.BoolTensor(mask)
        ))
    
    def act(self, state, mask, training=True, fast_adapt=False):
        if training and random.random() < self.epsilon:
            valid_actions = [i for i, m in enumerate(mask) if m]
            return random.choice(valid_actions)
        else:
            with torch.no_grad():
                state_tensor = self.get_state_representation(state)
                mask_tensor = torch.BoolTensor(mask)
                
                if fast_adapt:
                    # 使用快速适应的权重进行决策
                    q_values = self.meta_net(state_tensor.unsqueeze(0), mask_tensor.unsqueeze(0), 
                                           params=self.meta_net.fast_weights)
                else:
                    # 使用常规策略网络进行决策
                    q_values = self.policy_net(state_tensor.unsqueeze(0), mask_tensor.unsqueeze(0))
                
                return q_values.argmax().item()
    
    def meta_update(self):
        """执行元更新（外循环）"""
        if len(self.memory) < self.meta_batch_size:
            return
        
        # 1. 采样一批任务/episodes
        episodes = random.sample(self.memory, self.meta_batch_size)
        
        # 2. 初始化元梯度
        meta_loss = 0
        
        for episode in episodes:
            # 3. 克隆网络状态（内循环开始）
            self.meta_net.load_state_dict(self.policy_net.state_dict())
            self.meta_net.clone_state()
            
            # 4. 内循环适应（在单个episode上快速适应）
            states, actions, rewards, next_states, dones, masks = episode
            
            for _ in range(self.num_meta_updates):
                # 计算当前episode的损失
                current_q = self.meta_net(states.unsqueeze(0), masks.unsqueeze(0), 
                                        params=self.meta_net.fast_weights)
                current_q = current_q.gather(1, torch.LongTensor([actions]).unsqueeze(1))
                
                with torch.no_grad():
                    next_q = self.target_net(next_states.unsqueeze(0), masks.unsqueeze(0))
                    next_q = next_q.max(1)[0]
                    target_q = rewards + (1 - dones) * self.gamma * next_q
                
                loss = F.mse_loss(current_q.squeeze(), target_q)
                
                # 在内循环中执行一步梯度下降
                self.meta_net.adapt(loss)
            
            # 5. 计算适应后的损失，用于元梯度
            adapted_q = self.meta_net(states.unsqueeze(0), masks.unsqueeze(0), 
                                    params=self.meta_net.fast_weights)
            adapted_q = adapted_q.gather(1, torch.LongTensor([actions]).unsqueeze(1))
            meta_loss += F.mse_loss(adapted_q.squeeze(), target_q)
        
        # 6. 外循环更新（元更新）
        self.optimizer.zero_grad()
        meta_loss.backward()
        self.optimizer.step()
        
        return meta_loss.item()
    
    def replay(self):
        """常规经验回放"""
        if len(self.memory) < self.batch_size:
            return
        
        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones, masks = zip(*batch)
        
        states = torch.stack(states)
        actions = torch.LongTensor(actions)
        rewards = torch.FloatTensor(rewards)
        next_states = torch.stack(next_states)
        dones = torch.FloatTensor(dones)
        masks = torch.stack(masks)
        
        current_q = self.policy_net(states, masks).gather(1, actions.unsqueeze(1))
        
        with torch.no_grad():
            next_q = self.target_net(next_states, masks).max(1)[0]
            target_q = rewards + (1 - dones) * self.gamma * next_q
        
        loss = F.mse_loss(current_q.squeeze(), target_q)
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        
        if self.steps_done % self.update_target_every == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())
        
        self.steps_done += 1
        self.update_epsilon()
        
        return loss.item()
    
    def update_epsilon(self):
        self.epsilon = max(self.epsilon_end, 
                          self.epsilon * self.epsilon_decay)

In [6]:
# Create the environment instance used by the MetaMaskedDQN experiment.
# The module is imported both as a whole and by name; only TaskEnv_actionD is
# used in this cell.
import dyna_env_acdy
from dyna_env_acdy import TaskEnv_actionD
env = TaskEnv_actionD()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  frequencies[label][action] = ast.literal_eval(frequencies[label][action])


In [22]:
def train_with_drifts(env, agent, episodes=1000, drift_episodes=(300, 600)):
    """Train ``agent`` on ``env`` while applying action-space drifts at given episodes.

    Args:
        env: Environment providing ``reset``, ``step``, ``get_action_mask``,
            ``set_flag``, ``drift``, ``get_action_info``, ``get_valid_actions``
            and ``motions``.
        agent: Agent providing ``act``, ``remember``, ``replay``,
            ``meta_update``, ``epsilon``, ``policy_net`` and ``target_net``.
        episodes: Number of episodes to run.
        drift_episodes: Ordered sequence (list or tuple) of episode indices at
            which a drift is applied. The first entry disables two named
            actions; every later entry adds two actions. The default is an
            immutable tuple instead of a shared mutable list.

    Returns:
        Dict with ``'rewards'`` and ``'steps'`` (one entry per episode),
        ``'meta_losses'`` (mean meta loss of each episode that produced one)
        and ``'action_info'`` (``(episode, info)`` for each applied drift).
    """
    rewards_history = []
    meta_loss_history = []
    action_info_history = []
    steps_history = []

    for episode in range(episodes):
        state = env.reset()
        done = False
        total_reward = 0
        step_count = 0
        episode_meta_losses = []
        is_drift_episode = episode in drift_episodes

        # Environment drift handling (same order of calls as the original)
        if is_drift_episode:
            drift_idx = drift_episodes.index(episode)

            env.set_flag()
            if drift_idx == 0:
                # First drift: disable two specific actions
                env.drift(add_actions=-2,
                          drift_type='sudden',
                          disable_actions=['client afgeleid', 'naar andere kamer/ruimte gestuurd'])
            else:
                # Later drifts: add two new actions
                env.drift(add_actions=2,
                          drift_type='sudden')

            print(f"\n=== Drift {drift_idx+1} Applied at Episode {episode} ===")
            action_info = env.get_action_info()
            print("Action info after drift:")
            print(f"Total motions: {len(env.motions)}")
            print(f"Valid actions: {action_info['valid_actions']}")
            print(f"Disabled actions: {action_info['disabled_action_names']}")

            # Re-sync the target network to speed up adaptation to the new environment
            agent.target_net.load_state_dict(agent.policy_net.state_dict())

            # Record the change of the action space
            action_info_history.append((episode, action_info))

        while not done:
            step_count += 1
            # Current action mask
            mask = env.get_action_mask()

            # During a drift episode, act with the meta-learned fast weights
            action = agent.act(state, mask, fast_adapt=is_drift_episode)

            # Step the environment; an invalid action aborts the episode
            try:
                next_state, reward, done, _ = env.step(action)
            except ValueError as e:
                print(f"Invalid action error at episode {episode}: {e}")
                print(f"Attempted action: {action}, Valid actions: {env.get_valid_actions()}")
                break

            # Store the experience
            agent.remember(state, action, reward, next_state, done, mask)

            # Regular experience replay (its loss is not tracked here)
            agent.replay()

            # Meta-learning: on every step of drift episodes and of every 10th episode
            if is_drift_episode or (episode % 10 == 0):
                meta_loss = agent.meta_update()
                if meta_loss is not None:
                    episode_meta_losses.append(meta_loss)

            state = next_state
            total_reward += reward

        # Record per-episode statistics
        rewards_history.append(total_reward)
        steps_history.append(step_count)
        if episode_meta_losses:
            meta_loss_history.append(np.mean(episode_meta_losses))

        # Progress output
        if episode % 50 == 0 or is_drift_episode:
            # Mean over (up to) the last 50 episodes; slicing already handles
            # histories shorter than 50.
            avg_reward = np.mean(rewards_history[-50:])
            avg_steps = np.mean(steps_history[-50:])
            current_mask = env.get_action_mask()

            print(f"Reward: {total_reward:6.2f} (Avg: {avg_reward:6.2f}) | "
                  f"Steps: {step_count:3d} (Avg: {avg_steps:4.1f}) | "
                  f"Eps: {agent.epsilon:.3f} | "
                  f"Actions: {sum(current_mask)}/{len(current_mask)} | "
                  f"Meta Loss: {np.mean(episode_meta_losses[-5:]) if episode_meta_losses else 0:.4f}")

    return {
        'rewards': rewards_history,
        'meta_losses': meta_loss_history,
        'action_info': action_info_history,
        'steps': steps_history
    }

# Enhanced visualisation function
def plot_meta_training_results(results, window=20):
   # print(len(results['meta_losses']))
    df = pd.DataFrame({
        'Episode': range(len(results['rewards'])),
        'Reward': results['rewards'],
        'Smoothed Reward': smooth_rewards(results['rewards'], window),
        'Meta Loss': results['meta_losses'] + [np.nan] * (len(results['rewards']) - len(results['meta_losses'])),
        'Steps': results['steps']
    })

       # print(len(results['meta_losses']))
    dfmeta = pd.DataFrame({
        'metaEpisode': range(int(len(results['rewards'])/10-1)),
        'Meta Loss': results['meta_losses']
    })
    
    # 标注漂移发生点
    drift_marks = [ep for ep, _ in results['action_info']]
    
    # 奖励曲线
    fig1 = px.line(df, x='Episode', y=['Reward', 'Smoothed Reward'],
                  title='MetaMaskedDQN Training Performance')
    for d in drift_marks:
        fig1.add_vline(x=d, line_dash="dash", line_color="red")
    fig1.show()
    
    # 元学习损失曲线
    fig2 = px.line(dfmeta, x='metaEpisode', y='Meta Loss',
                  title='Meta-Learning Loss (Lower is Better)')
    for d in drift_marks:
        fig2.add_vline(x=d/10, line_dash="dash", line_color="red")
    fig2.show()
    
    # 步数曲线
    fig3 = px.line(df, x='Episode', y='Steps',
                  title='Episode Length')
    for d in drift_marks:
        fig3.add_vline(x=d, line_dash="dash", line_color="red")
    fig3.show()

# Usage example
if __name__ == "__main__":
    env = TaskEnv_actionD()  # must implement get_action_mask() and the other methods the training loop calls
    agent = MetaMaskedDQNAgent(env)
    
    # Train for 2100 episodes; episodes 700 and 1400 are passed as the drift episodes
    results = train_with_drifts(env, agent, episodes=2100, drift_episodes=[700, 1400])
    
    # Visualise the results
    plot_meta_training_results(results)


ChainedAssignmentError: behaviour will change in pandas 3.0!
You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Using a target size (torch.Size([1])) that is different to the input size (torch.Size([])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.


Using a targ

{'Tau': 0.5496974935177182, 'va': 0.25842696629213485, 'sib': 0.020743301642178046, 'pp': 0.1495246326707001, 'po': 0.021607605877268798}
{'pp': 0.21645021645021645, 'va': 0.16883116883116883, 'Tau': 0.5497835497835498, 'po': 0.03463203463203463, 'sib': 0.030303030303030304}
Reward:  -2.00 (Avg:  -2.00) | Steps:   2 (Avg:  2.0) | Eps: 0.990 | Actions: 7/20 | Meta Loss: 0.0000
{'Tau': 0.4915254237288136, 'va': 0.07203389830508475, 'pp': 0.19915254237288135, 'po': 0.19915254237288135, 'sib': 0.038135593220338986}
{'Tau': 0.42410714285714285, 'pp': 0.40625, 'va': 0.09598214285714286, 'sib': 0.05133928571428571, 'po': 0.022321428571428572}
{'pp': 0.1927710843373494, 'sib': 0.1566265060240964, 'va': 0.13253012048192772, 'Tau': 0.4819277108433735, 'po': 0.03614457831325301}
{'pp': 0.36347358997314233, 'Tau': 0.5004476275738585, 'po': 0.021486123545210387, 'va': 0.09489704565801253, 'sib': 0.019695613249776187}
{'Tau': 0.4945054945054945, 'pp': 0.1978021978021978, 'va': 0.13186813186813187, '


Using a target size (torch.Size([1])) that is different to the input size (torch.Size([])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.


Using a target size (torch.Size([1])) that is different to the input size (torch.Size([])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.


Using a target size (torch.Size([1])) that is different to the input size (torch.Size([])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.



Reward:  -9.00 (Avg:  -0.92) | Steps:   3 (Avg:  1.8) | Eps: 0.010 | Actions: 7/20 | Meta Loss: 598.3884
{'va': 0.0, 'sib': 0.018807116322687605, 'pp': 0.4100268801941911, 'po': 0.0, 'Tau': 0.5711660034831214}
{'va': 0.0, 'sib': 0.013346201755036091, 'pp': 0.4053198048712896, 'po': 0.2903642681820308, 'Tau': 0.29096972519164355}
{'va': 0.0, 'sib': 0.018807116322687605, 'pp': 0.4100268801941911, 'po': 0.0, 'Tau': 0.5711660034831214}
{'va': 0.0, 'sib': 0.018807116322687605, 'pp': 0.4100268801941911, 'po': 0.0, 'Tau': 0.5711660034831214}
{'va': 0.22783917374835463, 'sib': 0.21696604827773336, 'pp': 0.2273650803535196, 'po': 0.010450529100728561, 'Tau': 0.31737916851966397}
{'va': 0.22783917374835463, 'sib': 0.21696604827773336, 'pp': 0.2273650803535196, 'po': 0.010450529100728561, 'Tau': 0.31737916851966397}
{'va': 0.0, 'sib': 0.018807116322687605, 'pp': 0.4100268801941911, 'po': 0.0, 'Tau': 0.5711660034831214}
{'va': 0.22783917374835463, 'sib': 0.21696604827773336, 'pp': 0.22736508035351

In [None]:
# Scratch check: `/` is true division, so the result is a float (3.0) even for an even length.
listq = [1,2,3,4,5,6]
len(listq)/2


3.0