In [1]:

from IPython import get_ipython
import random
import matplotlib.pyplot as plt
from IPython import display
from tqdm.notebook import tqdm
from typing import Tuple, List
import itertools as it
import numpy as np
import pandas as pd
import plotly.express as px
import statistics as s

In [2]:
# NOTE(review): project-local import sits outside the import cell at the top of the notebook.
from dynamic_env import TaskEnv_drift
# Module-level environment instance; drift_control() and run_qlearner() read this name directly.
env = TaskEnv_drift()

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  frequencies[label][action] = ast.literal_eval(frequencies[label][action]) #判断需要计算的内容是不是合法的Python类型，如果是则执行，否则就报错


In [6]:
# Manually invoke a drift on the shared `env` (5 extra states / 5 extra actions requested),
# then display the resulting action space as the cell output.
# NOTE(review): this mutates the same `env` object that later cells train on.
env.set_flag()
env.drift(add_states=5,add_actions=5)
env.action_space


drift happen
['a0', 'a1', 'a2', 'a3', 'a4']


Discrete(17)

In [7]:
class AdaptiveQLearningAgent:
    """Tabular Q-learning agent whose tables grow lazily.

    Q-values and visit counts are stored in dictionaries keyed by
    ``(state, action_index)``, so state-action pairs that appear after the
    agent was created are simply added on first access.

    Attributes:
        epsilon: current exploration probability.
        alpha: current learning rate (a single value shared by all pairs).
        gamma: discount factor.
        q_table: ``{(state, action_index): q_value}``.
        visit_counts: ``{(state, action_index): number of learn() updates}``.
        known_states: set of states seen so far (seeded from ``env.states``).
        known_actions: set of action labels seen so far (seeded from ``env.motions``).
        actions: the ``env.motions`` object captured at construction time;
            its length defines the range of selectable action indices.
    """

    def __init__(self,
                 env: "TaskEnv_drift",
                 exploration_rate: float = 0.1,
                 learning_rate: float = 0.2,
                 discount_factor: float = 1) -> None:
        """Create an agent for ``env``.

        Args:
            env: environment exposing ``states`` and ``motions``.
            exploration_rate: initial epsilon.
            learning_rate: initial alpha.
            discount_factor: gamma.
        """
        self.epsilon = exploration_rate
        self.alpha = learning_rate
        self.gamma = discount_factor

        # Dictionaries instead of a fixed-size numpy array so the tables can grow.
        self.q_table = {}
        self.visit_counts = {}  # number of updates per state-action pair

        # States and action labels known so far.
        self.known_states = set(env.states)
        self.known_actions = set(env.motions)

        # NOTE(review): this keeps a reference to the object env.motions points to
        # right now; presumably drift extends that list in place -- confirm,
        # otherwise the agent never sees actions added later.
        self.actions = env.motions

    def _get_q_value(self, state, action):
        """Return the Q-value for ``(state, action)``, creating the entry (0.0) if missing."""
        key = (state, action)
        if key not in self.q_table:
            self.q_table[key] = 0.0  # initial-value strategy can be tuned here
            # setdefault, not assignment: learn() increments the count *before*
            # reading the Q-value, and that visit must not be wiped out.
            self.visit_counts.setdefault(key, 0)
        return self.q_table[key]

    def _set_q_value(self, state, action, value):
        """Store ``value`` as the Q-value for ``(state, action)``."""
        key = (state, action)
        self.q_table[key] = value

    def detect_new_state_action(self, state, available_actions):
        """Register a previously unseen state and/or actions.

        Args:
            state: state to check; the terminal marker ``'Tau'`` is never registered.
            available_actions: iterable of action labels to check against
                ``known_actions``. ``None`` falls back to ``self.actions``.

        Returns:
            True if anything new was registered (in which case the exploration
            rate is also raised via ``_adjust_exploration``), else False.
        """
        candidates = self.actions if available_actions is None else available_actions
        is_expanded = False

        # New state?
        if state not in self.known_states and state != 'Tau':
            self.known_states.add(state)
            is_expanded = True

        # New actions?
        unseen = [a for a in candidates if a not in self.known_actions]
        if unseen:
            self.known_actions.update(unseen)
            is_expanded = True

        # Any expansion temporarily boosts exploration.
        if is_expanded:
            self._adjust_exploration()

        return is_expanded

    def _adjust_exploration(self):
        """Raise epsilon by 50%, capped at 0.5, after discovering something new."""
        self.epsilon = min(0.5, self.epsilon * 1.5)

    def select_action(self, state, use_greedy_strategy: bool = False) -> int:
        """Choose an action index for ``state``.

        With probability ``epsilon`` (unless ``use_greedy_strategy`` is set) a
        least-visited action is picked uniformly at random; otherwise one of
        the actions with the highest Q-value is picked uniformly at random.

        Returns:
            The chosen action index as a plain Python ``int``.
        """
        self.detect_new_state_action(state, self.actions)
        action_indices = range(len(self.actions))

        if not use_greedy_strategy and random.random() < self.epsilon:
            # Explore: prefer the actions tried least often from this state.
            visits = {a: self.visit_counts.get((state, a), 0) for a in action_indices}
            fewest = min(visits.values())
            least_visited = [a for a, v in visits.items() if v == fewest]
            return int(np.random.choice(least_visited))

        # Exploit: break ties between equally good actions at random.
        q_values = [self._get_q_value(state, a) for a in action_indices]
        best = max(q_values)
        best_actions = [i for i, q in enumerate(q_values) if q == best]
        return int(np.random.choice(best_actions))

    def learn(self, state, action, next_state, reward, done):
        """Apply one Q-learning update for the transition.

        Args:
            state: state the action was taken in.
            action: action index that was taken.
            next_state: resulting state; ``'Tau'`` is treated as terminal
                (no bootstrapping from it).
            reward: observed reward.
            done: accepted for interface compatibility; terminal handling is
                keyed on ``next_state == 'Tau'`` and this flag is not read.
        """
        # Count the visit.
        key = (state, action)
        self.visit_counts[key] = self.visit_counts.get(key, 0) + 1

        # Best Q-value reachable from the next state (0 for the terminal state).
        next_max_val = 0
        if next_state != 'Tau':
            self.detect_new_state_action(next_state, self.actions)
            next_max_val = max(self._get_q_value(next_state, a)
                               for a in range(len(self.actions)))

        # Temporal-difference update.
        current_q = self._get_q_value(state, action)
        new_q = current_q + self.alpha * (reward + self.gamma * next_max_val - current_q)
        self._set_q_value(state, action, new_q)

        # Adapt alpha / epsilon for subsequent steps.
        self._adjust_learning_parameters(state, action)

    def _adjust_learning_parameters(self, state, action):
        """Adapt ``alpha`` and ``epsilon`` from the visit statistics.

        NOTE(review): ``alpha`` is one shared value, so the rate derived from
        this pair's visit count is what the *next* update uses, whichever pair
        that turns out to be.
        """
        visits = self.visit_counts.get((state, action), 0)

        # Learning rate shrinks as the pair is visited more often (floor 0.01).
        self.alpha = max(0.01, 1.0 / (1 + visits * 0.1))

        # Once every tracked pair has more than 10 visits, decay exploration.
        # default=0 keeps a direct call on an empty table from raising.
        if min(self.visit_counts.values(), default=0) > 10:
            self.epsilon = max(0.01, self.epsilon * 0.995)

In [8]:
def drift_control(add_states=2,add_actions=2, type_drif=None):
    """Apply one drift to the module-level ``env``.

    Calls ``env.set_flag()`` followed by ``env.drift(add_states, add_actions)``.

    Args:
        add_states: forwarded to ``env.drift`` as its first argument.
        add_actions: forwarded to ``env.drift`` as its second argument.
        type_drif: placeholder for selecting a drift type; currently ignored.

    Returns:
        None.
    """
    # Disabled hook kept from the original: qlearner.change_qtable()
    env.set_flag()
    env.drift(add_states, add_actions)

In [15]:
# Training / evaluation driver
def run_qlearner(nrofepisodes, gamma, epsilon = 0.1, alpha=0.2, train = True, agent = None, inform=False,drift_ep=0):
    """Run the agent on the module-level ``env`` for a number of episodes.

    Args:
        nrofepisodes: number of episodes to run.
        gamma: discount factor for a newly created agent.
        epsilon: initial exploration rate for a newly created agent.
        alpha: initial learning rate for a newly created agent.
        train: if true, act with exploration and call ``learn`` after every
            step; otherwise act greedily and do not update the agent.
        agent: existing agent to reuse; a new ``AdaptiveQLearningAgent`` is
            built when this is ``None``.
        inform: if true, ``env.step`` is called with the flag and every
            non-empty ``info`` it returns is tallied.
        drift_ep: episode index at whose start ``drift_control()`` is called
            once; ``0`` disables drift (so drift at episode 0 is not possible).

    Returns:
        ``(agent, rewards, information)`` where ``rewards`` holds the summed
        reward of each episode and ``information`` maps the concatenated
        ``info`` contents to how often they occurred.
    """
    if agent is None:
        Q_learner = AdaptiveQLearningAgent(env, discount_factor=gamma, exploration_rate=epsilon, learning_rate=alpha)
    else:
        Q_learner = agent
    state = env.reset()

    information = {}
    rewards_ = []

    for episode in range(nrofepisodes):
        terminated = False
        sumRreward = 0

        # Drift happens at most once, at the start of the requested episode.
        if episode == drift_ep and drift_ep != 0:
            print("whether drift happen at",episode,"episode")
            drift_control()

        while not terminated:
            # Greedy action selection when evaluating, epsilon-greedy when training.
            action = Q_learner.select_action(state, use_greedy_strategy=not train)

            if inform:
                observation, reward, terminated, info = env.step(action, inform)
                if info != []:
                    key = ''.join(str(x) for x in info)
                    information[key] = information.get(key, 0) + 1
            else:
                observation, reward, terminated, info = env.step(action)

            sumRreward += reward

            if train:
                Q_learner.learn(state, action, observation, reward, terminated)

            state = observation

        # Reset for the next episode (the environment is also left reset after the last one).
        state = env.reset()
        rewards_.append(sumRreward)
    return Q_learner, rewards_, information

In [24]:

# Train a fresh agent for 10 episodes (gamma=0.2); drift_control() fires once at the start of episode index 2.
agent1_learner, agent1_reward, agent1_info = run_qlearner(10, 0.2, epsilon=0.1, alpha=0.2,drift_ep=2)

whether drift happen at 2 episode
drift happen
['a0', 'a1']


In [25]:
# Display the shared env's state list after the drift run together with the per-episode reward sums.
env.states, agent1_reward

(['va', 'sib', 'pp', 'po', 'Tau', 's0', 's1'],
 [-1.0, 0.0, -19.0, -8.0, -6.0, -13.0, 0.0, -8.0, -9.0, 1.0])

In [26]:
# NOTE(review): run_qlearner() reads the module-level `env`, not `env2`, so this
# "no drift" run (drift_ep=0) executes on whatever state `env` is in -- env2 is never stepped.
env2 = TaskEnv_drift()
agent2_learner, agent2_reward, agent2_info = run_qlearner(10, 0.2, epsilon=0.1, alpha=0.2,drift_ep=0)

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  frequencies[label][action] = ast.literal_eval(frequencies[label][action]) #判断需要计算的内容是不是合法的Python类型，如果是则执行，否则就报错


In [22]:
# NOTE(review): agent2_reward was produced by run_qlearner() on the module-level `env`;
# env2.states only shows the untouched second environment.
env2.states, agent2_reward

(['va', 'sib', 'pp', 'po', 'Tau'],
 [0.0, 1.0, -4.0, 0.0, -3.0, 0.0, -12.0, -22.0, -9.0, -2.0])

In [14]:
# NOTE(review): execution count (14) is lower than the cells above, so the shown output
# presumably predates the drift runs -- re-run top to bottom to confirm.
env.states

['va', 'sib', 'pp', 'po', 'Tau']