In [None]:
import gym
import datetime

from QLearning.agent import QLearning
from common.plot import plot_rewards
from common.utils import save_results,make_dir
# Timestamp (YYYYmmdd-HHMMSS) captured once when this cell runs; the config
# class below embeds it in the result/model output paths.
curr_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

In [None]:
class QlearningConfig:
    '''Training-related parameters and output locations for the Q-learning run.

    Attributes are plain instance fields set in ``__init__``; the output paths
    embed the module-level ``curr_time`` timestamp.
    '''
    def __init__(self):
        self.algo = 'Qlearning'
        # Environment id. The notebook's plotting call reads ``cfg.env``, so this
        # attribute must exist (it was previously commented out).
        # Action encoding noted by the original author: 0 up, 1 right, 2 down, 3 left.
        self.env = 'CliffWalking-v0'
        # path to save results
        # NOTE(review): this is rooted at the filesystem root ("/outputs/...");
        # confirm whether a path relative to the notebook was intended.
        self.result_path = "/outputs/" + curr_time + '/results/'
        # path to save models
        self.model_path = "/outputs/" + curr_time + '/models/'

        self.train_eps = 200        # number of training episodes
        self.eval_eps = 30          # number of evaluation episodes
        self.gamma = 0.9            # reward discount factor
        self.epsilon_start = 0.95   # initial epsilon of the e-greedy policy
        self.epsilon_end = 0.01     # final epsilon of the e-greedy policy
        self.epsilon_decay = 200    # epsilon decay parameter (schedule is defined by the agent)
        self.lr = 0.1               # learning rate

In [None]:
# `seed` controls the environment's randomness.
def env_agent_config(cfg, seed=1):
    """Create the gym environment and the Q-learning agent for one run.

    Args:
        cfg: configuration object. ``cfg.env`` (when present) names the gym
            environment to create; the whole object is forwarded to ``QLearning``.
        seed: seed applied to the environment through ``env.seed``.

    Returns:
        A ``(env, agent)`` tuple.
    """
    # Fall back to the environment id mentioned in the config comments so this
    # function also works with a config object that has no ``env`` attribute.
    env_id = getattr(cfg, "env", "CliffWalking-v0")
    env = gym.make(env_id)
    # NOTE(review): the original cell hinted at an optional
    # `CliffWalkingWapper(env)` wrapper; it is not imported here, so it is not applied.
    # Classic gym seeding API, matching the reset()/step() style used by `train`.
    env.seed(seed)
    # Number of rows of the Q table
    state_dim = env.observation_space.n
    # Number of columns of the Q table
    action_dim = env.action_space.n
    agent = QLearning(state_dim, action_dim, cfg)
    return env,agent

In [None]:
def train(cfg, env, agent):
    """Run ``cfg.train_eps`` training episodes of ``agent`` on ``env``.

    Each step asks the agent for an action, applies it to the environment,
    and feeds the resulting transition to ``agent.update``.

    Args:
        cfg: object exposing ``train_eps`` (number of episodes to run).
        env: environment with ``reset()`` returning a state and ``step(action)``
            returning ``(next_state, reward, done, info)``.
        agent: object exposing ``choose_action(state)`` and
            ``update(state, action, reward, next_state, done)``.

    Returns:
        ``(rewards, ma_rewards)``: the per-episode total rewards and their
        exponential moving average (weight 0.9 on the previous average).
    """
    episode_returns = []
    smoothed_returns = []  # moving-average version of episode_returns

    for episode_no in range(1, cfg.train_eps + 1):
        # Start a fresh episode with a zeroed return accumulator.
        episode_return = 0
        state = env.reset()

        finished = False
        while not finished:
            # Pick an action according to the agent's policy.
            action = agent.choose_action(state)
            # Apply it; the environment yields the next state and the reward.
            next_state, reward, finished, _ = env.step(action)
            # Q-learning update on the observed transition.
            agent.update(state, action, reward, next_state, finished)

            # Advance to the next state and accumulate the reward.
            state = next_state
            episode_return += reward

        episode_returns.append(episode_return)

        # Smooth the reward curve; the first episode seeds the average.
        if not smoothed_returns:
            smoothed = episode_return
        else:
            smoothed = smoothed_returns[-1] * 0.9 + episode_return * 0.1
        smoothed_returns.append(smoothed)

        # Progress report every 10 episodes.
        if episode_no % 10 == 0:
            print("Episode:{}/{}: reward:{:.1f}".format(episode_no, cfg.train_eps, episode_return))

    return episode_returns, smoothed_returns

In [None]:
# Build the configuration, then the environment/agent pair (seed fixed to 1).
cfg = QlearningConfig()
env,agent = env_agent_config(cfg, seed=1)
# Train; yields the per-episode rewards and their moving average.
rewards,ma_rewards = train(cfg, env, agent)
# presumably creates the result/model directories before anything is written — confirm in common.utils
make_dir(cfg.result_path,cfg.model_path)
# presumably persists the trained agent under model_path — confirm in QLearning.save
agent.save(path=cfg.model_path)
save_results(rewards,ma_rewards,tag='train',path=cfg.result_path)
# NOTE(review): this reads cfg.env, so the config object must define an `env` attribute.
plot_rewards(rewards,ma_rewards,tag="train",env=cfg.env,algo = cfg.algo,path=cfg.result_path)