In [1]:
import torch
import torch.nn as nn
import numpy as np
from collections import deque
import random
from tqdm import tqdm
from data import DataProcessor
from env import TradingEnv
from Rl import model, DeepQNetwork


ModuleNotFoundError: No module named 'torch'

In [None]:
class ReplayBuffer:
    """Bounded FIFO store of (state, action, reward, next_state, done) transitions.

    Once ``capacity`` transitions are held, pushing a new one evicts the
    oldest. Sampling is uniform without replacement.
    """

    def __init__(self, capacity=10000):
        # deque(maxlen=...) silently discards from the left when full.
        self.buffer = deque(maxlen=capacity)

    def push(self, state, action, reward, next_state, done):
        """Record a single transition at the newest end of the buffer."""
        transition = (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        """Return ``batch_size`` distinct transitions as five NumPy arrays.

        The arrays are ordered (states, actions, rewards, next_states, dones).
        ``random.sample`` raises ``ValueError`` if ``batch_size`` exceeds the
        number of stored transitions.
        """
        picked = random.sample(self.buffer, batch_size)
        # Transpose the list of transitions into one column per field.
        states, actions, rewards, next_states, dones = zip(*picked)
        columns = (states, actions, rewards, next_states, dones)
        return tuple(map(np.array, columns))

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)


In [None]:
device = 'cuda:1'


def _resolve_device(policy, fallback):
    """Return the device that holds `policy`'s parameters, else `fallback`."""
    try:
        return next(policy.parameters()).device
    except (AttributeError, StopIteration):
        # Policy exposes no parameters; use the notebook-level setting.
        return torch.device(fallback)


def _actor_critic_step(agent, batch, gamma, target_device):
    """Run one optimiser step of the combined actor-critic loss on `batch`."""
    states, actions, rewards, next_states, dones = batch

    # Flatten the per-sample scalars to shape (B,) so that nothing below
    # broadcasts (B, 1) against (B,) or collapses to a 0-d tensor when B == 1.
    states = torch.as_tensor(states, dtype=torch.float32, device=target_device)
    actions = torch.as_tensor(actions, dtype=torch.long, device=target_device).reshape(-1)
    rewards = torch.as_tensor(rewards, dtype=torch.float32, device=target_device).reshape(-1)
    next_states = torch.as_tensor(next_states, dtype=torch.float32, device=target_device)
    dones = torch.as_tensor(dones, dtype=torch.float32, device=target_device).reshape(-1)

    # Action probabilities and state values for the current states.
    current_probs, current_values = agent.policy(states)
    current_values = current_values.reshape(-1)

    # Bootstrap values for the next states; no gradient flows through the target.
    with torch.no_grad():
        _, next_values = agent.policy(next_states)
        next_values = next_values.reshape(-1)

    # TD target: terminal transitions (done == 1) contribute the reward only.
    target_values = rewards + gamma * next_values * (1 - dones)

    # Advantage estimate used to weight the policy gradient.
    advantages = target_values - current_values

    # Actor loss. The clamp keeps log() finite if a probability underflows to 0.
    batch_index = torch.arange(actions.shape[0], device=actions.device)
    selected_probs = current_probs[batch_index, actions]
    log_probs = torch.log(torch.clamp(selected_probs, min=1e-8))
    actor_loss = -(log_probs * advantages.detach()).mean()

    # Critic loss: regress the value estimate onto the TD target.
    critic_loss = agent.mse_loss(current_values, target_values)

    total_loss = actor_loss + 0.5 * critic_loss

    agent.optimizer.zero_grad()
    total_loss.backward()
    agent.optimizer.step()


def train_agent(env, agent, num_episodes=1000, batch_size=64, gamma=0.99,
                update_freq=10, min_buffer_size=1000):
    """
    Train the agent with experience replay and an actor-critic loss.

    Args:
        env: trading environment; must provide ``reset()`` and a ``step(action)``
            returning ``(next_state, reward, done, info)``.
        agent: DeepQNetwork instance; must provide ``choose_action``, ``policy``
            (returning ``(probs, values)``), ``mse_loss`` and ``optimizer``.
        num_episodes: number of training episodes.
        batch_size: number of transitions sampled per optimisation step.
        gamma: discount factor.
        update_freq: the policy weights are checkpointed to ``policy.pth`` at
            the end of every ``update_freq``-th episode in which at least one
            optimisation step ran.
        min_buffer_size: training starts once the replay buffer holds more
            than this many transitions.

    Returns:
        List with the accumulated reward of each episode.
    """
    # Size the buffer so the "len > min_buffer_size" gate can actually open,
    # even when the caller asks for a warm-up larger than the default capacity.
    replay_buffer = ReplayBuffer(capacity=max(10000, min_buffer_size + 1, batch_size))
    episode_rewards = []

    # Batches must live on the same device as the network that consumes them.
    train_device = _resolve_device(agent.policy, device)

    for episode in tqdm(range(num_episodes)):
        state = env.reset()
        episode_reward = 0
        done = False
        updated_this_episode = False

        while not done:
            # Choose an action.
            action = agent.choose_action(state)

            # Apply it to the environment.
            next_state, reward, done, _ = env.step(action)

            # Store the transition.
            replay_buffer.push(state, action, reward, next_state, done)

            state = next_state
            episode_reward += reward

            # Start learning once the buffer is large enough.
            if len(replay_buffer) > min_buffer_size and len(replay_buffer) >= batch_size:
                batch = replay_buffer.sample(batch_size)
                _actor_critic_step(agent, batch, gamma, train_device)
                updated_this_episode = True

        # Checkpoint once per qualifying episode instead of after every single
        # optimisation step, which rewrote the file on every environment step.
        if updated_this_episode and episode % update_freq == 0:
            torch.save(agent.policy.state_dict(), 'policy.pth')

        episode_rewards.append(episode_reward)

        # Report the running average over the last (up to) 10 episodes.
        if episode % 10 == 0:
            avg_reward = np.mean(episode_rewards[-10:])
            print(f'Episode {episode}, Average Reward: {avg_reward:.2f}')

    return episode_rewards



In [None]:
# Seed every RNG used by the imports above before anything stochastic is
# built, so a Restart & Run All repeats the same run as far as those RNGs go.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = TradingEnv()  # your trading environment
agent = DeepQNetwork(
    n_actions=3,  # [-1, 0, 1]
    n_features=54,  # 4 market features + 50 factors
    learning_rate=0.001
)

# Start training
rewards = train_agent(
    env=env,
    agent=agent,
    num_episodes=10,
    batch_size=64,
    gamma=0.99,
    update_freq=10
)

# Save the final model
torch.save(agent.policy.state_dict(), 'policy_final.pth')