In [22]:
import random
import numpy as np
import gymnasium as gym
import torch
import torch.nn as nn
import torch.optim as optim
from collections import deque
import yfinance as yf
import pandas as pd

# --- Load Historical Data ---
def load_stock_data(tickers, start="2020-01-01", end="2024-12-31"):
    data = yf.download(tickers, start=start, end=end)["Close"]
    #returns = data.pct_change().dropna()
    return data

# --- Custom Gymnasium Environment ---
class StockEnv(gym.Env):
    def __init__(self, stocks, actions, returns_df, window_size=45):
        super(StockEnv, self).__init__()
        self.stocks = stocks
        self.actions = actions
        self.returns_df = returns_df
        self.dates = returns_df.index.tolist()
        self.window_size = window_size
        self.action_space = gym.spaces.Discrete(len(stocks) * len(actions))
        self.observation_space = gym.spaces.Box(
            low=-1, high=1, shape=(window_size, len(stocks)), dtype=np.float32
        )
        self.reset()

    def reset(self, seed=None, options=None):
        super().reset(seed=seed)
        self.start_idx = random.randint(0, len(self.returns_df) - 2 * self.window_size)
        self.current_step = self.start_idx + self.window_size
        self.episode_reward = 0
        self.done = False
        return self._get_observation(), {}

    def _get_observation(self):
        window = self.returns_df.iloc[self.current_step - self.window_size:self.current_step].values
        return window.astype(np.float32)

    def step(self, action_idx):
        stock_idx = action_idx // len(self.actions)
        print("action_idx", action_idx)
        print("len(self.actions)", len(self.actions))
        print("stock_idx", stock_idx)
        action_type_idx = action_idx % len(self.actions)
        print("action_type_idx", action_type_idx)
        action_type = self.actions[action_type_idx]
        print("action_type", action_type)

        base_return = self.returns_df.iloc[self.current_step, stock_idx]
        print("base_return", base_return)

        # Define risk-adjusted reward multiplier
        reward_multiplier = {
            "Buy more": 0.3,
            "Buy mid": 0.5,
            "Buy less": 1,
            "Sell more": 0.3,
            "Sell mid": 0.5,
            "Sell less": 1,
            "Keep": 1
        }[action_type]

        reward = base_return * reward_multiplier
        print("reward", reward)
        self.episode_reward += reward
        print("episode_reward", self.episode_reward)

        self.current_step += 1
        print("current_step", self.current_step)
        terminated = (self.current_step - self.start_idx) >= 2 * self.window_size
        print("terminated", terminated)
        truncated = False

        return self._get_observation(), reward, terminated, truncated, {}

# --- Linear-based Q-Network ---
class LinearDQN(nn.Module):
    def __init__(self, input_dim, seq_len, output_dim):
        super(LinearDQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(seq_len * input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )

    def forward(self, x):
        return self.fc(x)

# --- Hyperparameters ---
stocks = ["AAPL", "MSFT", "GOOGL", "AMZN"]
actions = ["Buy more", "Buy mid", "Buy less", "Sell more", "Sell mid", "Sell less", "Keep"]
returns_df = load_stock_data(stocks)

env = StockEnv(stocks, actions, returns_df, window_size=45)

epsilon = 1.0
epsilon_min = 0.01
epsilon_decay = 0.995
gamma = 0.99
lr = 0.001
batch_size = 32
memory = deque(maxlen=2000)

input_dim = len(stocks)
seq_len = env.window_size
output_dim = env.action_space.n

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LinearDQN(input_dim, seq_len, output_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)

# --- Training Loop ---
def train_dqn(episodes):
    global epsilon
    for e in range(episodes):
        state, _ = env.reset()
        state = torch.FloatTensor(state).unsqueeze(0).to(device)  # [1, seq_len, input_dim]
        total_reward = 0

        for t in range(env.window_size):
            if random.random() <= epsilon:
                action = env.action_space.sample()
            else:
                with torch.no_grad():
                    q_values = model(state)
                    action = q_values.argmax().item()

            next_state, reward, terminated, truncated, _ = env.step(action)
            done = terminated or truncated
            total_reward += reward

            next_state_tensor = torch.FloatTensor(next_state).unsqueeze(0).to(device)
            memory.append((state, action, reward, next_state_tensor, done))
            state = next_state_tensor

            if len(memory) >= batch_size:
                minibatch = random.sample(memory, batch_size)
                states, actions_, rewards_, next_states, dones = zip(*minibatch)

                states = torch.cat(states, dim=0)  # [B, seq_len, input_dim]
                next_states = torch.cat(next_states, dim=0)
                actions_ = torch.LongTensor(actions_).unsqueeze(1).to(device)
                rewards_ = torch.FloatTensor(rewards_).unsqueeze(1).to(device)

                current_q = model(states).gather(1, actions_)
                next_q = model(next_states).max(1)[0].unsqueeze(1).detach()
                target_q = rewards_ + (gamma * next_q)

                loss = nn.MSELoss()(current_q, target_q)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            if done:
                break

        if epsilon > epsilon_min:
            epsilon *= epsilon_decay

        print(f"Episode {e+1}/{episodes}, Total Excess Return: {total_reward:.4f}, Epsilon: {epsilon:.2f}")

train_dqn(100)


[*********************100%***********************]  4 of 4 completed


[1;30;43m串流輸出內容已截斷至最後 5000 行。[0m
stock_idx 2
action_type_idx 3
action_type Sell more
base_return 97.39436340332031
reward 29.21830902099609
episode_reward 6230.272504425049
current_step 699
terminated False
action_idx 17
len(self.actions) 7
stock_idx 2
action_type_idx 3
action_type Sell more
base_return 96.71759033203125
reward 29.015277099609374
episode_reward 6259.287781524658
current_step 700
terminated False
action_idx 23
len(self.actions) 7
stock_idx 3
action_type_idx 2
action_type Buy less
base_return 221.03762817382812
reward 221.03762817382812
episode_reward 6480.3254096984865
current_step 701
terminated False
action_idx 6
len(self.actions) 7
stock_idx 0
action_type_idx 6
action_type Keep
base_return 140.92767333984375
reward 140.92767333984375
episode_reward 6621.25308303833
current_step 702
terminated True
Episode 89/100, Total Excess Return: 6621.2531, Epsilon: 0.64
action_idx 2
len(self.actions) 7
stock_idx 0
action_type_idx 2
action_type Buy less
base_return 142.64228820

In [20]:
def load_stock_data(tickers, start="2020-01-01", end="2024-12-31"):
    data = yf.download(tickers, start=start, end=end)["Close"]
    #returns = data.pct_change().dropna()
    return data

returns_df = load_stock_data(stocks)
returns_df.head()

[*********************100%***********************]  4 of 4 completed


Ticker,AAPL,AMZN,GOOGL,MSFT
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-01-02,72.620834,94.900497,68.108376,153.323288
2020-01-03,71.914803,93.748497,67.752083,151.414108
2020-01-06,72.487846,95.143997,69.557945,151.805511
2020-01-07,72.146942,95.343002,69.423592,150.421387
2020-01-08,73.30751,94.598503,69.917725,152.817307


In [29]:
def run_inference(env, model):
    model.eval()  # set model to evaluation mode
    state, _ = env.reset()
    state = torch.FloatTensor(state).unsqueeze(0).to(device)
    total_reward = 0
    actions_taken = []

    with torch.no_grad():
        for t in range(env.window_size):
            q_values = model(state)
            action = q_values.argmax().item()
            actions_taken.append(action)

            next_state, reward, terminated, truncated, _ = env.step(action)
            total_reward += reward
            done = terminated or truncated
            if done:
                break

            state = torch.FloatTensor(next_state).unsqueeze(0).to(device)

    print(f"Inference run completed. Total reward: {total_reward:.4f}")
    print("Actions taken:", actions_taken)

    stock_idx = action // len(actions)
    action_type = actions[action % len(actions)]
    print(f"Day {t}: {stocks[stock_idx]} - {action_type}")


In [30]:
run_inference(env, model)


action_idx 27
len(self.actions) 7
stock_idx 3
action_type_idx 6
action_type Keep
base_return 325.80010986328125
reward 325.80010986328125
episode_reward 325.80010986328125
current_step 488
terminated False
action_idx 27
len(self.actions) 7
stock_idx 3
action_type_idx 6
action_type Keep
base_return 325.8487854003906
reward 325.8487854003906
episode_reward 651.6488952636719
current_step 489
terminated False
action_idx 27
len(self.actions) 7
stock_idx 3
action_type_idx 6
action_type Keep
base_return 324.0297546386719
reward 324.0297546386719
episode_reward 975.6786499023438
current_step 490
terminated False
action_idx 27
len(self.actions) 7
stock_idx 3
action_type_idx 6
action_type Keep
base_return 333.2126770019531
reward 333.2126770019531
episode_reward 1308.8913269042969
current_step 491
terminated False
action_idx 27
len(self.actions) 7
stock_idx 3
action_type_idx 6
action_type Keep
base_return 330.15814208984375
reward 330.15814208984375
episode_reward 1639.0494689941406
current_step