In [5]:
import yfinance as yf
import numpy as np
import pandas as pd
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
import pandas_ta as ta
from sklearn.preprocessing import StandardScaler

# ------------------------------
# 1. Tải và xử lý dữ liệu MSFT
# ------------------------------
# df = yf.download("MSFT", start="2010-01-01", end="2025-06-30")
# if isinstance(df.columns, pd.MultiIndex):
#     df.columns = df.columns.droplevel(1)
# df = df.reset_index()
df = pd.read_csv("stock_best_1y.csv")
df = df[['Date', 'Close', 'High', 'Low', 'Open', 'Volume']]
df.columns.name = None

# Tính chỉ báo kỹ thuật
# This automatically calculates RSI and adds a new column named 'RSI_14'
df.ta.rsi(length=14, append=True)
# This calculates EMA and adds a new column named 'EMA_14'
df.ta.ema(length=14, append=True)
# ✅ ADD THIS LINE TO FIX THE ERROR
df.rename(columns={'RSI_14': 'RSI', 'EMA_14': 'EMA'}, inplace=True)
# Clean up the DataFrame
df = df.dropna().reset_index(drop=True)

# Chuẩn hóa dữ liệu
scaler = StandardScaler()
df[['Close', 'Open', 'High', 'Low', 'Volume', 'RSI', 'EMA']] = scaler.fit_transform(
    df[['Close', 'Open', 'High', 'Low', 'Volume', 'RSI', 'EMA']]
)

# Lưu giá trị gốc của Close để tính toán giao dịch
df['Close_raw'] = df['Close'].copy()  # Placeholder, will be replaced with actual scaling reversal

# ------------------------------
# 2. Môi trường RL tùy chỉnh
# ------------------------------
class StockTradingEnv(gym.Env):
    def __init__(self, df):
        super(StockTradingEnv, self).__init__()
        self.df = df
        self.max_steps = len(df)

        # Action: 0-4: BUY (19%-51%), 5-9: SELL (50%-79%)
        self.action_space = spaces.Discrete(10)
        self.buy_pcts = np.linspace(0.24, 0.53, 5)  # [19%, 27.25%, 35.5%, 43.75%, 51%]
        self.sell_pcts = np.linspace(0.22, 0.73, 5)  # [50%, 57.25%, 64.5%, 71.75%, 79%]

        # Quan sát: [cash, stock_owned, close, moving_avg, volume, RSI, EMA]
        self.observation_space = spaces.Box(
            low=np.array([-5000, 0, -np.inf, -np.inf, -np.inf, -np.inf, -np.inf]),
            high=np.array([np.inf, np.inf, np.inf, np.inf, np.inf, np.inf, np.inf]),
            shape=(7,),
            dtype=np.float32
        )

        # Luật giao dịch
        self.init_cash = 100_000
        self.transaction_fee = 0.03
        self.penalty_sessions = 25
        self.penalty_fee = 3981
        self.total_asset_min = 1000
        self.cash_min = -5000
        self.win_threshold = 1_000_000
        self.scaler = scaler  # Store scaler for reversing normalization

    def reset(self):
        self.step_idx = 0
        self.cash = self.init_cash
        self.stock_owned = 0
        self.last_trade_step = 0
        self.prev_total_asset = self.cash  # Track previous portfolio value
        return self._get_obs()

    def _get_obs(self):
        row = self.df.iloc[self.step_idx]
        moving_avg = self.df['Close'].iloc[max(0, self.step_idx-5):self.step_idx+1].mean()
        return np.array([
            self.cash,
            self.stock_owned,
            row['Close'],
            moving_avg,
            row['Volume'],
            row['RSI'],
            row['EMA']
        ], dtype=np.float32)

    def step(self, action):
        done = False
        traded = False
        price_scaled = self.df['Close'][self.step_idx]
        # Reverse normalization to get raw price for transaction
        price_raw = price_scaled * self.scaler.scale_[df.columns.get_loc('Close')] + self.scaler.mean_[df.columns.get_loc('Close')]
        reward = 0

        # Tính giá trị danh mục trước khi thực hiện hành động
        self.prev_total_asset = self.cash + self.stock_owned * price_raw

        # Buy
        if action <= 4:
            pct = self.buy_pcts[action]
            budget = pct * self.cash
            if self.cash > 0:
                max_affordable = self.cash / (1 + self.transaction_fee)  # Max spendable after fee
                quantity = min(budget / price_raw, max_affordable / price_raw)
                cost = quantity * price_raw * (1 + self.transaction_fee)
                if cost <= self.cash:
                    self.cash -= cost
                    self.stock_owned += quantity
                    traded = True
                else:
                    reward -= 10  # Penalty for insufficient cash
            else:
                reward -= 10  # Penalty for invalid buy

        # Sell
        elif action >= 5 and self.stock_owned > 0:
            pct = self.sell_pcts[action - 5]
            quantity = pct * self.stock_owned
            if quantity > 0:
                revenue = quantity * price_raw * (1 - self.transaction_fee)
                self.cash += revenue
                self.stock_owned -= quantity
                traded = True
            else:
                reward -= 10  # Penalty for zero quantity
        elif action >= 5 and self.stock_owned == 0:
            reward -= 10  # Penalty for selling without shares

        # Phạt nếu không giao dịch quá lâu
        if traded:
            self.last_trade_step = self.step_idx
        elif self.step_idx - self.last_trade_step >= self.penalty_sessions:
            self.cash -= self.penalty_fee
            reward -= self.penalty_fee

        # Tính giá trị danh mục sau hành động
        total_asset = self.cash + self.stock_owned * price_raw
        reward += (total_asset - self.prev_total_asset) / self.init_cash  # Normalized reward

        # Điều kiện kết thúc
        self.step_idx += 1
        if self.step_idx >= self.max_steps - 1:
            done = True
        elif total_asset < self.total_asset_min or self.cash < self.cash_min:
            reward -= 1000
            done = True
        elif total_asset >= self.win_threshold:
            reward += 1000
            done = True

        return self._get_obs(), reward, done, {}

# ------------------------------
# 3. Callback để log huấn luyện
# ------------------------------
class LoggerCallback(BaseCallback):
    def __init__(self, env, verbose=1):
        super().__init__(verbose)
        self.env_ref = env
        self.last_step = 0

    def _on_step(self):
        step = self.num_timesteps
        if step - self.last_step >= 1000:
            self.last_step = step
            cash = self.env_ref.cash
            stock_val = self.env_ref.stock_owned * (self.env_ref.df['Close'][self.env_ref.step_idx] * self.env_ref.scaler.scale_[self.env_ref.df.columns.get_loc('Close')] + self.env_ref.scaler.mean_[self.env_ref.df.columns.get_loc('Close')])
            total_asset = cash + stock_val
            print(f"\n📊 Step {step} | Action: {self.locals['actions']} | Cash: ${cash:,.2f} | Stocks: ${stock_val:,.2f} | Total: ${total_asset:,.2f}")
        return True

# ------------------------------
# 4. Train PPO Agent
# ------------------------------
env = StockTradingEnv(df)
callback = LoggerCallback(env)

model = PPO(
    "MlpPolicy",
    env,
    verbose=1,
    learning_rate=1e-4,
    batch_size=64,
    n_steps=2048,
    gamma=0.99,
    tensorboard_log="./tensorboard_logs"
)

model.learn(total_timesteps=1_000_000, callback=callback)
model.save("ppo_msft_trader")
print("✅ Huấn luyện hoàn tất!")

Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.




Logging to ./tensorboard_logs\PPO_1

📊 Step 1000 | Action: [7] | Cash: $49,737.23 | Stocks: $23,653.47 | Total: $73,390.70

📊 Step 2000 | Action: [6] | Cash: $53,524.76 | Stocks: $619.40 | Total: $54,144.16
---------------------------------
| rollout/           |          |
|    ep_len_mean     | 236      |
|    ep_rew_mean     | -14.6    |
| time/              |          |
|    fps             | 821      |
|    iterations      | 1        |
|    time_elapsed    | 2        |
|    total_timesteps | 2048     |
---------------------------------

📊 Step 3000 | Action: [8] | Cash: $24,955.19 | Stocks: $14,167.60 | Total: $39,122.79

📊 Step 4000 | Action: [1] | Cash: $9,844.51 | Stocks: $19,132.10 | Total: $28,976.61
-----------------------------------------
| rollout/                |             |
|    ep_len_mean          | 236         |
|    ep_rew_mean          | -13.8       |
| time/                   |             |
|    fps                  | 558         |
|    iterations           | 

In [4]:
import yfinance as yf
import numpy as np
import pandas as pd
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.callbacks import BaseCallback
from stable_baselines3.common.env_checker import check_env
import pandas_ta as ta
from sklearn.preprocessing import StandardScaler

# ------------------------------
# 1. Load and Process Data
# ------------------------------
df = pd.read_csv("stock.csv")
df = df[['Date', 'Close', 'High', 'Low', 'Open', 'Volume']]
df.columns.name = None

# Calculate technical indicators
df.ta.rsi(length=14, append=True)
df.ta.ema(length=14, append=True)
df.rename(columns={'RSI_14': 'RSI', 'EMA_14': 'EMA'}, inplace=True)

# Save the raw close prices for reward calculation and evaluation later
df['Close_raw'] = df['Close'].copy()
df = df.dropna().reset_index(drop=True)

# Scale the features
scaler = StandardScaler()
scaled_features = ['Close', 'Open', 'High', 'Low', 'Volume', 'RSI', 'EMA']
df[scaled_features] = scaler.fit_transform(df[scaled_features])


# ------------------------------
# 2. Split Data into Train and Test Sets
# ------------------------------
train_size = int(len(df) * 0.8)
train_df = df[:train_size].reset_index(drop=True)
test_df = df[train_size:].reset_index(drop=True)


# ------------------------------
# 3. Custom Reinforcement Learning Environment
# ------------------------------
class StockTradingEnv(gym.Env):
    def __init__(self, df, scaler):
        super(StockTradingEnv, self).__init__()
        self.df = df
        self.scaler = scaler # Store the scaler
        self.max_steps = len(df)
        self.action_space = spaces.Discrete(10)
        self.buy_pcts = np.linspace(0.24, 0.53, 5)  
        self.sell_pcts = np.linspace(0.22, 0.73, 5) 
        self.observation_space = spaces.Box(
            low=-np.inf, high=np.inf, shape=(7,), dtype=np.float32
        )
        self.init_cash = 100_000
        self.transaction_fee = 0.03
        self.penalty_sessions = 25
        self.penalty_fee = 3981
        self.total_asset_min = 1000
        self.cash_min = -5000
        self.win_threshold = 1_000_000

    def reset(self):
        self.step_idx = 0
        self.cash = self.init_cash
        self.stock_owned = 0
        self.cost_basis = 0
        self.last_trade_step = 0
        self.prev_total_asset = self.cash
        return self._get_obs()

    def _get_obs(self):
        # FIX: Clamp the index to the last valid row to prevent IndexError
        # when the episode ends and an observation for the terminal state is needed.
        obs_idx = min(self.step_idx, self.max_steps - 1)
        row = self.df.iloc[obs_idx]
        moving_avg = self.df['Close'].iloc[max(0, obs_idx - 5):obs_idx + 1].mean()
        return np.array([
            self.cash / self.init_cash,
            self.stock_owned,
            row['Close'],
            moving_avg,
            row['Volume'],
            row['RSI'],
            row['EMA']
        ], dtype=np.float32)

    def step(self, action):
        done = False
        traded = False
        # Use Close_raw for actual transaction calculations
        price_raw = self.df['Close_raw'].iloc[self.step_idx]
        reward = 0

        self.prev_total_asset = self.cash + self.stock_owned * price_raw

        # --- BUY ---
        if action <= 4:
            pct = self.buy_pcts[action]
            budget = pct * self.cash
            if budget > 1:
                quantity = (budget / price_raw) * (1 - self.transaction_fee)
                cost = quantity * price_raw * (1 + self.transaction_fee)
                new_total_cost = self.cost_basis * self.stock_owned + cost
                self.stock_owned += quantity
                self.cost_basis = new_total_cost / self.stock_owned if self.stock_owned > 0 else 0
                self.cash -= cost
                traded = True
        # --- SELL ---
        elif action >= 5 and self.stock_owned > 0:
            pct = self.sell_pcts[action - 5]
            quantity = pct * self.stock_owned
            if quantity > 0:
                revenue = quantity * price_raw * (1 - self.transaction_fee)
                self.cash += revenue
                profit = revenue - (quantity * self.cost_basis)
                if profit > 0:
                    reward += profit * 0.1
                self.stock_owned -= quantity
                if self.stock_owned < 1e-6:
                    self.stock_owned = 0
                    self.cost_basis = 0
                traded = True
        
        total_asset = self.cash + self.stock_owned * price_raw
        reward += (total_asset - self.prev_total_asset) / self.init_cash
        if self.cash < 1:
            reward -= 0.01

        if traded:
            self.last_trade_step = self.step_idx
        elif self.step_idx - self.last_trade_step >= self.penalty_sessions:
            self.cash -= self.penalty_fee
            reward -= self.penalty_fee / self.init_cash

        self.step_idx += 1
        if self.step_idx >= self.max_steps or total_asset < self.total_asset_min or self.cash < self.cash_min:
            done = True
        elif total_asset >= self.win_threshold:
            reward += 10
            done = True
        
        # The _get_obs method now handles the index clamping internally.
        return self._get_obs(), reward, done, {}

class LoggerCallback(BaseCallback):
    def __init__(self, env, verbose=1):
        super().__init__(verbose)
        # The environment is vectorized, so we need to access the underlying environment
        # FIX: Access the actual environment inside the Monitor wrapper
        self.env_ref = env.envs[0]
        self.last_step = 0

    def _on_step(self):
        step = self.num_timesteps
        if step - self.last_step >= 5000: # Log every 5000 steps
            self.last_step = step
            
            # FIX: Access the attributes from the unwrapped environment
            # Access the Monitor's internal env to get the custom attributes
            actual_env = self.training_env.envs[0].env
            cash = actual_env.cash
            stock_owned = actual_env.stock_owned
            step_idx = min(actual_env.step_idx, len(actual_env.df) - 1)
            
            # Use the raw price for accurate valuation
            price_raw = actual_env.df['Close_raw'].iloc[step_idx]
            stock_val = stock_owned * price_raw
            total_asset = cash + stock_val
            
            # Print the real-time portfolio status
            print(f"\n📊 Step {step} | Action: {self.locals['actions']} | Cash: ${cash:,.2f} | Stocks: ${stock_val:,.2f} | Total: ${total_asset:,.2f}")
        return True

# ------------------------------
# 4. Train PPO Agent
# ------------------------------
env_train = StockTradingEnv(train_df, scaler)
# check_env(env_train) # Optional: check if the env follows the Gym API

model = PPO(
    "MlpPolicy",
    env_train,
    verbose=1,
    learning_rate=3e-5,
    batch_size=64,
    n_steps=2048,
    gamma=0.99,
    tensorboard_log="./tensorboard_logs"
)

# --- FIX: Instantiate and use the LoggerCallback ---
# The callback now correctly accesses the underlying environment
callback = LoggerCallback(model.get_env())

print("🚀 Starting model training...")
# Restore timesteps for a full run and pass the callback
model.learn(total_timesteps=1_000_000, progress_bar=False, callback=callback)
model.save("ppo_stock_trader_v3")
print("✅ Training complete!")


# ------------------------------
# 5. Evaluate the Trained Agent with Step-by-Step Logging
# ------------------------------
print("\n🧪 Starting evaluation on test data...")

model = PPO.load("ppo_stock_trader_v3")
env_test = StockTradingEnv(test_df, scaler)
obs = env_test.reset()

print("\n--- STEP-BY-STEP BACKTEST ---")
print(f"{'Step':<5}{'Date':<12}{'Action':<12}{'Price':<10}{'Cash':<15}{'Stocks':<12}{'Portfolio':<15}")
print("-" * 83)

# Loop until the environment is done
for i in range(env_test.max_steps):
    action_idx, _states = model.predict(obs, deterministic=True)
    obs, reward, done, info = env_test.step(action_idx)

    step_num = env_test.step_idx -1
    date = test_df['Date'].iloc[step_num]
    price = test_df['Close_raw'].iloc[step_num]
    cash = env_test.cash
    stocks_owned = env_test.stock_owned
    portfolio_value = cash + stocks_owned * price

    if action_idx <= 4:
        action_str = f"BUY ({env_test.buy_pcts[action_idx]*100:.0f}%)"
    else:
        action_str = f"SELL ({env_test.sell_pcts[action_idx-5]*100:.0f}%)"
    
    print(f"{step_num:<5}{date:<12}{action_str:<12}${price:<9.2f}${cash: <14,.2f}{stocks_owned:<12.2f}${portfolio_value: <14,.2f}")

    if done:
        print("--- Backtest finished ---")
        break

# --- Final portfolio summary ---
# Use the last available price for the final calculation
final_price = test_df['Close_raw'].iloc[-1]
final_portfolio_value = env_test.cash + env_test.stock_owned * final_price
profit = final_portfolio_value - env_test.init_cash
profit_pct = (profit / env_test.init_cash) * 100

print("\n--- FINAL RESULTS ---")
print(f"Initial Portfolio: ${env_test.init_cash:,.2f}")
print(f"Final Portfolio:   ${final_portfolio_value:,.2f}")
print(f"Profit/Loss:       ${profit:,.2f} ({profit_pct:.2f}%)")
print("-----------------------")


Using cpu device
Wrapping the env with a `Monitor` wrapper
Wrapping the env in a DummyVecEnv.
🚀 Starting model training...
Logging to ./tensorboard_logs\PPO_10




---------------------------------
| rollout/           |          |
|    ep_len_mean     | 726      |
|    ep_rew_mean     | 16       |
| time/              |          |
|    fps             | 2218     |
|    iterations      | 1        |
|    time_elapsed    | 0        |
|    total_timesteps | 2048     |
---------------------------------
-------------------------------------------
| rollout/                |               |
|    ep_len_mean          | 738           |
|    ep_rew_mean          | 13.2          |
| time/                   |               |
|    fps                  | 1436          |
|    iterations           | 2             |
|    time_elapsed         | 2             |
|    total_timesteps      | 4096          |
| train/                  |               |
|    approx_kl            | 0.00068938825 |
|    clip_fraction        | 0             |
|    clip_range           | 0.2           |
|    entropy_loss         | -2.3          |
|    explained_variance   | -0.00532      |


In [9]:
import numpy as np
import pandas as pd
import gym
from gym import spaces
from stable_baselines3 import PPO
from stable_baselines3.common.env_checker import check_env
import asyncio
import platform

# Tham số từ tài liệu
BUY_MIN = 0.24  # 24%
BUY_MAX = 0.53  # 53%
SELL_MIN = 0.22  # 22%
SELL_MAX = 0.73  # 73%
TRANSACTION_FEE = 0.03  # 3%
TRANSACTION_SESSION = 25  # Số phiên không giao dịch
TRANSACTION_PENALTY = 3981  # USD
INITIAL_CASH = 1000  # 1000 USD (theo hình ảnh)
WIN_CONDITION = 1000000  # 1,000,000 USD
LOSE_CONDITION = -5000  # -5,000 USD

class StockTradingEnv(gym.Env):
    def __init__(self, df):
        super(StockTradingEnv, self).__init__()
        self.df = df
        self.current_step = 0
        self.cash = INITIAL_CASH
        self.shares = 0
        self.total_assets = INITIAL_CASH
        self.no_trade_sessions = 0

        # Observation space: [cash, shares, OHLC prices, total_assets]
        self.observation_space = spaces.Box(
            low=0, high=np.inf, shape=(6,), dtype=np.float32
        )

        # Action space: [action_type (0: hold, 1: buy, 2: sell), amount (0 to 1)]
        self.action_space = spaces.Box(
            low=np.array([0, 0]), high=np.array([2, 1]), dtype=np.float32
        )

        self.history_log = []

    def reset(self):
        self.current_step = 0
        self.cash = INITIAL_CASH
        self.shares = 0
        self.total_assets = INITIAL_CASH
        self.no_trade_sessions = 0
        self.history_log = []
        return self._get_observation()

    def _get_observation(self):
        row = self.df.iloc[self.current_step]
        return np.array([
            self.cash,
            self.shares,
            row['Open'],
            row['High'],
            row['Low'],
            row['Close']
        ], dtype=np.float32)

    def step(self, action):
        action_type = int(action[0])
        amount = action[1]
        done = False
        reward = 0
        row = self.df.iloc[self.current_step]
        current_price = row['Close']

        # Thực hiện hành động
        if action_type == 1:  # Buy
            buy_percentage = BUY_MIN + (BUY_MAX - BUY_MIN) * amount
            buy_amount = self.cash * buy_percentage
            shares_to_buy = (buy_amount * (1 - TRANSACTION_FEE)) / current_price
            self.cash -= buy_amount
            self.shares += shares_to_buy
            self.no_trade_sessions = 0
        elif action_type == 2:  # Sell
            sell_percentage = SELL_MIN + (SELL_MAX - SELL_MIN) * amount
            shares_to_sell = self.shares * sell_percentage
            sell_amount = shares_to_sell * current_price * (1 - TRANSACTION_FEE)
            self.cash += sell_amount
            self.shares -= shares_to_sell
            self.no_trade_sessions = 0
        else:  # Hold
            self.no_trade_sessions += 1
            if self.no_trade_sessions >= TRANSACTION_SESSION:
                self.cash -= TRANSACTION_PENALTY
                self.no_trade_sessions = 0
                reward -= TRANSACTION_PENALTY

        # Cập nhật tổng tài sản
        self.total_assets = self.cash + self.shares * current_price

        # Kiểm tra điều kiện dừng
        if self.total_assets >= WIN_CONDITION:
            reward += 10000
            done = True
        elif self.cash <= LOSE_CONDITION:
            reward -= 10000
            done = True

        # Tính reward dựa trên thay đổi tổng tài sản
        if self.current_step > 0:
            prev_assets = self.cash + self.shares * self.df.iloc[self.current_step - 1]['Close']
            reward += (self.total_assets - prev_assets)

        # Ghi log lịch sử
        self.history_log.append({
            'step': self.current_step,
            'cash': self.cash,
            'shares': self.shares,
            'total_assets': self.total_assets,
            'action': action_type,
            'amount': amount
        })

        self.current_step += 1
        if self.current_step >= len(self.df):
            done = True

        return self._get_observation(), reward, done, {}

    def render(self, mode='human'):
        print(f"Step: {self.current_step}, Cash: {self.cash:.2f}, Shares: {self.shares:.2f}, Total Assets: {self.total_assets:.2f}")


# Đọc dữ liệu từ file CSV
df = pd.read_csv("stock_1y.csv")
env = StockTradingEnv(df)
print(check_env(env))

# Huấn luyện mô hình PPO
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=10000)

# Lưu mô hình
model.save("ppo_stock_trading")

# Kiểm tra mô hình trên cùng dữ liệu (do chỉ có 1 năm)
obs = env.reset()
total_reward = 0
done = False
while not done:
    action, _ = model.predict(obs)
    obs, reward, done, _ = env.step(action)
    total_reward += reward
    env.render()


  logger.warn(f"Box bound precision lowered by casting to {self.dtype}")


AssertionError: Your environment must inherit from the gymnasium.Env class cf. https://gymnasium.farama.org/api/env/