<a href="https://colab.research.google.com/github/tongnet/fin7047_2026spr/blob/main/rl.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Reinforcement learning example
# The agent learns when to hold Bitcoin using rewards, then tests the strategy out of sample.

import numpy as np
import pandas as pd
import yfinance as yf

# 1) Download BTC daily data (past 1 year)
start_date = "2025-01-24"
end_date   = "2026-01-25"  # end is exclusive

# auto_adjust is passed explicitly so the result does not depend on the
# library default (newer yfinance releases warn when it is omitted).
df = yf.download(
    "BTC-USD",
    start=start_date,
    end=end_date,
    interval="1d",
    auto_adjust=True,
    progress=False,
)
if df.empty:
    # Fail loudly here instead of silently "training" on zero observations.
    raise RuntimeError(
        f"No BTC-USD data returned for {start_date}..{end_date}; "
        "check the network connection and the date range."
    )

price_col = "Close"
prices = df[price_col]
# Depending on the yfinance version the columns are a (field, ticker)
# MultiIndex, in which case df["Close"] is a one-column DataFrame rather than
# a Series. Reduce it to a Series so that `rets` is a flat 1-D array of
# scalars instead of an (N, 1) array of 1-element rows.
if isinstance(prices, pd.DataFrame):
    prices = prices.iloc[:, 0]
prices = prices.dropna()

# Simple daily returns; the first element is NaN (no previous price) and is dropped.
rets = prices.pct_change().dropna().values

# 2) Two-state function: yesterday up (1) or not (0)
def state_from_return(r):
    """Map a return to a market state.

    Returns 1 when ``r`` is strictly positive and 0 otherwise (zero,
    negative, or NaN returns all map to state 0).
    """
    if r > 0:
        return 1
    return 0

# 3) Train/test split in time order
# The first 70% of the return series is the training sample and the rest is
# held out for testing. Nothing is shuffled, so every test observation comes
# after every training observation.
T = len(rets)
split = int(0.70 * T)
train, test = rets[:split], rets[split:]

# 4) Q-learning (2 states x 2 actions)
# States (rows):    0 = previous return not positive, 1 = previous return positive
# Actions (cols):   0 = stay out, 1 = hold BTC
Q = np.zeros((2, 2))
alpha = 0.10         # learning rate: step size of each Q update
gamma = 0.95         # discount factor applied to the best next-state value
eps0 = 0.20          # exploration probability in the first episode
eps_min = 0.02       # floor for the exploration probability
episodes = 30        # number of passes over the training sample
switch_cost = 0.001  # 0.10% cost if you switch position (optional)

np.random.seed(42)   # makes the epsilon-greedy draws reproducible

# Work on a flat float array. If `train` arrives as an (N, 1) array, indexing
# it yields 1-element arrays, and writing those into the scalar slot Q[s, a]
# relies on an array-to-scalar conversion that NumPy has deprecated.
train_returns = np.asarray(train, dtype=float).ravel()
n_train = len(train_returns)

for ep in range(episodes):
    # Exploration decays by 10% per episode until it reaches eps_min.
    eps = max(eps_min, eps0 * (0.90 ** ep))
    pos = 0  # start out of market (0=out, 1=hold)

    # Timing at step t: the state is the return at t-1, the reward is the
    # return at t+1, and the return at t only determines the next state.
    # NOTE(review): the return at t is never earned by the action chosen at
    # step t -- confirm this one-day gap is intended.
    for t in range(1, n_train - 1):
        s = state_from_return(train_returns[t - 1])

        # choose action (epsilon-greedy): random with probability eps,
        # otherwise the action with the highest current Q-value
        if np.random.rand() < eps:
            a = np.random.randint(0, 2)
        else:
            a = int(np.argmax(Q[s]))

        # switching cost if position changes
        cost = switch_cost if a != pos else 0.0

        # reward realized next day: the market return when holding, zero
        # when out, minus any switching cost
        reward = (train_returns[t + 1] if a == 1 else 0.0) - cost

        s_next = state_from_return(train_returns[t])
        best_next = np.max(Q[s_next])

        # Standard Q-learning update toward reward + discounted best next value.
        Q[s, a] += alpha * (reward + gamma * best_next - Q[s, a])

        pos = a  # action defines your position for next day

# 5) Evaluate learned policy on test set
# The greedy policy (no exploration, no further learning) is replayed over the
# held-out returns, compounding the strategy's equity and counting switches.

# Work on a flat float array so each return is a scalar. With an (N, 1) input
# the compounded `equity` would become a 1-element array instead of a number.
test_returns = np.asarray(test, dtype=float).ravel()

pos = 0        # start out of market, as in training
equity = 1.0   # value of 1 unit invested at the start of the test period
trades = 0     # number of position changes

for t in range(1, len(test_returns) - 1):
    s = state_from_return(test_returns[t - 1])
    a = int(np.argmax(Q[s]))  # best action in this state

    cost = 0.0
    if a != pos:
        cost = switch_cost
        trades += 1

    # Same timing as in training: the action chosen at step t earns the
    # return at t+1 when holding, and nothing when out of the market.
    r_next = test_returns[t + 1] if a == 1 else 0.0
    equity *= (1.0 + r_next - cost)

    pos = a

# Buy-and-hold benchmark on test period
# test[2:] holds the returns at indices t+1 for t = 1 .. len(test)-2, i.e. the
# same days the evaluation loop can earn, so both strategies are compounded
# over an identical window (the benchmark pays no switching cost).
bh_equity = np.prod(1.0 + test[2:])

# `equity` may be a plain number or a 1-element array (when the returns were
# stored as an (N, 1) array). .item() extracts the scalar in both cases without
# the deprecated float() conversion of a non-0-d array.
rl_equity = np.asarray(equity).item()

print("Learned Q-values (rows=state Down/Up, cols=action Out/Hold):")
print(Q.round(4))

print("\nTest performance:")
print("RL final equity:", round(float(rl_equity), 4), "x")
print("RL trades:", trades)
print("Buy&Hold final equity:", round(float(bh_equity), 4), "x")


Learned Q-values (rows=state Down/Up, cols=action Out/Hold):
[[-0.0002 -0.0055]
 [-0.0002 -0.0015]]

Test performance:
RL final equity: 1.0 x
RL trades: 0
Buy&Hold final equity: 0.7224 x


  df = yf.download("BTC-USD", start=start_date, end=end_date, interval="1d", progress=False)
  Q[s, a] = Q[s, a] + alpha * (reward + gamma * best_next - Q[s, a])
