# Reinforcement Learning Based Trading Agent 


## 1. Install and Import Libraries

In [None]:
import numpy as np
import pandas as pd
import yfinance as yf
import matplotlib.pyplot as plt


## 2. Download Market Data 

In [None]:
# Ticker to trade; used for the download below so it only has to be changed here.
symbol = "AAPL"

# Daily price history for the chosen ticker over a fixed date window.
data = yf.download(tickers=symbol, start="2020-01-01", end="2026-01-07")

# Fail early with a clear message instead of an IndexError further down
# when nothing came back (bad ticker, network problem, empty date range).
if data.empty:
    raise RuntimeError(f"No price data downloaded for {symbol}")

# Flatten the closing prices into a 1-D array (the "Close" selection may be
# a one-column frame rather than a series, depending on the yfinance version).
prices = data["Close"].to_numpy().flatten()

print("Trading days:", len(prices))
print("Sample price:", prices[0], type(prices[0]))


[*********************100%***********************]  1 of 1 completed

Trading days: 1511
Sample price: 72.46826934814453 <class 'numpy.float64'>





## 3. Trading Environment

In [None]:
# Custom trading environment for Reinforcement Learning
CAPITAL = 1000  # default starting cash for every episode


class TradingEnv:
    """Minimal single-share trading environment over a fixed price series.

    The agent walks through ``prices`` one step at a time and may hold at
    most one share. Actions are:

    * ``0`` - Hold (do nothing)
    * ``1`` - Buy one share (only if not already holding and cash suffices)
    * ``2`` - Sell the held share (only if holding)

    Any other or invalid action is treated as a no-op.

    Args:
        prices: Non-empty indexable sequence of prices (list or 1-D array).
        initial_cash: Starting cash for each episode. When ``None`` (the
            default) the module-level ``CAPITAL`` is read at reset time.

    Raises:
        ValueError: If ``prices`` is empty.
    """

    def __init__(self, prices, initial_cash=None):
        if len(prices) == 0:
            raise ValueError("prices must contain at least one value")
        self.prices = prices
        self.initial_cash = initial_cash
        self.reset()

    def reset(self):
        """Start a new episode and return the initial state."""
        self.t = 0
        self.cash = CAPITAL if self.initial_cash is None else self.initial_cash
        # 0 = no stock, 1 = holding stock
        self.stock = 0
        self.done = False
        return self._get_state()

    def _get_state(self):
        """Return ``[current price, stock holding (0 or 1)]`` as float32."""
        return np.array([self.prices[self.t], self.stock], dtype=np.float32)

    def step(self, action):
        """Apply ``action`` at the current time step and advance by one.

        Returns:
            Tuple ``(next_state, reward, done)``. ``reward`` is the portfolio
            value (cash plus held share) priced at the step that was just
            acted on. ``done`` becomes True once the series is exhausted.
        """
        price = self.prices[self.t]

        if action == 1 and not self.stock and self.cash >= price:
            self.stock = 1
            self.cash -= price
        elif action == 2 and self.stock:
            self.cash += price
            self.stock = 0

        # Move to the next time step.
        self.t += 1

        # Episode ends after the last price; clamp the index to this
        # instance's own series so the returned state stays valid.
        if self.t >= len(self.prices):
            self.done = True
            self.t = len(self.prices) - 1

        # Reward is the portfolio value at the price that was traded on.
        reward = self.cash + self.stock * price

        return (self._get_state(), reward, self.done)


## 4. Q-Learning Setup

In [None]:

# Q-table: one row per time step (the state index) and one column per
# action (0 = Hold, 1 = Buy, 2 = Sell), initialised to zero.
Q = np.zeros(shape=(len(prices), 3))

# alpha: learning rate, gamma: discount factor
alpha, gamma = 0.1, 0.95

# epsilon: exploration rate (starts fully random),
# decay: per-episode multiplicative shrink applied to epsilon
epsilon, decay = 1, 0.9997

# transaction fee: a penalty used only inside the Q-update to discourage
# excessive trading; it is never deducted from the environment's cash
fee = 1


## 5. Train the Agent

In [None]:

# Tabular Q-learning where the state is simply the time-step index.
# create trading environment
env = TradingEnv(prices)

# set number of training episodes
episodes = 10000

# training loop
for episode in range(episodes):

    # reset environment at start of each episode
    env.reset()
    done = False

    # loop until episode ends
    while not done:

        # current state index (time step), captured before stepping
        t = env.t

        # epsilon-greedy action selection
        if np.random.random() < epsilon:
            action = np.random.randint(0, 3)
        else:
            action = np.argmax(Q[t])

        # take action in environment, remembering the holding beforehand so
        # we can tell whether a trade was actually executed
        held_before = bool(env.stock)
        _, reward, done = env.step(action)
        traded = bool(env.stock) != held_before

        # Charge the fee only for executed trades. Subtracting it on every
        # step (including Hold) lowers all actions equally and therefore
        # cannot discourage trading.
        cost = fee if traded else 0

        # Q-learning target: immediate reward minus trading cost, plus the
        # discounted best value of the next state unless the episode ended.
        target = reward - cost
        if not done:
            target += gamma * np.max(Q[t + 1])

        Q[t, action] += alpha * (target - Q[t, action])

    # anneal exploration once per episode, never below 1%
    epsilon = max(0.01, epsilon * decay)


# indicate training completion
print("Traning Complete.\nMuhahahaha\n")



Traning Complete.
Muhahahaha



## 6. Evaluate Trained Agent

In [None]:

# Fresh environment so evaluation starts from the initial cash and no holdings.
env = TradingEnv(prices)

# Greedy rollout: at every time step take the action with the highest
# learned Q-value (no exploration) until the environment reports the end.
finished = env.done
while not finished:
    _, _, finished = env.step(np.argmax(Q[env.t]))

# Final portfolio value: remaining cash plus any held share valued at the
# last available price.
holding_value = env.stock*prices[-1]
final_value = env.cash + holding_value

# Report the result.
print(f"You now have : \nWait for it\n.........\n.........\n.........\n.........\n\n${final_value}")



You now have : 
Wait for it
.........
.........
.........
.........

$1255.7932815551758


## 7. Buy and Hold Baseline

In [None]:
# Buy-and-Hold baseline strategy
# - Buy one stock on the first day
# - Hold it until the last day
# - Start with the same initial cash as the agent (CAPITAL), so the
#   comparison stays fair if the starting capital is ever changed

buy_and_hold_value = CAPITAL - prices[0] + prices[-1]
# print Buy-and-Hold portfolio value
print(f"You noob i earned \n\n{buy_and_hold_value}.")



You noob i earned 

1189.891716003418.
