In [None]:
import time
import pickle
import numpy as np
from src.environment import StockTradingEnvironment
from src.utils import save_pickle, load_pickle, plot_grid
from src.learner import q_learning_learning_loop
import yfinance as yf
import random
import matplotlib.pyplot as plt
from collections import deque
import tensorflow as tf
from tensorflow.keras import models, layers, optimizers
from tensorflow.keras.callbacks import LearningRateScheduler

# Logging setup for better tracking of progress
import logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Ask TensorFlow to grow GPU memory on demand instead of reserving it all up front.
physical_devices = tf.config.list_physical_devices('GPU')
if physical_devices:
    try:
        # Apply the setting to every visible GPU, not only the first one.
        for device in physical_devices:
            tf.config.experimental.set_memory_growth(device, True)
    except RuntimeError as error:
        # Memory growth can only be configured before the GPUs are initialised;
        # re-running this cell in a live kernel would otherwise abort here.
        logging.warning("Could not enable GPU memory growth: %s", error)
    logging.info("Using GPU for training")
else:
    logging.info("Using CPU for training")

In [None]:
# yfinance download data
def fetch_stock_data(symbol, start_date, end_date, output_file):
    """Download price history with yfinance, use adjusted closes, and save to CSV.

    Args:
        symbol: Ticker symbol passed to ``yf.download``.
        start_date: Start date passed to ``yf.download`` as ``start``.
        end_date: End date passed to ``yf.download`` as ``end``.
        output_file: Path the resulting table is written to with ``to_csv``.

    Returns:
        The downloaded DataFrame, with 'Close' replaced by 'Adj Close' (when that
        column is present) and the 'Adj Close' column removed.

    Raises:
        ValueError: If the download returned no rows.
    """
    # Request unadjusted prices explicitly so the separate 'Adj Close' column is
    # returned regardless of the installed yfinance default for auto_adjust.
    stock_data = yf.download(symbol, start=start_date, end=end_date, auto_adjust=False)

    if stock_data is None or stock_data.empty:
        raise ValueError(
            f"No price data returned for {symbol!r} between {start_date} and {end_date}"
        )

    # NOTE(review): some yfinance releases return two-level columns (field, ticker)
    # even for one ticker. Drop the ticker level when it holds a single value so
    # the CSV gets a plain one-row header -- confirm this is what the environment
    # loader expects.
    columns = stock_data.columns
    if columns.nlevels > 1 and columns.get_level_values(-1).nunique() == 1:
        stock_data.columns = columns.droplevel(-1)

    if 'Adj Close' in stock_data.columns:
        stock_data['Close'] = stock_data['Adj Close']
        stock_data = stock_data.drop(columns=['Adj Close'])

    stock_data.to_csv(output_file)

    return stock_data

# Download the AAPL price history once and store it in the CSV file used below.
stock_data = fetch_stock_data(
    symbol='AAPL',
    start_date='2022-01-01',
    end_date='2024-06-19',
    output_file='AAPL_data.csv',
)

#### Q-Learning

In [None]:
class DoubleQLearningAgent:
    """Tabular double Q-learning agent backed by two dictionary Q-tables.

    Each table maps a state key (the flattened state as a tuple) to a NumPy
    array holding one Q-value per action. States that were never updated are
    treated as all-zero rows.
    """

    def __init__(self, env, learning_rate, discount_factor):
        """Store the environment and hyper-parameters and start with empty tables.

        Args:
            env: Environment exposing ``action_space.n``.
            learning_rate: Step size used by ``update_qvalue``.
            discount_factor: Discount applied to the bootstrapped next-state value.
        """
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        # Use a dictionary to handle the Q-tables with discrete state-action pairs
        self.q_table1 = {}
        self.q_table2 = {}

    def _q_values(self, table, state_index):
        """Return the Q-row for ``state_index`` or a zero row, without inserting it."""
        return table.get(state_index, np.zeros(self.env.action_space.n))

    def step(self, state, epsilon):
        """Choose an action epsilon-greedily from the sum of both Q-tables.

        Args:
            state: Current observation.
            epsilon: Probability of picking a uniformly random action.

        Returns:
            The index of the chosen action.
        """
        if np.random.rand() < epsilon:
            return np.random.choice(self.env.action_space.n)
        state_index = self._get_state_index(state)
        q_values = self._q_values(self.q_table1, state_index) + self._q_values(self.q_table2, state_index)
        return np.argmax(q_values)

    def update_qvalue(self, state, action, reward, next_state, done=False):
        """Apply one double Q-learning update for a single transition.

        With probability 0.5 one table is updated, using its own greedy next
        action evaluated by the other table; otherwise the roles are swapped.

        Args:
            state: Observation the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Observation reached after the action.
            done: When True the transition ended the episode, so the target is
                the reward alone and no next-state value is bootstrapped.
        """
        state_index = self._get_state_index(state)
        next_state_index = self._get_state_index(next_state)

        if np.random.rand() < 0.5:
            updated_table, evaluation_table = self.q_table1, self.q_table2
        else:
            updated_table, evaluation_table = self.q_table2, self.q_table1

        if done:
            target = reward
        else:
            best_next_action = np.argmax(self._q_values(updated_table, next_state_index))
            target = reward + self.discount_factor * self._q_values(evaluation_table, next_state_index)[best_next_action]

        # setdefault stores the zero row the first time a state is updated.
        q_row = updated_table.setdefault(state_index, np.zeros(self.env.action_space.n))
        q_row[action] += self.learning_rate * (target - q_row[action])

    def _get_state_index(self, state):
        """Convert an observation into a hashable dictionary key.

        ``np.asarray`` lets plain Python scalars and lists be keyed the same way
        as NumPy arrays and NumPy scalars.
        """
        return tuple(np.asarray(state).flatten())

# Updated DQNAgent class
class DQNAgent:
    """Deep Q-network agent with a replay memory and a separate target network.

    The online ``model`` is trained on minibatches sampled from ``memory``;
    ``target_model`` supplies the bootstrapped next-state values and is synced
    from the online model every ``target_update_interval`` replay calls.
    """

    def __init__(self, env, learning_rate, discount_factor, buffer_size=10000, batch_size=32,
                 target_update_interval=1):
        """Build the online and target networks and an empty replay memory.

        Args:
            env: Environment exposing ``observation_space.shape`` and ``action_space.n``.
            learning_rate: Learning rate of the Adam optimizer.
            discount_factor: Discount applied to the target network's next-state value.
            buffer_size: Maximum number of transitions kept in the replay memory.
            batch_size: Number of transitions sampled per ``replay`` call.
            target_update_interval: Number of ``replay`` calls between target
                network syncs. The default of 1 syncs after every replay.

        Raises:
            ValueError: If ``target_update_interval`` is smaller than 1.
        """
        if target_update_interval < 1:
            raise ValueError("target_update_interval must be >= 1")
        self.env = env
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.buffer_size = buffer_size
        self.batch_size = batch_size
        self.target_update_interval = target_update_interval
        self._replay_count = 0
        self.memory = deque(maxlen=buffer_size)
        self.model = self.build_model()
        self.target_model = self.build_model()
        self.update_target_model()

    def build_model(self):
        """Create and compile the network: two 24-unit ReLU layers and a linear output per action."""
        input_dim = int(np.prod(self.env.observation_space.shape))
        # An explicit Input layer replaces the deprecated `input_dim=` layer argument.
        model = models.Sequential([
            layers.Input(shape=(input_dim,)),
            layers.Dense(24, activation='relu'),
            layers.Dense(24, activation='relu'),
            layers.Dense(self.env.action_space.n, activation='linear'),
        ])
        model.compile(optimizer=optimizers.Adam(learning_rate=self.learning_rate), loss='mse')
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state, epsilon):
        """Choose an action epsilon-greedily from the online network's Q-values.

        Args:
            state: Current observation; it is flattened into a single input row.
            epsilon: Probability of picking a uniformly random action.

        Returns:
            The index of the chosen action.
        """
        if np.random.rand() <= epsilon:
            return np.random.choice(self.env.action_space.n)
        state = np.asarray(state).reshape(1, -1)
        # verbose=0 keeps Keras from printing a progress bar on every step.
        act_values = self.model.predict(state, verbose=0)
        return np.argmax(act_values[0])

    def step(self, state, epsilon):
        """Alias of ``act`` so this agent offers the same ``step`` call as the tabular agent."""
        return self.act(state, epsilon)

    def replay(self):
        """Train the online network on one minibatch sampled from the replay memory.

        Does nothing until the memory holds at least ``batch_size`` transitions.
        Targets for the whole minibatch are computed with one prediction per
        network and fitted in a single call, rather than one predict/fit round
        trip per sampled transition.
        """
        if len(self.memory) < self.batch_size:
            return
        minibatch = random.sample(self.memory, self.batch_size)

        states = np.stack([np.asarray(sample[0]).reshape(-1) for sample in minibatch])
        actions = np.array([sample[1] for sample in minibatch], dtype=int)
        rewards = np.array([sample[2] for sample in minibatch], dtype=float)
        next_states = np.stack([np.asarray(sample[3]).reshape(-1) for sample in minibatch])
        dones = np.array([bool(sample[4]) for sample in minibatch])

        next_q_values = np.asarray(self.target_model.predict(next_states, verbose=0))
        bootstrapped = rewards + self.discount_factor * np.max(next_q_values, axis=1)
        # Terminal transitions use the reward alone as the target.
        action_targets = np.where(dones, rewards, bootstrapped)

        # Start from the current predictions so only the taken action's value changes.
        targets = np.array(self.model.predict(states, verbose=0), dtype=float)
        targets[np.arange(len(minibatch)), actions] = action_targets

        self.model.fit(states, targets, batch_size=len(minibatch), epochs=1, verbose=0)

        self._replay_count += 1
        if self._replay_count % self.target_update_interval == 0:
            self.update_target_model()

In [None]:
def collect_experience(env, agent, epsilon):
    """Play one episode with the agent's epsilon-greedy policy and record it.

    Args:
        env: Environment whose ``reset`` returns ``(obs, info)`` and whose
            ``step`` returns ``(obs, reward, terminated, truncated, info)``.
        agent: Object whose ``step(obs, epsilon)`` returns an action.
        epsilon: Exploration rate forwarded to the agent.

    Returns:
        A list of ``(obs, action, reward, next_obs, terminated)`` tuples, one
        per environment step, in the order they occurred.
    """
    transitions = []
    state, _ = env.reset()
    episode_over = False

    while not episode_over:
        action = agent.step(state, epsilon)
        following_state, reward, terminated, truncated, _ = env.step(action)
        # Only `terminated` is stored as the done flag; truncation just ends the rollout.
        transitions.append((state, action, reward, following_state, terminated))
        state = following_state
        episode_over = bool(terminated or truncated)

    return transitions

def adaptive_learning_rate(epoch, lr):
    """Return ``lr`` scaled by 0.1 on every non-zero multiple of 100 epochs, else ``lr`` unchanged."""
    at_decay_boundary = epoch != 0 and epoch % 100 == 0
    return lr * 0.1 if at_decay_boundary else lr

def decay_epsilon(episode, epsilon, min_epsilon, decay_rate):
    """Multiply ``epsilon`` by ``decay_rate`` and clamp the result at ``min_epsilon``.

    ``episode`` is accepted but not used by the computation.
    """
    decayed = epsilon * decay_rate
    if decayed > min_epsilon:
        return decayed
    return min_epsilon

def _double_q_replay_update(agent, batch, n_actions, learning_rate, discount_factor):
    """Apply one double Q-learning update to the agent's tables per sampled transition."""
    for b_state, b_action, b_reward, b_next_state, b_done in batch:
        # Keys are the flattened observation as a tuple.
        state_key = tuple(np.asarray(b_state).flatten())
        next_state_key = tuple(np.asarray(b_next_state).flatten())

        # A coin flip decides which table learns; the other one evaluates the
        # learner's greedy next action. Updating only one fixed table would
        # leave the second table empty forever.
        if np.random.rand() < 0.5:
            updated_table, evaluation_table = agent.q_table1, agent.q_table2
        else:
            updated_table, evaluation_table = agent.q_table2, agent.q_table1

        if b_done:
            target = b_reward
        else:
            best_next_action = np.argmax(updated_table.get(next_state_key, np.zeros(n_actions)))
            target = b_reward + discount_factor * evaluation_table.get(next_state_key, np.zeros(n_actions))[best_next_action]

        q_row = updated_table.setdefault(state_key, np.zeros(n_actions))
        q_row[b_action] += learning_rate * (target - q_row[b_action])


def _plot_training_progress(reward_across_episodes, epsilons_across_episodes):
    """Show per-episode rewards and epsilon values side by side."""
    plt.figure(figsize=(12, 6))
    plt.subplot(1, 2, 1)
    plt.plot(reward_across_episodes, label='Rewards')
    plt.xlabel('Episodes')
    plt.ylabel('Cumulative Reward')
    plt.title('Rewards over Episodes')
    plt.legend()
    plt.grid(True)

    plt.subplot(1, 2, 2)
    plt.plot(epsilons_across_episodes, label='Epsilon')
    plt.xlabel('Episodes')
    plt.ylabel('Epsilon')
    plt.title('Epsilon Decay over Episodes')
    plt.legend()
    plt.grid(True)

    plt.tight_layout()
    plt.show()


def q_learning_learning_loop(env, agent, learning_rate: float, discount_factor: float, episodes: int,
                             min_epsilon_allowed: float, initial_epsilon_value: float,
                             buffer_size: int = 10000, batch_size: int = 32) -> tuple:
    """Train a two-table Q-learning agent with experience replay and plot progress.

    Every episode is rolled out with ``collect_experience`` and added to a
    bounded replay buffer. Once the buffer holds ``batch_size`` transitions, one
    random minibatch per episode is replayed through a double Q-learning update.
    Epsilon is decayed geometrically so it reaches ``min_epsilon_allowed`` after
    ``episodes`` episodes.

    Args:
        env: Environment exposing ``action_space.n``; also passed to ``collect_experience``.
        agent: Agent with ``q_table1`` and ``q_table2`` dictionaries; also passed
            to ``collect_experience``.
        learning_rate: Step size of each Q-value update.
        discount_factor: Discount applied to the bootstrapped next-state value.
        episodes: Number of training episodes.
        min_epsilon_allowed: Lower bound for epsilon.
        initial_epsilon_value: Epsilon used for the first episode.
        buffer_size: Maximum number of transitions kept in the replay buffer.
        batch_size: Number of transitions replayed after each episode.

    Returns:
        ``(agent, reward_across_episodes, epsilons_across_episodes)``, where the
        two lists hold one entry per episode.
    """
    epsilon = initial_epsilon_value
    # Guard the schedule: zero episodes or a zero starting epsilon would divide by zero.
    if episodes > 0 and epsilon > 0:
        epsilon_decay_factor = np.power(min_epsilon_allowed / epsilon, 1 / episodes)
    else:
        epsilon_decay_factor = 1.0

    reward_across_episodes = []
    epsilons_across_episodes = []

    replay_buffer = deque(maxlen=buffer_size)
    n_actions = env.action_space.n

    for episode in range(episodes):
        experiences = collect_experience(env, agent, epsilon)
        replay_buffer.extend(experiences)

        if len(replay_buffer) >= batch_size:
            batch = random.sample(replay_buffer, batch_size)
            _double_q_replay_update(agent, batch, n_actions, learning_rate, discount_factor)

        epsilon = decay_epsilon(episode, epsilon, min_epsilon_allowed, epsilon_decay_factor)
        reward_across_episodes.append(sum(exp[2] for exp in experiences))
        epsilons_across_episodes.append(epsilon)

        if (episode + 1) % 100 == 0:
            logging.info(f"Episode {episode + 1}/{episodes} - Reward: {reward_across_episodes[-1]} - Epsilon: {epsilon}")

    logging.info("Trained Q-Table: %s", agent.q_table1)

    _plot_training_progress(reward_across_episodes, epsilons_across_episodes)

    return agent, reward_across_episodes, epsilons_across_episodes

In [None]:
# NOTE: 'number_of_days_to_consider' is a tunable parameter of the environment.
stock_trading_environment = StockTradingEnvironment(
    './AAPL_data.csv',
    number_of_days_to_consider=30,
)

# `env` is a shorter alias for the same environment instance.
env = stock_trading_environment

In [None]:
# Create a fresh tabular agent and train it; the loop returns the agent along
# with the per-episode reward and epsilon histories.
agent = DoubleQLearningAgent(env=env, learning_rate=0.01, discount_factor=0.99)

training_result = q_learning_learning_loop(
    env,
    agent,
    learning_rate=0.01,
    discount_factor=0.99,
    episodes=50000,
    min_epsilon_allowed=0.01,
    initial_epsilon_value=1,
    buffer_size=100000,
    batch_size=64,
)
agent, reward_across_episodes, epsilons_across_episodes = training_result

In [None]:
def run_learned_policy(env, agent):
    """Run one greedy episode from the agent's Q-tables and return the total reward.

    Args:
        env: Environment whose ``reset`` returns ``(obs, info)`` and whose
            ``step`` returns ``(obs, reward, terminated, truncated, info)``.
        agent: Agent exposing ``q_table1``, ``q_table2`` and ``_get_state_index``.

    Returns:
        The sum of rewards collected until the episode terminated or was truncated.
    """
    obs, _ = env.reset()
    terminated, truncated = False, False
    total_reward = 0
    steps = 0
    n_actions = env.action_space.n

    while not (terminated or truncated):
        # The Q-tables are keyed by the agent's state key, not the raw observation.
        # Looking up the raw observation either raises (unhashable array) or
        # misses every entry, so the learned values would never be used.
        state_key = agent._get_state_index(obs)
        combined_q = agent.q_table1.get(state_key, np.zeros(n_actions)) + agent.q_table2.get(state_key, np.zeros(n_actions))
        action = np.argmax(combined_q)
        obs, reward, terminated, truncated, _ = env.step(action)
        total_reward += reward
        steps += 1

    # %.2f keeps fractional rewards visible; %d would truncate them.
    logging.info("Total Reward: %.2f, Steps: %d", total_reward, steps)
    return total_reward

# Evaluate the trained agent once on the training environment.
total_reward = run_learned_policy(env=env, agent=agent)

In [None]:
def plot_grid(env, agent, reward_across_episodes: list, epsilons_across_episodes: list) -> None:
    """Evaluate the learned policy 30 times and draw a 2x2 summary figure.

    Sets ``env.train`` to False (and leaves it that way), runs
    ``run_learned_policy`` 30 times, then plots training rewards as markers,
    evaluation rewards as markers, training rewards as a line, and the epsilon
    schedule.

    Args:
        env: Environment passed to ``run_learned_policy``.
        agent: Agent passed to ``run_learned_policy``.
        reward_across_episodes: Per-episode rewards recorded during training.
        epsilons_across_episodes: Per-episode epsilon values recorded during training.
    """
    env.train = False
    evaluation_rewards = []
    for _ in range(30):
        evaluation_rewards.append(run_learned_policy(env, agent))

    # One entry per panel: (series, extra plot args, x label, y label, title).
    panels = (
        (reward_across_episodes, ('ro',), 'Episode', 'Reward Value',
         'Rewards Per Episode (Training)'),
        (evaluation_rewards, ('ro',), 'Episode', 'Reward Value',
         'Rewards Per Episode (Learned Policy Evaluation)'),
        (reward_across_episodes, (), 'Episode', 'Cumulative Reward Per Episode (Training)',
         'Cumulative Reward vs Episode'),
        (epsilons_across_episodes, (), 'Episode', 'Epsilon Values',
         'Epsilon Decay'),
    )

    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    for ax, (series, plot_args, x_label, y_label, title) in zip(axes.flat, panels):
        ax.plot(series, *plot_args)
        ax.set_xlabel(x_label)
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.grid()

    fig.tight_layout()
    plt.show()

# Draw the training and evaluation summary for the trained agent.
plot_grid(
    env=stock_trading_environment,
    agent=agent,
    reward_across_episodes=reward_across_episodes,
    epsilons_across_episodes=epsilons_across_episodes,
)

In [None]:
# Replay the greedy learned policy on a fresh environment with `train` switched
# off, then render the result.
stock_trading_environment = StockTradingEnvironment('./AAPL_data.csv', number_of_days_to_consider=30)
stock_trading_environment.train = False
obs, _ = stock_trading_environment.reset()
terminated, truncated = False, False
# Read the action count from the environment that is actually being stepped.
n_actions = stock_trading_environment.action_space.n
# End the loop on truncation as well as termination.
while not (terminated or truncated):
    # The Q-tables are keyed by the agent's state key, not the raw observation.
    state_key = agent._get_state_index(obs)
    q_values = agent.q_table1.get(state_key, np.zeros(n_actions)) + agent.q_table2.get(state_key, np.zeros(n_actions))
    action = np.argmax(q_values)
    obs, reward, terminated, truncated, info = stock_trading_environment.step(action)

stock_trading_environment.render()

In [None]:
# Persist the trained agent to disk via the project's save_pickle helper.
# NOTE(review): the agent keeps a reference to its environment (`agent.env`), so
# the environment is presumably serialised with it -- confirm that is intended.
save_pickle(agent, 'stock_env_q_learning_agent.pkl')

#### Forecast

In [None]:
# Reload the agent saved earlier in this notebook.
# NOTE(review): load_pickle presumably unpickles the file; unpickling can execute
# arbitrary code, so only load files produced by a trusted run of this notebook.
agent = load_pickle("stock_env_q_learning_agent.pkl")

In [None]:
# Run the reloaded agent greedily on an environment built with a 10-day setting,
# then render the result.
# NOTE(review): the agent was trained with number_of_days_to_consider=30; if that
# parameter changes the observations, few or no states will match the learned
# Q-table keys -- confirm the 10-day setting is intended here.
stock_trading_environment = StockTradingEnvironment('./AAPL_data.csv', number_of_days_to_consider=10)
stock_trading_environment.train = False
obs, _ = stock_trading_environment.reset()
terminated, truncated = False, False
# Read the action count from the environment that is actually being stepped.
n_actions = stock_trading_environment.action_space.n
# End the loop on truncation as well as termination.
while not (terminated or truncated):
    # Use the agent's own state key. `np.argmax(obs)` yields a bare integer that
    # never matches the tuple keys, so every lookup fell back to zeros.
    state_key = agent._get_state_index(obs)
    q_values = agent.q_table1.get(state_key, np.zeros(n_actions)) + agent.q_table2.get(state_key, np.zeros(n_actions))
    action = np.argmax(q_values)
    obs, reward, terminated, truncated, info = stock_trading_environment.step(action)

stock_trading_environment.render()