# Exercise 1

## Reinforcement Learning


---

## Overview

Welcome to this exercise. We are now going to use our new skills to build our first Deep Learning Reinforcement Learning model.




In [1]:
import sys
import os

# Manually set the path relative to the py file's location that you want to import
func_lib_path = os.path.abspath(os.path.join(os.getcwd(), '../../'))# Add the path to sys.path
sys.path.append(func_lib_path)

# Now you can import func_lib
import func_lib
import random
import numpy as np
import pandas as pd
import gymnasium as gym
from collections import defaultdict
from gymnasium import spaces
import matplotlib.pyplot as plt

In [2]:
# Define the custom gym environment for trading returns
class ReturnEnv(gym.Env):
    """Toy trading environment that replays a DataFrame one row per step.

    The observation at each step is the current row's ``'1_d_returns'`` value
    (a one-element array). The agent picks a binary action and receives a
    reward of +1 when the action equals the row's ``'Target_Returns'`` label
    and -1 otherwise. An episode ends once every row has been consumed.

    NOTE: ``reset`` returns only the observation and ``step`` returns a
    4-tuple ``(next_state, reward, done, info)``; the training loop in this
    file unpacks exactly that shape.
    """

    def __init__(self, df):
        """Create the environment.

        Args:
            df: DataFrame with a ``'1_d_returns'`` column (observations) and a
                ``'Target_Returns'`` column (the action that earns +1).
        """
        super().__init__()

        # Action space is discrete with two possible actions: 0 or 1.
        self.action_space = spaces.Discrete(2)
        # NOTE(review): the declared shape is (2,), but the observations
        # returned by reset/step hold a single '1_d_returns' value; only the
        # terminal placeholder in step() has two entries. Left unchanged to
        # keep existing outputs stable - confirm the intended shape.
        self.observation_space = spaces.Box(low=-np.inf, high=np.inf, shape=(2,), dtype=np.float32)

        # Keep the data and the cursor into it on the instance.
        self.df = df
        self.current_step = 0

    def reset(self):
        """Rewind to the first row and return its observation."""
        self.current_step = 0
        state = self.df.iloc[self.current_step][['1_d_returns']].values
        return state

    def step(self, action):
        """Score ``action`` against the current row and advance one row.

        Args:
            action: The agent's choice (0 or 1) for the current row.

        Returns:
            Tuple ``(next_state, reward, done, info)``: the next row's
            observation (or a zero array once the data is exhausted), +1/-1,
            whether the episode is over, and an empty info dict.
        """
        # Retrieve the target for the row the agent just acted on.
        target = self.df.iloc[self.current_step]['Target_Returns']

        # +1 for matching the target, -1 otherwise.
        reward = 1 if action == target else -1

        # Move the cursor to the next row.
        self.current_step += 1

        # The episode is done when all rows have been processed.
        done = self.current_step >= len(self.df)

        if not done:
            next_state = self.df.iloc[self.current_step][['1_d_returns']].values
        else:
            # No row left to observe: return a zero placeholder.
            next_state = np.zeros(2)

        return next_state, reward, done, {}


In [3]:
# Q-learning Agent
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    Q-values are kept in a ``defaultdict`` keyed by ``str(state)``. The first
    time a state is looked up it receives a zero vector with one entry per
    action.
    """

    def __init__(self, action_space, state_space, alpha=0.1, gamma=0.99, epsilon=0.1):
        """Store the spaces and hyperparameters and create an empty Q-table.

        Args:
            action_space: Object exposing ``n`` (number of actions) and
                ``sample()`` (draw a random action).
            state_space: Observation space; stored on the instance.
            alpha: Learning rate applied to the TD error.
            gamma: Discount factor applied to the next state's best Q-value.
            epsilon: Probability threshold for taking a random action.
        """
        def _fresh_q_row():
            # One zero-initialised Q-value per available action.
            return np.zeros(action_space.n)

        self.q_table = defaultdict(_fresh_q_row)
        self.epsilon = epsilon
        self.gamma = gamma
        self.alpha = alpha
        self.state_space = state_space
        self.action_space = action_space

    def choose_action(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        explore = random.uniform(0, 1) < self.epsilon
        if explore:
            # Exploration: ignore the Q-table and sample from the action space.
            return self.action_space.sample()

        # Exploitation: index of the largest Q-value recorded for this state.
        q_row = self.q_table[str(state)]
        return np.argmax(q_row)

    def update(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        The target is ``reward + gamma * max(Q[next_state])``; the stored
        Q-value for ``(state, action)`` moves towards it by ``alpha`` times
        the TD error.
        """
        # Look up the next state first, then the current one.
        next_q_row = self.q_table[str(next_state)]
        q_row = self.q_table[str(state)]

        # Bootstrapped target from the best Q-value of the next state.
        td_target = reward + self.gamma * next_q_row.max()
        td_error = td_target - q_row[action]

        # In-place update of the row stored in the table.
        q_row[action] += self.alpha * td_error


In [4]:
def main():
    """Train a Q-learning agent on a small return series, then evaluate it.

    Builds a ten-row example DataFrame, trains a ``QLearningAgent`` on a
    ``ReturnEnv`` wrapping it, and finally replays one episode while printing
    the action, target, reward and states for every step.
    """
    # Example data:
    # - 'Target_Returns': binary target indicating whether the return is positive (1) or not (0).
    # - '1_d_returns': daily return values.
    df = pd.DataFrame({
        'Target_Returns': [1, 0, 1, 1, 0, 1, 0, 1, 1, 0],
        '1_d_returns': [0.062030, -0.038076, 0.050, 0.030, -0.020, 0.062030, -0.038076, 0.050, 0.030, -0.020]
    })

    # Create the environment and an agent that shares its spaces.
    env = ReturnEnv(df)
    agent = QLearningAgent(env.action_space, env.observation_space)

    # Number of passes over the data used for training.
    n_episodes = 1000

    # Training loop
    for episode in range(n_episodes):
        # Reset the environment to start a new episode.
        state = env.reset()
        done = False

        # Run the episode until it's done.
        while not done:
            action = agent.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.update(state, action, reward, next_state)
            state = next_state

    # Indicate that training is finished.
    print("Training finished.")

    # Evaluation
    # Turn off exploration so the printed run shows the learned greedy
    # policy instead of occasional random actions.
    agent.epsilon = 0.0

    # Reset the environment to start the evaluation from the first row.
    state = env.reset()
    done = False
    step = 1

    while not done:
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)

        # Report the action, the target for this row, the reward, the step
        # number and the state transition.
        print(f"Action: {action}, Target Return: {env.df['Target_Returns'].values[step-1]}, Reward: {reward}, Step: {step}, State: {state}, Next State: {next_state}")

        # Move to the next state so the next action is chosen from it.
        state = next_state
        step += 1

# Script entry point: run training and evaluation only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()


Training finished.
Action: 1, Target Return: 1, Reward: 1, Step: 1, State: [0.06203], Next State: [-0.038076]
Action: 0, Target Return: 0, Reward: 1, Step: 2, State: [-0.038076], Next State: [0.05]
Action: 1, Target Return: 1, Reward: 1, Step: 3, State: [0.05], Next State: [0.03]
Action: 1, Target Return: 1, Reward: 1, Step: 4, State: [0.03], Next State: [-0.02]
Action: 0, Target Return: 0, Reward: 1, Step: 5, State: [-0.02], Next State: [0.06203]
Action: 1, Target Return: 1, Reward: 1, Step: 6, State: [0.06203], Next State: [-0.038076]
Action: 0, Target Return: 0, Reward: 1, Step: 7, State: [-0.038076], Next State: [0.05]
Action: 1, Target Return: 1, Reward: 1, Step: 8, State: [0.05], Next State: [0.03]
Action: 1, Target Return: 1, Reward: 1, Step: 9, State: [0.03], Next State: [-0.02]
Action: 0, Target Return: 0, Reward: 1, Step: 10, State: [-0.02], Next State: [0. 0.]
