## Stock trading using Reinforcement Learning (Q-learning) 
In this program we will make an agent to deal with stock market. For this we will use Q-learning alogrithm.

In [2]:
# Loading the necessary libraries 
import numpy as np
import gym
from gym import spaces
import random
import pandas as pd

We will now define the trading environment.

In [20]:
class TradingEnv(gym.Env):
    def __init__(self,df):
        super().__init__()
        self.df = df
        self.reward_range = (-np.inf,np.inf) # Reward ranges from negative infinity to positive infinity 
        self.action_space = spaces.Discrete(3) # 0: hold, 1: buy, 2: sell
        self.observation_space = spaces.Box(low = -np.inf, high = np.inf, shape = (df.shape[1],), dtype= np.float32)

        self.current_step = 0 
        self.done = False
        self.positions = [] # No positions hold in the beginning
        self.current_price = 0
        self.cash_in_hand = 1000 # Cash with what we allow to start the agent

    def _next_observation(self):
        return self.df.iloc[self.current_step].values

    def step(self,action):
        reward = 0 # Initialize reward
        self.current_price = self.df.loc[self.current_step,"stock_price"]

        # Buy action
        if action == 1:
            self.positions.append(self.current_price)
            self.cash_in_hand -= self.current_price # Deduct the money of current stock 

        # Sell action
        elif action==2 and len(self.positions)>0:
            bought_price = self.positions.pop(0)
            self.cash_in_hand += self.current_price
            reward = self.current_price - bought_price

        # Hold or invalid sell
        else:
            self.current_step +=1 
            if self.current_step >= len(self.df): # If we have reached to the last of the data 
                self.done = True 

        info = {}
        next_state = self._next_observation() if not self.done else np.zeros(self.df.shape[1])
        return next_state, reward, self.done, info

    # Define a function for reset
    def reset(self):
        self.current_step = 0 
        self.done = False
        self.positions = []
        self.current_price = self.df.loc[self.current_step, "stock_price"]
        self.cash_in_hand = 1000
        return self._next_observation()

    def render(self,mode="human"):
        profit = self.cash_in_hand - 1000 + sum([self.current_price - p for p in self.positions])
        return f"Step: {self.current_step}, Price: {self.current_price:.2f}, Positions: {len(self.positions)}, Profit: {profit:.2f}"

Next we will define the Q-learning agent.

In [21]:
class QLearningAgent:
    def __init__(self,state_size, action_size, alpha=0.1, gamma=0.6, epsilon = 0.1):
        self.state_size = state_size # State space size
        self.action_size = action_size # Action space size
        self.alpha = alpha # Learning rate 
        self.gamma = gamma # Discount 
        self.epsilon = epsilon # Extrapolation rate 
        self.q_table = np.zeros((state_size, action_size)) # Initializing Q table with zero 

    # Method to choose an action based on the current state 
    def choose_action(self,state_index):
        # With probability epsilon, choose a random number
        if random.uniform(0,1) < self.epsilon:
            return random.choice(range(self.action_size))
        # Otherwise, choose the action with the highest Q-value 
        else:
            return np.argmax(self.q_table[state_index])

   
    # Method to update the Q-table based on agent's experience
    def learn(self,state_index, action, reward, next_state_index, done):
        
        # Get the current Q-value for the state-action pair
        old_value = self.q_table[state_index,action]
        
        # Get the maximum Q-value for the next state
        next_max = np.max(self.q_table[next_state_index])

        # Compute the new Q-value for the state-action pair
        new_value = old_value + self.alpha * (reward + self.gamma * next_max *(1 - int(done)) - old_value)

        # Update the Q-table with the new Q-value
        self.q_table[state_index,action] = new_value

Now we load the dataframe for which we will make the predictions.

In [22]:
# Load the data file 
df = pd.read_csv(r"C:\Users\utkri\OneDrive\Desktop\VG_option_simulation.csv")

Now we initailze the environment and the agent. 

In [23]:
env = TradingEnv(df)
state_size = env.observation_space.shape[0]
action_size = env.action_space.n
agent = QLearningAgent(state_size=state_size, action_size= action_size)

Next we train the agent.

In [28]:
num_episodes = 100
for episode in range(num_episodes):
    state = env.reset()
    state_index = 0 # This should be more meaningfully defined based on the environment's state 

    while True:
        action = agent.choose_action(state_index)
        next_state, reward, done, _ = env.step(action)

        # Ensure next_state_indexes doesn't exceed q_table bounds
        next_state_index = min(state_index + 1, agent.state_size - 1)
        agent.learn(state_index, action, reward, next_state_index, done)

        state_index = next_state_index
        if done:
            break

    print(f'Episode: {episode + 1}/{num_episodes}, Profit: {env.cash_in_hand - 1000 + sum(env.positions)}')

Episode: 1/100, Profit: -7.275957614183426e-12
Episode: 2/100, Profit: -3.637978807091713e-12
Episode: 3/100, Profit: -3.637978807091713e-12
Episode: 4/100, Profit: -3.637978807091713e-12
Episode: 5/100, Profit: 9.094947017729282e-13
Episode: 6/100, Profit: 0.0
Episode: 7/100, Profit: -9.094947017729282e-13
Episode: 8/100, Profit: 0.0
Episode: 9/100, Profit: -3.637978807091713e-12
Episode: 10/100, Profit: -7.275957614183426e-12
Episode: 11/100, Profit: -3.637978807091713e-12
Episode: 12/100, Profit: 0.0
Episode: 13/100, Profit: 7.275957614183426e-12
Episode: 14/100, Profit: -7.275957614183426e-12
Episode: 15/100, Profit: -3.637978807091713e-12
Episode: 16/100, Profit: 1.8189894035458565e-12
Episode: 17/100, Profit: -3.637978807091713e-12
Episode: 18/100, Profit: 0.0
Episode: 19/100, Profit: 0.0
Episode: 20/100, Profit: 7.275957614183426e-12
Episode: 21/100, Profit: 0.0
Episode: 22/100, Profit: -3.637978807091713e-12
Episode: 23/100, Profit: -3.637978807091713e-12
Episode: 24/100, Profi