In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

In [2]:
# Load the combined AAPL CSV into the DataFrame used by every later cell.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs where this exact file exists — consider a configurable data directory.
df = pd.read_csv('/home/amy/work/RIT/TDess/DSCI-601-Amy/Data/Combined/combined_AAPL.csv')

In [3]:
df

Unnamed: 0,date,RET,VOL_CHANGE,BA_SPREAD,ILLIQUIDITY,sprtrn,TURNOVER,DJI_Return
0,1/2/1992,0.055432,0.717745,0.008403,4.510000e-10,0.000408,17.419850,0.000000
1,1/3/1992,-0.008403,-0.172890,0.004237,-8.340000e-11,0.004985,14.408127,0.009173
2,1/6/1992,-0.016949,-0.399632,0.004310,-2.850000e-10,-0.003291,8.650181,-0.000437
3,1/7/1992,0.019397,0.237283,0.004228,2.590000e-10,-0.001340,10.702726,0.001469
4,1/8/1992,0.023256,0.645321,0.004132,1.840000e-10,0.001677,17.609419,-0.000281
...,...,...,...,...,...,...,...,...
7804,12/23/2022,-0.002798,-0.181476,0.000076,-3.330000e-13,0.005868,4.008909,0.005342
7805,12/27/2022,-0.013878,0.081093,0.000231,-1.550000e-12,-0.004050,4.334004,0.001133
7806,12/28/2022,-0.030685,0.238299,0.000079,-2.850000e-12,-0.012021,5.366792,-0.011006
7807,12/29/2022,0.028324,-0.115337,0.000231,2.890000e-12,0.017461,4.747802,0.010497


# Pending

In [7]:
import numpy as np
import random

class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy behaviour policy.

    The agent keeps a dense ``(n_states, n_actions)`` table of action values,
    initialised to zero, and updates it with the one-step Q-learning rule.

    Args:
        n_actions: Number of discrete actions (columns of the Q-table).
        n_states: Number of discrete states (rows of the Q-table).
        learning_rate: Step size applied to the TD error.
        discount_factor: Discount applied to the bootstrapped next-state value.
        epsilon: Probability of picking a uniformly random action.
    """

    def __init__(self, n_actions, n_states, learning_rate=0.1, discount_factor=0.99, epsilon=0.1):
        self.q_table = np.zeros((n_states, n_actions))
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.n_actions = n_actions
        self.n_states = n_states

    def choose_action(self, state):
        """Return an action index for ``state`` using epsilon-greedy selection.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest Q-value is returned. Randomness
        comes from NumPy's global RNG, so seed it for reproducible runs.
        """
        if np.random.uniform(0, 1) < self.epsilon:
            return np.random.choice(self.n_actions)
        # np.argmax returns the first maximal index, so ties (e.g. the all-zero
        # initial table) always resolve to the lowest-numbered action.
        return np.argmax(self.q_table[state])

    def update_q_table(self, state, action, reward, next_state, done=False):
        """Apply one Q-learning update for the transition just observed.

        Args:
            state: Index of the state the action was taken in.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: Index of the state reached; ignored when ``done`` is True.
            done: True when the transition ended the episode. A terminal state
                has no future value, so the target is just ``reward`` and the
                bootstrap term is dropped. Defaults to False, which reproduces
                the previous behaviour exactly.
        """
        if done:
            td_target = reward
        else:
            # Value of the greedy action in the next state (off-policy target).
            td_target = reward + self.discount_factor * np.max(self.q_table[next_state])
        td_error = td_target - self.q_table[state, action]
        self.q_table[state, action] += self.learning_rate * td_error

class StockTradingEnvironment:
    """Minimal single-asset trading environment with a discrete, 64-value state.

    The share price is fixed at 1, so ``balance`` and ``shares_held`` only track
    how many one-unit buys/sells have been executed. The reward comes from the
    ``RET`` column of ``data`` at the step the action was taken.

    Args:
        data: DataFrame containing ``RET`` and every column in ``STATE_FEATURES``.
        initial_balance: Cash available at the start of each episode.
    """

    # Columns whose above-median flags form the state; position i contributes bit 2**i.
    STATE_FEATURES = ('VOL_CHANGE', 'BA_SPREAD', 'ILLIQUIDITY', 'sprtrn', 'TURNOVER', 'DJI_Return')

    def __init__(self, data, initial_balance=1000):
        self.data = data
        # Remembered so reset() restores the configured balance rather than a constant.
        self.initial_balance = initial_balance
        self.current_step = 0
        self.balance = initial_balance
        self.shares_held = 0
        # Lazily filled cache of per-feature medians (see _feature_medians).
        self._medians = None
        self._medians_source = None

    def _feature_medians(self):
        """Return per-feature medians, computed once per ``data`` object."""
        # Recompute only if `data` was rebound to a different object; in-place
        # edits to the same DataFrame are not detected.
        if self._medians is None or self._medians_source is not self.data:
            self._medians = {name: self.data[name].median() for name in self.STATE_FEATURES}
            self._medians_source = self.data
        return self._medians

    def step(self, action):
        """Apply ``action`` and advance one row.

        Actions: 0 = Buy, 1 = Sell, 2 = Hold (any other value also acts as Hold).
        A buy that can be afforded earns ``RET`` of the current row; a sell with
        at least one share held earns ``-RET``; everything else earns 0.

        Returns:
            Tuple ``(next_state, reward, done, balance)`` where ``done`` becomes
            True once the last row of ``data`` has been reached.
        """
        current_price = 1  # Assuming price is normalized for simplicity
        next_price = 1  # Static price for simplicity
        reward = 0

        if action == 0:  # Buy
            if self.balance >= current_price:
                self.shares_held += 1
                self.balance -= current_price
                reward = self.data.iloc[self.current_step]['RET']
        elif action == 1:  # Sell
            if self.shares_held > 0:
                self.shares_held -= 1
                self.balance += next_price
                reward = -self.data.iloc[self.current_step]['RET']

        self.current_step += 1
        next_state = self.get_state(self.current_step)
        done = self.current_step >= len(self.data) - 1
        return next_state, reward, done, self.balance

    def get_state(self, step):
        """Encode row ``step`` as an integer in ``[0, 2**len(STATE_FEATURES))``.

        Bit ``i`` is set when the i-th feature of the row is strictly greater
        than that feature's median over the whole dataset.
        """
        row = self.data.iloc[step]
        medians = self._feature_medians()
        state = 0
        for i, feature in enumerate(self.STATE_FEATURES):
            if row[feature] > medians[feature]:
                state += 2 ** i
        return state

    def reset(self):
        """Start a new episode and return the state of the first row."""
        self.current_step = 0
        self.balance = self.initial_balance
        self.shares_held = 0
        return self.get_state(self.current_step)

# Load your dataset
data = df  # Provide your DataFrame here

# Seed NumPy's global RNG (the one QLearningAgent.choose_action draws from) so
# the epsilon-greedy exploration is identical on every Restart & Run All.
SEED = 42
np.random.seed(SEED)

# One binary (above/below median) flag per feature -> 2 ** n_features states.
state_features = ['VOL_CHANGE', 'BA_SPREAD', 'ILLIQUIDITY', 'sprtrn', 'TURNOVER', 'DJI_Return']

# Initialize the agent and environment
env = StockTradingEnvironment(data)
n_states = 2 ** len(state_features)  # Assuming binary state for simplicity
agent = QLearningAgent(n_actions=3, n_states=n_states)

# Run simulation: a single pass over the dataset, learning online at each step
state = env.reset()
done = False
total_balance = []  # cash balance after every step
while not done:
    action = agent.choose_action(state)
    next_state, reward, done, balance = env.step(action)
    agent.update_q_table(state, action, reward, next_state)
    state = next_state
    total_balance.append(balance)

# Analyze performance
print("Simulation finished.")
print("Final balance:", total_balance[-1])


Simulation finished.
Final balance: 0


In [None]:
import gym
import numpy as np
from stable_baselines3 import A2C
from stable_baselines3.common.env_checker import check_env
from gym import spaces

class StockTradingEnv(gym.Env):
    """A stock trading environment for OpenAI gym.

    Observations are a ``(n_features, WINDOW)`` float32 array: one row per column
    in ``FEATURES``, holding ``WINDOW`` consecutive dataset rows starting at
    ``current_step``. Each feature is divided by its largest absolute value so
    every entry lies in ``[-1, 1]``.

    Rows are addressed by position, so ``df`` may have any index. Feature values
    are copied at construction time; later edits to ``df`` are not picked up.

    NOTE(review): ``step`` never reads ``action``, so the reward does not depend
    on what the agent does, and the observation window looks forward from
    ``current_step`` rather than back — confirm this is the intended design.

    Args:
        df: DataFrame containing every column listed in ``FEATURES`` and at
            least ``WINDOW`` rows.

    Raises:
        ValueError: If ``df`` has fewer than ``WINDOW`` rows.
    """
    metadata = {'render.modes': ['human']}

    # Columns stacked, in this order, into each observation.
    FEATURES = ('RET', 'VOL_CHANGE', 'BA_SPREAD', 'ILLIQUIDITY', 'sprtrn', 'TURNOVER')
    # Rows per observation. The previous `.loc[t : t + 5]` slice is end-inclusive
    # and therefore already produced 6 rows, which `.reshape(6,)` could not hold.
    WINDOW = 6

    def __init__(self, df):
        super(StockTradingEnv, self).__init__()

        if len(df) < self.WINDOW:
            raise ValueError(
                f"df needs at least {self.WINDOW} rows, got {len(df)}"
            )

        self.df = df
        # Defined up front so step()/render() work even before reset().
        self.current_step = 0

        # Scale once here instead of rescanning every column on every step.
        features = df[list(self.FEATURES)].to_numpy(dtype=np.float64)
        scale = np.abs(features).max(axis=0)
        scale[scale == 0] = 1.0  # all-zero column: avoid division by zero
        self._scaled = (features / scale).astype(np.float32)

        self._returns = df['RET'].to_numpy(dtype=np.float64)
        # Largest start position whose window still fits inside the data.
        self._last_start = len(df) - self.WINDOW
        # Denominator of the delay modifier (the last row position).
        self._max_position = len(df) - 1

        # reward = RET * modifier with modifier in [0, 1), so rewards span from
        # the most negative return (or 0) to the most positive return (or 0).
        self.reward_range = (
            min(0.0, float(self._returns.min())),
            max(0.0, float(self._returns.max())),
        )

        # Actions of the format Buy x%, Sell x%, Hold, etc.
        self.action_space = spaces.Box(
            low=np.array([0, 0], dtype=np.float32),
            high=np.array([3, 1], dtype=np.float32),
            dtype=np.float32,
        )

        # Signed features scaled by max-abs, hence the [-1, 1] bounds.
        self.observation_space = spaces.Box(
            low=-1, high=1,
            shape=(len(self.FEATURES), self.WINDOW),
            dtype=np.float32,
        )

    def _next_observation(self):
        """Return the scaled ``(n_features, WINDOW)`` window at ``current_step``."""
        start = self.current_step
        window = self._scaled[start:start + self.WINDOW]  # (WINDOW, n_features)
        # Transpose to one row per feature; copy so callers cannot mutate the cache.
        return window.T.copy()

    def step(self, action):
        """Advance one row and return ``(obs, reward, done, info)``.

        The step index wraps to 0 once a full window no longer fits. The reward
        is the current row's ``RET`` weighted by how far into the dataset the
        step is; the episode ends whenever that ``RET`` is negative.
        """
        self.current_step += 1

        if self.current_step > self._last_start:
            self.current_step = 0

        delay_modifier = self.current_step / self._max_position

        current_return = self._returns[self.current_step]
        reward = float(current_return * delay_modifier)
        # Plain Python bool, which stable-baselines3's env checker expects.
        done = bool(current_return < 0)

        obs = self._next_observation()

        return obs, reward, done, {}

    def reset(self):
        """Reset to the first row and return the initial observation."""
        self.current_step = 0

        return self._next_observation()

    def render(self, mode='human', close=False):
        """Print the current step index and that row's ``RET`` value."""
        profit = self._returns[self.current_step]

        print(f'Step: {self.current_step}')
        print(f'Profit: {profit}')
        
# Wrap the loaded DataFrame in the Gym trading environment
env = StockTradingEnv(df)

# Validate the environment against the Gym interface, emitting warnings
check_env(env, warn=True)

# Train an A2C agent with an MLP policy for 10,000 timesteps
model = A2C(policy='MlpPolicy', env=env, verbose=1)
model.learn(total_timesteps=10_000)
