In [31]:
import pandas as pd
import numpy as np
class QLearningTrader:
    def __init__(self, num_actions, learning_rate, discount_factor, exploration_prob, state_bins):
        self.num_actions = num_actions
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.exploration_prob = exploration_prob
        self.state_bins = state_bins
        self.num_states = len(state_bins) + 1
        self.q_table = np.zeros((self.num_states, num_actions))
        self.train_log = []  # Initialize an empty list to log actions during training
        self.test_log = []  # Initialize an empty list to log actions during testing
        self.predict_log = []  # Initialize an empty list to log predictions

    def discretize_state(self, state_value):
        """Discretize a continuous state into a discrete state."""
        return np.digitize(state_value, self.state_bins)

    def choose_action(self, discrete_state):
        if np.random.rand() < self.exploration_prob:
            action = np.random.choice(self.num_actions)
        else:
            action = np.argmax(self.q_table[discrete_state])
        return action

    def update_q_table(self, current_state, action, reward, next_state):
        current_q = self.q_table[current_state, action]
        future_q = np.max(self.q_table[next_state])
        new_q = (1 - self.learning_rate) * current_q + self.learning_rate * (reward + self.discount_factor * future_q)
        self.q_table[current_state, action] = new_q

    def log_action(self, log_type, time_step, current_state, action, reward, next_state):
        """Logs the details of an action taken by the trader."""
        log_entry = {
            'time_step': time_step,
            'current_state': current_state,
            'action': action,
            'reward': reward,
            'next_state': next_state
        }
        if log_type == 'train':
            self.train_log.append(log_entry)
        elif log_type == 'test':
            self.test_log.append(log_entry)

    def take_action(self, current_close, next_close):
        """Example reward calculation."""
        if next_close > current_close:
            return 1  # Profit
        else:
            return -1  # Loss

    def train(self, historical_data):
        for i in range(len(historical_data) - 1):
            current_close = historical_data.iloc[i]['close']
            next_close = historical_data.iloc[i + 1]['close']
            current_state = self.discretize_state(current_close)
            next_state = self.discretize_state(next_close)

            action = self.choose_action(current_state)
            reward = self.take_action(current_close, next_close)
            self.update_q_table(current_state, action, reward, next_state)
            self.log_action('train', i, current_state, action, reward, next_state)

    def evaluate(self, test_data):
        total_profit = 0
        for i in range(len(test_data) - 1):
            current_close = test_data.iloc[i]['close']
            next_close = test_data.iloc[i + 1]['close']
            current_state = self.discretize_state(current_close)
            next_state = self.discretize_state(next_close)

            action = self.choose_action(current_state)
            reward = self.take_action(current_close, next_close)
            total_profit += reward

            self.log_action('test', i, current_state, action, reward, next_state)

        print(f"Total Profit: {total_profit}")
        
    def log_prediction(self, current_state, action):
        """
        Logs details of predictions.
        
        :param current_state: The current discretized state.
        :param action: The action chosen based on the current state.
        """
        self.predict_log.append({
            'current_state': current_state,
            'action': action
        })

    def predict_action(self, current_data_df):
        """
        Predicts an action for a single row of incoming data, where the data is provided as a DataFrame.
        
        :param current_data_df: A DataFrame containing a single row of incoming data.
        :return: The action chosen based on the current state.
        """
        # Ensure the DataFrame contains exactly one row
        if len(current_data_df) != 1:
            raise ValueError("DataFrame must contain exactly one row of data.")

        # Extract the 'close' price from the DataFrame
        current_close = current_data_df['close'].iloc[0]

        # Discretize the current state based on the 'close' price
        current_state = self.discretize_state(current_close)

        # Choose an action based on the current state using the policy defined by the Q-table
        action = self.choose_action(current_state)
        
        # Log the prediction
        self.log_prediction(current_state, action)
        
        return action
    
    def calculate_reward(self, current_close, next_close, action):
        """
        Calculates the reward for an action based on price movement.

        :param current_close: The closing price at the current state.
        :param next_close: The closing price at the next state.
        :param action: The action taken.
        :return: The calculated reward.
        """
        # Example reward calculation
        if action == 0:  # Assuming 0 is 'buy'
            reward = next_close - current_close  # Profit if the price goes up
        elif action == 1:  # Assuming 1 is 'sell'
            reward = current_close - next_close  # Profit if the price goes down
        else:  # Assuming 2 is 'hold' or any other non-trading action
            reward = 0  # No profit or loss
        return reward

    def online_update(self, current_data_df, next_data_df):
        """
        Updates the Q-table based on a new single-row observation.

        :param current_data_df: DataFrame with the current observation.
        :param next_data_df: DataFrame with the next observation for determining the outcome.
        """
        if len(current_data_df) != 1 or len(next_data_df) != 1:
            raise ValueError("Each DataFrame must contain exactly one row of data.")

        current_close = current_data_df['close'].iloc[0]
        next_close = next_data_df['close'].iloc[0]

        # Retrieve the last action predicted, assuming it's stored or logged
        if not self.predict_log:
            raise ValueError("No prior action to update from; ensure a prediction was made.")
        last_prediction = self.predict_log[-1]
        current_state = last_prediction['current_state']
        action = last_prediction['action']

        # Calculate the reward based on the action taken and the observed price movement
        reward = self.calculate_reward(current_close, next_close, action)

        next_state = self.discretize_state(next_close)

        # Update the Q-table
        self.update_q_table(current_state, action, reward, next_state)

    def get_predict_log(self):
        """Returns the log of all predictions made."""
        return pd.DataFrame(self.predict_log)

    def get_train_log(self):
        """Returns the log of all training actions taken."""
        return pd.DataFrame(self.train_log)

    def get_test_log(self):
        """Returns the log of all testing actions taken."""
        return pd.DataFrame(self.test_log)


In [32]:
import oandapyV20
import oandapyV20.endpoints.instruments as instruments
from dotenv import load_dotenv
import os
load_dotenv()

True

In [33]:
client = oandapyV20.API(access_token=os.getenv("OANDA_ACCESS_TOKEN"))

params = {'granularity': 'M10', 'count': 500}
r = instruments.InstrumentsCandles(instrument="USD_JPY",
                                   params=params)
client.request(r)

data = [{'time': d['time'], 
         'h': d['mid']['h'], 
         'c': d['mid']['c'], 
         'l': d['mid']['l'], 
         'o': d['mid']['o'], 
         'volume': d['volume']} 
        for d in r.response['candles']]

# Create a DataFrame from the list of dictionaries
df = pd.DataFrame(data)
df = (df
      .set_axis(['time', 'high', 'close', 'low', 'open', 'volume'], axis=1)
      .assign(time= lambda x: pd.to_datetime(x['time']))
)
df[["high","low","close","open","volume"]] = df[["high","low","close","open","volume"]].apply(pd.to_numeric)

In [34]:
train_test_split_index = int(0.8  * df.shape[0])
train = df.iloc[:train_test_split_index]
test = df.iloc[train_test_split_index:]

In [35]:
df.tail()

Unnamed: 0,time,high,close,low,open,volume
495,2024-03-29 20:10:00+00:00,151.359,151.359,151.346,151.346,20
496,2024-03-29 20:20:00+00:00,151.368,151.359,151.359,151.36,20
497,2024-03-29 20:30:00+00:00,151.373,151.362,151.351,151.358,129
498,2024-03-29 20:40:00+00:00,151.384,151.373,151.347,151.362,246
499,2024-03-29 20:50:00+00:00,151.386,151.342,151.333,151.369,198


In [37]:
trader = QLearningTrader(num_actions=3, learning_rate=0.1, discount_factor=0.95, exploration_prob=0.1, state_bins=np.linspace(-0.01, 0.01, num=10))
trader.train(train)
trader.evaluate(train)
trader.evaluate(test)

Total Profit: 9
Total Profit: 7


In [42]:
trader.get_test_log()

Unnamed: 0,time_step,current_state,action,reward,next_state
0,0,10,1,1,10
1,1,10,1,1,10
2,2,10,1,1,10
3,3,10,1,-1,10
4,4,10,1,-1,10
...,...,...,...,...,...
493,94,10,1,1,10
494,95,10,1,-1,10
495,96,10,1,1,10
496,97,10,1,1,10


In [20]:
test["action"] = trader.get_test_log()["action"].tolist() + [None]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["action"] = trader.get_test_log()["action"].tolist() + [None]


In [38]:
train["action"] = trader.get_train_log()["action"].tolist() + [None]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["action"] = trader.get_train_log()["action"].tolist() + [None]


In [43]:
train["market_return"] = train["close"].pct_change()
train["strategy_return"] = train["market_return"] * train["action"]
train["strategy_return"].cumsum()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["market_return"] = train["close"].pct_change()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train["strategy_return"] = train["market_return"] * train["action"]


0           NaN
1      0.000000
2      0.000000
3      0.000000
4      0.000000
         ...   
395    0.001420
396    0.001486
397    0.001486
398    0.001513
399         NaN
Name: strategy_return, Length: 400, dtype: float64

In [40]:
test["market_return"] = test["close"].pct_change()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["market_return"] = test["close"].pct_change()


In [22]:
test["strategy_return"] = test["market_return"] * test["action"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test["strategy_return"] = test["market_return"] * test["action"]


In [23]:
test["strategy_return"].cumsum() * 1000000

400           NaN
401    -79.311576
402   -396.570459
403   -185.030980
404   -158.591342
          ...    
495    292.185585
496    292.185585
497    331.826439
498    477.173356
499           NaN
Name: strategy_return, Length: 100, dtype: float64

In [24]:
import pandas as pd

# Simulating incoming data as two consecutive data points
current_data_df = pd.DataFrame({'close': [151.384]}, index=[0])  # Current market condition
next_data_df = pd.DataFrame({'close': [151.263]}, index=[0])  # Next market condition (for outcome evaluation)


In [25]:
predicted_action = trader.predict_action(current_data_df)
action_dict = {0: 'buy', 1: 'sell', 2: 'hold'}
print(f"Predicted action based on the current data: {action_dict[predicted_action]}")

Predicted action based on the current data: hold


In [26]:
trader.online_update(current_data_df, next_data_df)

In [27]:
trader.calculate_reward(151.384, 151.263, predicted_action)

0