In [None]:
import numpy as np
import pandas as pd
import random

# Define a simple trading environment for stock data.
class TradingEnv:
    """Single-asset, long-only trading environment over historical closing prices.

    The observable state is the tuple ``(trend, holding)`` where ``trend`` is
    'up', 'down' or 'stable' (the previous-day-to-current-day return compared
    against ``threshold``) and ``holding`` is 0 or 1.  The reward is the
    realised profit (sell price minus buy price) on the step a position is
    sold, and 0 on every other step.
    """

    def __init__(self, csv_path, threshold=0.001):
        """
        Args:
          csv_path: Path to CSV file with historical stock data (must contain
            'Date' and 'Close' columns; rows are sorted by 'Date').
          threshold: Minimum absolute daily return to classify trend as 'up' or 'down'

        Raises:
          ValueError: If the file holds fewer than two price rows, since a
            trend needs a previous day to compare against.
        """
        self.df = pd.read_csv(csv_path, parse_dates=['Date'])
        self.df.sort_values('Date', inplace=True)
        self.prices = self.df['Close'].values
        self.n_days = len(self.prices)
        # Fail early with a clear message instead of an IndexError raised
        # from inside _get_trend() when reset() looks at day 1.
        if self.n_days < 2:
            raise ValueError(
                f"Need at least 2 rows of price data to compute a trend, got {self.n_days}"
            )
        self.threshold = threshold
        self.reset()

    def reset(self):
        """Rewind to the start of the data with no open position and return the initial state."""
        # Start at day 1 (so we can compute return from day 0)
        self.current_day = 1
        self.holding = 0       # 0: not holding, 1: holding
        self.buy_price = None  # Record the buy price when position is taken
        return self._get_state()

    def _get_trend(self):
        """Classify the return from the previous day to the current day as 'up', 'down' or 'stable'."""
        previous = self.prices[self.current_day - 1]
        ret = (self.prices[self.current_day] - previous) / previous
        if ret > self.threshold:
            return 'up'
        elif ret < -self.threshold:
            return 'down'
        else:
            return 'stable'

    def _get_state(self):
        """Return the current state tuple ``(trend, holding)``."""
        # State is (trend, holding). Trend: 'up', 'down', 'stable'; holding: 0 or 1.
        trend = self._get_trend()
        return (trend, self.holding)

    def step(self, action):
        """
        Apply an action at the current day's price, then advance one day.

        Actions:
          0: Buy (if not holding)
          1: Sell (if holding)
          2: Hold
        An action that is not allowed in the current position (e.g. Sell
        while not holding) is treated as Hold.

        Returns:
          next_state, reward, done
          ``next_state`` is None once the data is exhausted (``done`` is True).
          A position still open at that point is not sold, so it yields no reward.
        """
        reward = 0
        # Action restrictions: if not holding, only buy or hold; if holding, only sell or hold.
        if self.holding == 0:
            if action == 0:  # Buy
                self.holding = 1
                self.buy_price = self.prices[self.current_day]
            # else: hold: do nothing
        else:  # holding == 1
            if action == 1:  # Sell
                sell_price = self.prices[self.current_day]
                reward = sell_price - self.buy_price  # profit (can be negative)
                self.holding = 0
                self.buy_price = None
            # else: hold, do nothing

        # Advance one day
        self.current_day += 1
        done = self.current_day >= self.n_days
        next_state = self._get_state() if not done else None
        return next_state, reward, done

# Define a Q-learning agent for the trading environment.
class QLearningAgent:
    """Tabular epsilon-greedy Q-learning agent.

    Q-values live in a dict keyed by state; each value is a NumPy array with
    one entry per action, in the order given by ``actions``.
    """

    def __init__(self, actions, alpha=0.1, gamma=0.95, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995):
        """
        Args:
          actions: list of possible actions
          alpha: learning rate
          gamma: discount factor
          epsilon: initial exploration rate
          epsilon_min: minimum exploration rate
          epsilon_decay: multiplicative decay factor per episode
        """
        self.actions = actions
        self.alpha = alpha
        self.gamma = gamma
        self.epsilon = epsilon
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        # Q-table: keys will be state tuples, values are arrays for each action.
        self.Q = {}

    def get_Q(self, state):
        """Return the Q-value array for ``state``, creating a zero row on first access."""
        if state not in self.Q:
            self.Q[state] = np.zeros(len(self.actions))
        return self.Q[state]

    def _action_indices(self, actions):
        """Map action labels to their positions in a Q-value row."""
        return [self.actions.index(a) for a in actions]

    def choose_action(self, state, valid_actions=None):
        """
        Pick an action epsilon-greedily.

        Args:
          state: current state (a key of the Q-table)
          valid_actions: optional subset of ``self.actions`` to choose from;
            when None, every action is a candidate.

        Returns:
          The chosen action.

        Raises:
          ValueError: If ``valid_actions`` is given but empty.
        """
        candidates = self.actions if valid_actions is None else list(valid_actions)
        if len(candidates) == 0:
            raise ValueError("valid_actions must contain at least one action")
        if np.random.rand() < self.epsilon:
            return np.random.choice(candidates)
        Qs = self.get_Q(state)
        if valid_actions is None:
            return self.actions[np.argmax(Qs)]
        # Greedy choice restricted to the allowed actions only.
        masked = Qs[self._action_indices(candidates)]
        return candidates[int(np.argmax(masked))]

    def learn(self, state, action, reward, next_state, done, valid_next_actions=None):
        """
        Apply one Q-learning update for the transition (state, action, reward, next_state).

        Args:
          state: state in which ``action`` was taken
          action: the action taken (must be in ``self.actions``)
          reward: reward received
          next_state: resulting state (ignored when ``done`` is True)
          done: whether the episode ended with this transition
          valid_next_actions: optional subset of ``self.actions`` that is
            allowed in ``next_state``.  When given, the bootstrap maximum is
            taken over those actions only, so Q-entries of disallowed actions
            (which are never updated and stay at 0) cannot distort the target.
            When None, the maximum is taken over all actions.

        The exploration rate is decayed once per finished episode, i.e. when
        ``done`` is True.
        """
        a_index = self.actions.index(action)
        Qs = self.get_Q(state)
        q_predict = Qs[a_index]
        if not done:
            next_Qs = self.get_Q(next_state)
            if valid_next_actions is not None:
                next_Qs = next_Qs[self._action_indices(valid_next_actions)]
            q_target = reward + self.gamma * np.max(next_Qs)
        else:
            q_target = reward
        Qs[a_index] += self.alpha * (q_target - q_predict)
        if done:
            self.epsilon = max(self.epsilon * self.epsilon_decay, self.epsilon_min)

def train_q_learning(env, agent, episodes=1000):
    """
    Run epsilon-greedy Q-learning episodes of ``agent`` against ``env``.

    Args:
      env: environment exposing ``reset()`` and ``step(action)``; states are
        indexable with the holding flag at position 1.
      agent: agent exposing ``actions``, ``epsilon``, ``get_Q(state)`` and
        ``learn(state, action, reward, next_state, done)``.
      episodes: number of episodes to run.

    Returns:
      (agent, rewards) where ``rewards`` lists the total reward of each episode.
    """
    episode_returns = []
    for episode_no in range(1, episodes + 1):
        state = env.reset()
        episode_return = 0
        finished = False
        while not finished:
            # Restrict the choice to what the current position permits:
            # holding -> [Sell, Hold] = [1, 2]; flat -> [Buy, Hold] = [0, 2].
            allowed = [1, 2] if state[1] != 0 else [0, 2]
            if np.random.rand() < agent.epsilon:
                # Explore: uniform pick among the allowed actions.
                action = random.choice(allowed)
            else:
                # Exploit: best allowed action according to the current Q-values.
                q_row = agent.get_Q(state)
                scores = [q_row[agent.actions.index(candidate)] for candidate in allowed]
                action = allowed[np.argmax(scores)]
            successor, reward, finished = env.step(action)
            agent.learn(state, action, reward, successor, finished)
            state = successor
            episode_return += reward
        episode_returns.append(episode_return)
        # Progress report every 100 episodes.
        if episode_no % 100 == 0:
            print(f"Episode {episode_no}/{episodes}: Total Reward = {episode_return:.2f}, Epsilon = {agent.epsilon:.3f}")
    return agent, episode_returns
import pickle
def main(
    csv_path='/Users/devshah/Documents/WorkSpace/University/year 3/CSC392/Trading_Simulator/data/updated_data.csv',
    model_path='q_learning_agent.pkl',
    episodes=1000,
):
    """
    Train a Q-learning agent on historical stock data and pickle the result.

    Args:
      csv_path: Path to the stock CSV passed to TradingEnv.
      model_path: File the trained agent is pickled to.
      episodes: Number of training episodes.

    The defaults reproduce the original hard-coded run; pass other values to
    train on a different data set or to write the model elsewhere.
    """
    # Initialize the trading environment with your stock CSV.
    env = TradingEnv(csv_path, threshold=0.001)
    actions = [0, 1, 2]  # 0: Buy, 1: Sell, 2: Hold
    agent = QLearningAgent(actions, alpha=0.1, gamma=0.95, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995)

    # Train the Q-learning agent.
    trained_agent, rewards = train_q_learning(env, agent, episodes=episodes)

    print("Learned Q-table:")
    for state, q_values in trained_agent.Q.items():
        print(f"State {state}: {q_values}")

    # Export the trained Q-learning model (agent).
    # NOTE: the pickle stores the agent by class reference, so loading it
    # requires QLearningAgent to be importable; only unpickle trusted files.
    with open(model_path, 'wb') as f:
        pickle.dump(trained_agent, f)
    print(f"Q-learning agent saved as '{model_path}'")

# Run the training pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()



Episode 100/1000: Total Reward = 749260.71, Epsilon = 0.606
Episode 200/1000: Total Reward = 1531085.36, Epsilon = 0.367
Episode 300/1000: Total Reward = 2250975.90, Epsilon = 0.222
Episode 400/1000: Total Reward = 2449899.80, Epsilon = 0.135
Episode 500/1000: Total Reward = 2978400.68, Epsilon = 0.082
Episode 600/1000: Total Reward = 3582881.63, Epsilon = 0.049
Episode 700/1000: Total Reward = 3651409.49, Epsilon = 0.030
Episode 800/1000: Total Reward = 3816929.28, Epsilon = 0.018
Episode 900/1000: Total Reward = 4070725.70, Epsilon = 0.011
Episode 1000/1000: Total Reward = 3715319.01, Epsilon = 0.010
Learned Q-table:
State ('down', 0): [1466.88981834    0.         1412.6953365 ]
State ('down', 1): [   0.          728.65711922 1492.64620229]
State ('up', 0): [1263.8813019     0.         1411.35175266]
State ('up', 1): [   0.         1305.6225791  1404.55902987]
State ('stable', 1): [   0.          691.36638923 1409.46904481]
State ('stable', 0): [1108.00698101    0.          868.30137

NameError: name 'trained_agent' is not defined