In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from collections import deque
import random

# Re-define the StockTradingEnv class (as it was in the previous successful execution)
class StockTradingEnv:
    """Single-asset trading environment driven by a DataFrame of prices.

    The observation is ``[cash, shares, close_price]`` (float32) and the
    actions are ``0`` (hold), ``1`` (buy) and ``2`` (sell). Prices are read
    from the ``'Close'`` column of ``df``, one row per step.
    """

    def __init__(self, df, initial_cash, buy_min_percent, buy_max_percent,
                 sell_min_percent, sell_max_percent, transaction_fee_percent,
                 transaction_session_limit, transaction_penalty,
                 total_assets_loss_threshold, cash_loss_threshold,
                 win_condition_total_assets):
        """Store the configuration and put the environment in its start state.

        Args:
            df: DataFrame with a ``'Close'`` column; one row per time step.
            initial_cash: Cash balance at the start of every episode.
            buy_min_percent: Minimum fraction of cash a buy must spend.
            buy_max_percent: Maximum fraction of cash a buy may spend.
            sell_min_percent: Minimum fraction of held shares a sell must cover.
            sell_max_percent: Maximum fraction of held shares a sell may cover.
            transaction_fee_percent: Fee charged on every trade, as a fraction
                of the traded value.
            transaction_session_limit: Number of consecutive holds after which
                ``transaction_penalty`` is deducted from cash.
            transaction_penalty: Cash amount deducted when the hold limit is hit.
            total_assets_loss_threshold: Episode is lost when total assets fall
                below this value.
            cash_loss_threshold: Episode is lost when cash falls below this value.
            win_condition_total_assets: Episode is won when total assets reach
                this value.
        """
        self.df = df
        self.initial_cash = initial_cash
        self.cash = initial_cash
        self.shares = 0
        self.buy_min_percent = buy_min_percent
        self.buy_max_percent = buy_max_percent
        self.sell_min_percent = sell_min_percent
        self.sell_max_percent = sell_max_percent
        self.transaction_fee_percent = transaction_fee_percent
        self.transaction_session_limit = transaction_session_limit
        self.transaction_penalty = transaction_penalty
        self.total_assets_loss_threshold = total_assets_loss_threshold
        self.cash_loss_threshold = cash_loss_threshold
        self.win_condition_total_assets = win_condition_total_assets
        self.current_step = 0
        self.no_transaction_count = 0
        self.done = False
        self.reward = 0

    def reset(self):
        """Restore the start-of-episode state and return the first observation."""
        self.cash = self.initial_cash
        self.shares = 0
        self.current_step = 0
        self.no_transaction_count = 0
        self.done = False
        self.reward = 0
        return self._get_state()

    def _current_price(self):
        """Return the 'Close' price for the current step.

        Once the step index has run past the data, the last row's price is
        used so that state and valuation stay defined after the episode ends.
        """
        row_index = min(self.current_step, len(self.df) - 1)
        return self.df.iloc[row_index]['Close']

    def _get_state(self):
        """Return the observation ``[cash, shares, price]`` as a float32 array."""
        # The features are left unscaled; normalising them would be a
        # sensible follow-up for neural-network training.
        return np.array([self.cash, self.shares, self._current_price()],
                        dtype=np.float32)

    def _calculate_total_assets(self):
        """Return cash plus the value of held shares at the current price."""
        return self.cash + (self.shares * self._current_price())

    def _buy(self, current_price):
        """Try to buy as many whole shares as the buy limits allow.

        Sets ``self.reward`` to 0.1 on success and -0.01 when nothing could
        be bought.
        """
        buy_amount_min = self.cash * self.buy_min_percent
        buy_value = min(self.cash * self.buy_max_percent, self.cash)

        if buy_value < buy_amount_min or self.cash <= 0:
            self.reward = -0.01  # Budget below the minimum, or no cash
            return

        # Size the order against the fee-inclusive price so that spending
        # the whole budget never fails merely because the fee was left out.
        unit_cost = current_price * (1 + self.transaction_fee_percent)
        shares_to_buy = int(buy_value / unit_cost)
        cost = shares_to_buy * current_price
        transaction_fee = cost * self.transaction_fee_percent

        # The cash check stays as a guard against floating-point rounding.
        if shares_to_buy > 0 and self.cash >= (cost + transaction_fee):
            self.cash -= (cost + transaction_fee)
            self.shares += shares_to_buy
            self.reward = 0.1  # Small positive reward for buying
        else:
            self.reward = -0.01  # Cannot afford even a single share

    def _sell(self, current_price):
        """Try to sell as many whole shares as the sell limits allow.

        Sets ``self.reward`` to 0.1 on success and -0.01 when nothing could
        be sold.
        """
        sell_amount_min_shares = self.shares * self.sell_min_percent
        sell_amount_max_shares = self.shares * self.sell_max_percent
        shares_to_sell = min(int(sell_amount_max_shares), self.shares)

        # Require at least one share, mirroring the buy side, so that an
        # order that rounds down to zero is not rewarded as a trade.
        if shares_to_sell > 0 and shares_to_sell >= sell_amount_min_shares:
            revenue = shares_to_sell * current_price
            transaction_fee = revenue * self.transaction_fee_percent
            self.cash += (revenue - transaction_fee)
            self.shares -= shares_to_sell
            self.reward = 0.1  # Small positive reward for selling
        else:
            self.reward = -0.01  # Below the minimum, or nothing to sell

    def step(self, action):
        """Apply ``action`` and advance one time step.

        Args:
            action: ``0`` to hold, ``1`` to buy, ``2`` to sell.

        Returns:
            Tuple ``(next_state, reward, done, info)``. ``info`` is an empty
            dict except when the current price is not positive.
        """
        if self.done or self.current_step >= len(self.df):
            self.done = True
            return self._get_state(), self.reward, self.done, {}

        current_price = self._current_price()

        # A non-positive price would break order sizing; end the episode.
        if current_price <= 0:
            self.reward = -5
            self.done = True
            return self._get_state(), self.reward, self.done, {"message": "Invalid price data"}

        if action == 0:  # Hold
            self.no_transaction_count += 1
            if self.no_transaction_count >= self.transaction_session_limit:
                self.cash -= self.transaction_penalty
                self.no_transaction_count = 0  # Reset count after applying penalty
            self.reward = -0.001  # Small penalty for holding to encourage action
        else:
            self.no_transaction_count = 0
            if action == 1:
                self._buy(current_price)
            elif action == 2:
                self._sell(current_price)

        self.current_step += 1

        if self.current_step >= len(self.df):
            # Out of data: add a bonus/penalty proportional to the relative
            # profit or loss over the whole episode.
            self.done = True
            final_total_assets = self._calculate_total_assets()
            self.reward += (final_total_assets - self.initial_cash) / self.initial_cash * 10
        else:
            # Win/loss conditions are only checked while data remains.
            total_assets = self._calculate_total_assets()
            if total_assets >= self.win_condition_total_assets:
                self.reward = 100  # Large positive reward for winning
                self.done = True
            elif (total_assets < self.total_assets_loss_threshold
                  or self.cash < self.cash_loss_threshold):
                self.reward = -100  # Large negative reward for losing
                self.done = True

        return self._get_state(), self.reward, self.done, {}

# --- DQN Agent Implementation ---

class DQNAgent:
    """Deep Q-Network agent with a replay buffer and a target network."""

    def __init__(self, state_size, action_size, learning_rate=0.001, discount_factor=0.95,
                 epsilon_start=1.0, epsilon_end=0.01, epsilon_decay_steps=5000,
                 replay_buffer_size=10000, batch_size=32):
        """Build the online and target networks and initialise exploration.

        Args:
            state_size: Number of features in a state vector.
            action_size: Number of discrete actions.
            learning_rate: Adam learning rate for the online network.
            discount_factor: Gamma used in the Bellman target.
            epsilon_start: Initial exploration rate.
            epsilon_end: Lower bound of the exploration rate.
            epsilon_decay_steps: Number of learning steps over which epsilon
                decreases linearly from ``epsilon_start`` to ``epsilon_end``.
            replay_buffer_size: Maximum number of stored transitions.
            batch_size: Number of transitions sampled per learning step.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor  # Gamma
        self.epsilon = epsilon_start            # Exploration rate
        self.epsilon_min = epsilon_end
        self.epsilon_decay_rate = (epsilon_start - epsilon_end) / epsilon_decay_steps

        self.replay_buffer = deque(maxlen=replay_buffer_size)
        self.batch_size = batch_size

        self.model = self._build_model()
        self.target_model = self._build_model()
        self.update_target_model()  # Start both networks from the same weights

    def _build_model(self):
        """Return a compiled feed-forward network mapping states to Q-values."""
        # An explicit Input layer replaces `input_shape=` on the first Dense
        # layer, which Keras warns about inside Sequential models.
        model = tf.keras.models.Sequential([
            tf.keras.Input(shape=(self.state_size,)),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(64, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear')
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
                      loss='mse')
        return model

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

    def remember(self, state, action, reward, next_state, done):
        """Append one transition to the replay buffer."""
        self.replay_buffer.append((state, action, reward, next_state, done))

    def choose_action(self, state):
        """Return an action index using an epsilon-greedy policy.

        With probability ``epsilon`` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # Call the model directly: for a single small input this avoids the
        # per-call overhead of `predict`. A batch dimension is added first.
        q_values = np.asarray(
            self.model(np.expand_dims(state, axis=0), training=False))[0]
        # Plain int so both branches return the same type.
        return int(np.argmax(q_values))

    def _decay_epsilon(self):
        """Lower epsilon by one linear step without dropping below the minimum."""
        if self.epsilon > self.epsilon_min:
            self.epsilon = max(self.epsilon_min,
                               self.epsilon - self.epsilon_decay_rate)

    def learn(self):
        """Run one training step on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least ``batch_size``
        transitions. Epsilon is decayed after every training step.
        """
        if len(self.replay_buffer) < self.batch_size:
            return  # Not enough samples to learn

        minibatch = random.sample(self.replay_buffer, self.batch_size)
        states, actions, rewards, next_states, dones = (
            np.array(column) for column in zip(*minibatch))

        # np.array makes a writable copy of the online network's Q-values;
        # only the entries for the actions actually taken are overwritten.
        target_q_values = np.array(self.model(states, training=False))
        next_q_values_target = np.asarray(
            self.target_model(next_states, training=False))

        # Bellman target: r for terminal transitions,
        # r + gamma * max_a' Q_target(s', a') otherwise.
        bootstrapped = rewards + self.discount_factor * next_q_values_target.max(axis=1)
        targets = np.where(dones, rewards, bootstrapped)
        target_q_values[np.arange(self.batch_size), actions] = targets

        # NOTE(review): `fit` is called without `batch_size`, so Keras'
        # default mini-batch size applies within this call — confirm that is
        # intended when the agent's batch_size differs from it.
        self.model.fit(states, target_q_values, epochs=1, verbose=0)

        self._decay_epsilon()

# --- Main Training Loop ---

# Price history used to drive the environment (one row per step)
stock_df = pd.read_csv('stock_2y.csv')

# --- Environment configuration ---
initial_cash = 100000
buy_min_percent = 0.05
buy_max_percent = 1.00
sell_min_percent = 0.05
sell_max_percent = 1.00
transaction_fee_percent = 0.001
transaction_session_limit = 5
transaction_penalty = 100
total_assets_loss_threshold = 10000
cash_loss_threshold = -5000
win_condition_total_assets = 1000000

env = StockTradingEnv(
    df=stock_df,
    initial_cash=initial_cash,
    buy_min_percent=buy_min_percent,
    buy_max_percent=buy_max_percent,
    sell_min_percent=sell_min_percent,
    sell_max_percent=sell_max_percent,
    transaction_fee_percent=transaction_fee_percent,
    transaction_session_limit=transaction_session_limit,
    transaction_penalty=transaction_penalty,
    total_assets_loss_threshold=total_assets_loss_threshold,
    cash_loss_threshold=cash_loss_threshold,
    win_condition_total_assets=win_condition_total_assets,
)

# The state vector length is taken from the environment itself
state_size = len(env._get_state())
action_size = 3  # Hold, Buy, Sell

agent = DQNAgent(
    state_size=state_size,
    action_size=action_size,
    learning_rate=0.0005,
    discount_factor=0.99,
    epsilon_start=1.0,
    epsilon_end=0.01,
    epsilon_decay_steps=50000,
    replay_buffer_size=50000,
    batch_size=64,
)

num_episodes = 200
target_model_update_freq = 10  # Episodes between target-network syncs

print("--- Starting DQN Training ---")

for episode in range(num_episodes):
    # Fresh episode: reset the environment and the per-episode counters
    state = env.reset()
    episode_reward = 0
    step_count = 0
    done = False

    while not done:
        # Act, store the transition, and train on a replayed minibatch
        action = agent.choose_action(state)
        next_state, reward, done, _ = env.step(action)
        agent.remember(state, action, reward, next_state, done)
        agent.learn()

        episode_reward += reward
        step_count += 1
        state = next_state

    # Sync the target network on every target_model_update_freq-th episode
    if not episode % target_model_update_freq:
        agent.update_target_model()

    final_total_assets = env._calculate_total_assets()
    print(f"Episode {episode + 1}/{num_episodes}: "
          f"Final Assets = {final_total_assets:,.2f} USD, "
          f"Reward = {episode_reward:,.2f}, "
          f"Epsilon = {agent.epsilon:.4f}, "
          f"Steps = {step_count}")

    # Optional early stop: save the model and end training once the goal is met
    # if final_total_assets >= win_condition_total_assets:
    #     print(f"Goal achieved in episode {episode + 1}! Saving model...")
    #     agent.model.save(f"stock_trader_dqn_win_ep{episode + 1}.h5")
    #     break

print("\n--- DQN Training Complete ---")

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


--- Starting DQN Training ---
Episode 1/200: Final Assets = 135,567.06 USD, Reward = 11.31, Epsilon = 0.9913, Steps = 502
Episode 2/200: Final Assets = 88,899.71 USD, Reward = 6.95, Epsilon = 0.9814, Steps = 502
Episode 3/200: Final Assets = 119,958.06 USD, Reward = 7.66, Epsilon = 0.9714, Steps = 502
Episode 4/200: Final Assets = 87,818.38 USD, Reward = 8.20, Epsilon = 0.9615, Steps = 502
Episode 5/200: Final Assets = 109,508.02 USD, Reward = 10.07, Epsilon = 0.9515, Steps = 502
Episode 6/200: Final Assets = 102,982.24 USD, Reward = 6.29, Epsilon = 0.9416, Steps = 502
Episode 7/200: Final Assets = 104,707.49 USD, Reward = 8.29, Epsilon = 0.9317, Steps = 502
Episode 8/200: Final Assets = 95,775.45 USD, Reward = 6.72, Epsilon = 0.9217, Steps = 502
Episode 9/200: Final Assets = 77,074.96 USD, Reward = 6.84, Epsilon = 0.9118, Steps = 502
Episode 10/200: Final Assets = 149,617.68 USD, Reward = 11.34, Epsilon = 0.9019, Steps = 502
Episode 11/200: Final Assets = 113,755.16 USD, Reward = 9.24

KeyboardInterrupt: 