<a href="https://colab.research.google.com/github/tejasgadgil/RNN-for-KI/blob/main/DQL_Attempt1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install yfinance pandas numpy matplotlib gym stable-baselines3 tensorflow torch


Collecting stable-baselines3
  Downloading stable_baselines3-2.4.0-py3-none-any.whl.metadata (4.5 kB)
Collecting gymnasium<1.1.0,>=0.29.1 (from stable-baselines3)
  Downloading gymnasium-1.0.0-py3-none-any.whl.metadata (9.5 kB)
Collecting farama-notifications>=0.0.1 (from gymnasium<1.1.0,>=0.29.1->stable-baselines3)
  Downloading Farama_Notifications-0.0.4-py3-none-any.whl.metadata (558 bytes)
Downloading stable_baselines3-2.4.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gymnasium-1.0.0-py3-none-any.whl (958 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m958.1/958.1 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Farama_Notifications-0.0.4-py3-none-any.whl (2.5 kB)
Installing collected packages: farama-notifications, gymnasium, stable-baselines3
Successfully installed farama-notifications-0.0.4 gymnasium-1.0.0 stable-baselines3-2.4

In [43]:
import yfinance as yf
import pandas as pd

# Fetch historical stock data
def fetch_stock_data(ticker, start_date, end_date):
    data = yf.download(ticker, start=start_date, end=end_date)
    data.reset_index(inplace=True)  # Resets index and keeps 'Date' as a column
    return data

def normalize_data(data):
    # Normalize all columns except 'Date'
    numeric_columns = data.select_dtypes(include=['float64', 'int64']).columns
    data[numeric_columns] = (data[numeric_columns] - data[numeric_columns].mean()) / data[numeric_columns].std()
    return data

# Calculate Exponential Moving Average (EMA)
def calculate_ema(data, window=10):
    # Calculate Exponential Moving Average for 'Close' column
    data['EMA'] = data['Close'].ewm(span=window, adjust=False).mean()
    return data

# Example: Fetch data for AAPL
stock_data = fetch_stock_data('AAPL', '2020-01-01', '2024-12-06')
stock_data = calculate_ema(stock_data, window=10)  # 10-day EMA
stock_data = normalize_data(stock_data)

print(stock_data.head())


[*********************100%***********************]  1 of 1 completed

Price                       Date Adj Close     Close      High       Low  \
Ticker                                AAPL      AAPL      AAPL      AAPL   
0      2020-01-02 00:00:00+00:00 -1.917658 -1.932197 -1.962489 -1.931091   
1      2020-01-03 00:00:00+00:00 -1.935014 -1.950352 -1.962613 -1.922906   
2      2020-01-06 00:00:00+00:00 -1.920927 -1.935617 -1.966454 -1.946337   
3      2020-01-07 00:00:00+00:00 -1.929308 -1.944383 -1.960630 -1.916782   
4      2020-01-08 00:00:00+00:00 -1.900778 -1.914540 -1.938695 -1.918782   

Price       Open    Volume       EMA  
Ticker      AAPL      AAPL            
0      -1.955601  0.831701 -1.930449  
1      -1.949939  1.035119 -1.933772  
2      -1.970845  0.511010 -1.933794  
3      -1.933202  0.332492 -1.935416  
4      -1.949877  0.767890 -1.931281  





In [56]:
# Creating custom gym environment

import gym
import numpy as np
from gym import spaces

class StockTradingEnv(gym.Env):
    def __init__(self, data, render_mode='human'):
        super(StockTradingEnv, self).__init__()
        self.data = data
        self.current_step = 0
        self.max_steps = len(data) - 1
        self.holding_penalty = 1  # Example value
        self.max_hold_duration = 20  # Example value, in steps
        self.holding_duration = 0  # Initialize holding duration
        self.profit_reward = 10  # Example value
        self.loss_penalty = 2  # Example value
        self.buy_reward = 1  # Example value



        # Action space: Buy, Sell, Hold
        self.action_space = spaces.Discrete(3)

        num_features = len(data.columns) - 1  # Exclude 'Date'
        self.observation_space = spaces.Box(
            low=-np.inf,
            high=np.inf,
            shape=(num_features + 3,),  # Add 2 for balance and shares, EMA
            dtype=np.float32
        )



        # Initial portfolio
        self.balance = 10_00_000  # Starting cash
        self.shares = 0       # Number of shares owned
        self.total_value = self.balance

        self.position = None  # No position initially
        self.buy_price = 0  # Track buy price


        # Set render mode
        self.render_mode = render_mode

    def reset(self):
        self.current_step = 0
        self.balance = 10_00_000
        self.shares = 0
        self.total_value = self.balance
        self.history = []  # Initialize log history
        self.position = None  # Reset position
        self.holding_duration = 0  # Reset holding duration
        return self._next_observation()


    def _next_observation(self):
        # Ensure that we're correctly handling the data and balance/shares/EMA
        stock_data = self.data.iloc[self.current_step].drop(['Date']).values.astype(np.float32)  # Stock data (1D array)

        # Balance, shares, and EMA should be reshaped to 1D arrays
        balance = np.array([self.balance], dtype=np.float32).reshape(-1)  # Balance (1D array)
        shares = np.array([self.shares], dtype=np.float32).reshape(-1)    # Shares (1D array)
        ema = np.array([self.data.iloc[self.current_step]['EMA']], dtype=np.float32).reshape(-1)  # EMA (1D array)

        # Ensure all arrays have the same shape before hstack
        obs = np.hstack([stock_data, balance, shares, ema])
        return obs






    def render(self):
        if self.render_mode == 'human':
            # Display the environment to the human (e.g., graphical window)
            pass
        elif self.render_mode == 'rgb_array':
            # Return an array of the environment's current state
            pass
        else:
            raise ValueError(f"Unsupported render mode: {self.render_mode}")



    def step(self, action):
        reward = 0
        done = False
        info = {}

        # Get current price
        self.current_price = float(self.data.iloc[self.current_step]['Close'])


        # Action: Buy
        if action == 0 and self.position != 'Bought' and float(self.balance) >= float(self.current_price):
            self.shares += int(self.balance // self.current_price)
            self.balance -= self.shares * self.current_price
            self.position = 'Bought'
            self.buy_price = self.current_price  # Record the buy price
            self.holding_duration = 0  # Reset holding duration
            reward += self.buy_reward  # Reward for buying

        # Action: Sell
        elif action == 1 and self.shares > 0 :
            profit_or_loss = (float(self.current_price) - float(self.buy_price)) * self.shares
            self.balance += self.shares * self.current_price
            self.shares = 0
            self.position = "None"  # Reset position after selling
            self.holding_duration = 0  # Reset holding duration
            # profit_or_loss = profit_or_loss.item() if isinstance(profit_or_loss, pd.Series) else profit_or_loss

            if profit_or_loss > 0:  # Loss
                reward += self.profit_reward * profit_or_loss
            else:  # Profit
                reward -= self.loss_penalty * abs(profit_or_loss)

            # self.buy_price = 0  # Clear buy price

        # Action: Hold
        elif action == 2 :
            self.holding_duration += 1
            reward -= 0.01  # Apply a small penalty for holding
            if self.holding_duration > self.max_hold_duration:
                reward -= self.holding_penalty  # Apply holding penalty

        # Check stopping conditions
        if self.balance <= 0:  # No money left
            done = True
            reward -= 30  # Large penalty for running out of money
        elif self.balance >= 1.3 * 10_00_000:  # Balance has doubled
            done = True
            reward += 60  # Large reward for achieving the goal

        # Move to the next state
        self.current_step += 1
        if self.current_step >= self.max_steps:
            done = True

        # Define the next state
        next_state = self._next_observation()  # Update or calculate the next state based on the action

        # Log additional info if needed
        info = {
            'balance': self.balance,
            'shares': self.shares,
            'total_value': self.balance + (self.shares * self.current_price)
        }

        return next_state, reward, done, info






In [15]:
!pip install 'shimmy>=2.0'



In [57]:
import numpy as np
from stable_baselines3 import DQN
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback
import itertools

# # Define the EarlyStoppingCallback class
# class EarlyStoppingCallback(BaseCallback):
#     def __init__(self, patience=10, min_improvement=0.01, verbose=0):
#         super().__init__(verbose)
#         self.patience = patience
#         self.min_improvement = min_improvement
#         self.best_reward = -np.inf
#         self.patience_counter = 0

#     def _on_step(self) -> bool:
#         # Get the current episode reward from the environment
#         rewards = self.locals['rewards']
#         if len(rewards) > 0:
#             avg_reward = np.mean(rewards)
#             print(f"Average reward: {avg_reward}")

#             # Check if the reward has improved
#             if avg_reward > self.best_reward + self.min_improvement:
#                 self.best_reward = avg_reward
#                 self.patience_counter = 0
#             else:
#                 self.patience_counter += 1

#             # If the reward hasn't improved for `patience` episodes, stop training
#             if self.patience_counter >= self.patience:
#                 print("Early stopping triggered.")
#                 return False  # Stop training
#         return True  # Continue training

# # Hyperparameter space
# params = {
#     'learning_rate': [0.0001, 0.001, 0.01],
#     'gamma': [0.9, 0.95, 0.99],
#     'batch_size': [32, 64, 128],
#     'exploration_fraction': [0.8, 0.9],
#     'exploration_final_eps': [0.2, 0.05, 0.01],
#     'target_update_interval': [1000, 5000, 10000]
# }

# # Wrap your custom environment
# env = StockTradingEnv(stock_data, render_mode='human')
# env = make_vec_env(lambda: env, n_envs=1)

# # Function to train and evaluate the model with early stopping
# def train_and_evaluate(learning_rate, gamma, batch_size, exploration_fraction, exploration_final_eps, target_update_interval):
#     print(f"Training with lr={learning_rate}, gamma={gamma}, batch_size={batch_size}, exploration_fraction={exploration_fraction}, exploration_final_eps={exploration_final_eps}, target_update_interval={target_update_interval}")

#     model = DQN(
#         "MlpPolicy",
#         env,
#         learning_rate=learning_rate,
#         gamma=gamma,
#         batch_size=batch_size,
#         exploration_fraction=exploration_fraction,
#         exploration_final_eps=exploration_final_eps,
#         exploration_initial_eps=1.0,  # Start with full exploration
#         target_update_interval=target_update_interval,
#         verbose=1,
#     )

#     # Create the early stopping callback
#     early_stopping_callback = EarlyStoppingCallback(patience=10, min_improvement=0.01, verbose=1)

#     # Train the model with the callback
#     model.learn(total_timesteps=1_00_000, callback=early_stopping_callback)

#     model.save(f"stock_dqn_model_lr_{learning_rate}_gamma_{gamma}_batch_{batch_size}_exploration_{exploration_fraction}_final_eps_{exploration_final_eps}_target_{target_update_interval}")
#     print("Model saved")

# # Grid search for hyperparameter combinations
# param_combinations = list(itertools.product(
#     params['learning_rate'],
#     params['gamma'],
#     params['batch_size'],
#     params['exploration_fraction'],
#     params['exploration_final_eps'],
#     params['target_update_interval']
# ))

# # Train and evaluate for each combination
# for combination in param_combinations:
#     lr, gamma, batch_size, exploration_fraction, exploration_final_eps, target_update_interval = combination
#     train_and_evaluate(lr, gamma, batch_size, exploration_fraction, exploration_final_eps, target_update_interval)


# EarlyStoppingCallback for DQN
class EarlyStoppingCallback(BaseCallback):
    def __init__(self, patience=10, min_improvement=0.01, verbose=0):
        super().__init__(verbose)
        self.patience = patience
        self.min_improvement = min_improvement
        self.best_reward = -np.inf
        self.patience_counter = 0

    def _on_step(self) -> bool:
        rewards = self.locals['rewards']
        if len(rewards) > 0:
            avg_reward = np.mean(rewards)
            print(f"Average reward: {avg_reward}")

            if avg_reward > self.best_reward + self.min_improvement:
                self.best_reward = avg_reward
                self.patience_counter = 0
            else:
                self.patience_counter += 1

            if self.patience_counter >= self.patience:
                print("Early stopping triggered.")
                return False  # Stop training
        return True  # Continue training

# Hyperparameter space for grid search
params = {
    'learning_rate': [0.0001, 0.001, 0.01],
    'gamma': [0.9, 0.95, 0.99],
    'batch_size': [32, 64, 128],
    'exploration_fraction': [0.8, 0.9],
    'exploration_final_eps': [0.2, 0.05, 0.01],
    'target_update_interval': [1000, 5000, 10000]
}

# Wrap your custom environment
env = StockTradingEnv(stock_data, render_mode='human')
env = make_vec_env(lambda: env, n_envs=1)

# Grid search and training for different hyperparameters
param_combinations = list(itertools.product(
    params['learning_rate'],
    params['gamma'],
    params['batch_size'],
    params['exploration_fraction'],
    params['exploration_final_eps'],
    params['target_update_interval']
))

for combination in param_combinations:
    lr, gamma, batch_size, exploration_fraction, exploration_final_eps, target_update_interval = combination
    print(f"Training with lr={lr}, gamma={gamma}, batch_size={batch_size}, exploration_fraction={exploration_fraction}, exploration_final_eps={exploration_final_eps}, target_update_interval={target_update_interval}")

    model = DQN(
        "MlpPolicy",
        env,
        learning_rate=lr,
        gamma=gamma,
        batch_size=batch_size,
        exploration_fraction=exploration_fraction,
        exploration_final_eps=exploration_final_eps,
        exploration_initial_eps=1.0,  # Start with full exploration
        target_update_interval=target_update_interval,
        verbose=1
    )

    model.learn(total_timesteps=100_000, callback=EarlyStoppingCallback(patience=5))


Training with lr=0.0001, gamma=0.9, batch_size=32, exploration_fraction=0.8, exploration_final_eps=0.2, target_update_interval=1000
Using cpu device
Average reward: -29.0
Average reward: -29.0
Average reward: -0.009999999776482582
Average reward: -0.009999999776482582
Average reward: -0.009999999776482582
Average reward: -0.009999999776482582
Average reward: -0.009999999776482582
Average reward: -0.009999999776482582
Early stopping triggered.
Training with lr=0.0001, gamma=0.9, batch_size=32, exploration_fraction=0.8, exploration_final_eps=0.2, target_update_interval=5000
Using cpu device
Average reward: 0.0
Average reward: 0.0
Average reward: -0.009999999776482582
Average reward: 0.0
Average reward: 0.0
Average reward: -0.009999999776482582
Early stopping triggered.
Training with lr=0.0001, gamma=0.9, batch_size=32, exploration_fraction=0.8, exploration_final_eps=0.2, target_update_interval=10000
Using cpu device
Average reward: -29.0
Average reward: -29.0
Average reward: 0.0
Average 

  stock_data = self.data.iloc[self.current_step].drop(['Date']).values.astype(np.float32)  # Stock data (1D array)
  self.current_price = float(self.data.iloc[self.current_step]['Close'])


Average reward: -0.009999999776482582
Early stopping triggered.
Training with lr=0.0001, gamma=0.9, batch_size=32, exploration_fraction=0.8, exploration_final_eps=0.05, target_update_interval=1000
Using cpu device
Average reward: 0.0
Average reward: 0.0
Average reward: -29.0
Average reward: -0.009999999776482582
Average reward: -29.0
Average reward: -29.0
Early stopping triggered.
Training with lr=0.0001, gamma=0.9, batch_size=32, exploration_fraction=0.8, exploration_final_eps=0.05, target_update_interval=5000
Using cpu device
Average reward: -29.0
Average reward: 0.0
Average reward: -29.0
Average reward: -0.009999999776482582
Average reward: -29.0
Average reward: -29.0
----------------------------------
| rollout/            |          |
|    ep_len_mean      | 1.5      |
|    ep_rew_mean      | -29      |
|    exploration_rate | 1        |
| time/               |          |
|    episodes         | 4        |
|    fps              | 204      |
|    time_elapsed     | 0        |
|    

In [58]:
# Reset environment
obs = env.reset()
done = False

# Simulate environment and collect actions
actions = []  # To store actions and rewards
while not done:
    # Use the VecEnv observation directly
    action, _states = model.predict(obs, deterministic=False)  # Set deterministic to False for exploration
    action = action.item()  # Convert action to a scalar if it's a numpy array or list

    # Wrap the action in a list for VecEnv compatibility
    action = [action]  # Wrap the action in a list

    # Take a step in the environment
    obs, reward, done, info = env.step(action)

    print(f"Action: {action}, Reward: {reward}, Done: {done}")

    actions.append({"action": action, "reward": reward})



Action: [1], Reward: [0.], Done: [False]
Action: [2], Reward: [-0.01], Done: [False]
Action: [0], Reward: [-29.], Done: [ True]


  stock_data = self.data.iloc[self.current_step].drop(['Date']).values.astype(np.float32)  # Stock data (1D array)
  self.current_price = float(self.data.iloc[self.current_step]['Close'])
