## 1. Setting up the Trading Environment

In [4]:
# %pip install nsepy

In [5]:
# Dimensions of the learning problem: observation width and discrete actions
n_features = 5  # how many features make up one state vector
action_space = list(range(3))  # 0: Hold, 1: Buy Straddle, 2: Sell Straddle

# Define reward function
def calculate_reward_buy_straddle(curr_data, position, portfolio_value):
    """Return the immediate reward for attempting to buy a straddle.

    Args:
        curr_data: Mapping-like market snapshot (e.g. a DataFrame row) that
            provides 'Call Option Price' and 'Put Option Price'.
        position: Current position flag; 0 means no position is open.
        portfolio_value: Capital available to pay for the straddle.

    Returns:
        0 when the straddle is affordable, -100 when funds are insufficient,
        and -10 when a position is already open (invalid action).
    """
    if position != 0:
        # Prices are irrelevant when the action itself is invalid.
        return -10

    cost = curr_data['Call Option Price'] + curr_data['Put Option Price']
    # Opening a position earns nothing by itself; only an unaffordable
    # trade is penalised.
    return 0 if portfolio_value >= cost else -100

def calculate_reward_sell_straddle(curr_data, position):
    """Return the immediate reward for attempting to sell a straddle.

    Args:
        curr_data: Mapping-like market snapshot (e.g. a DataFrame row) that
            provides 'Call Option Price' and 'Put Option Price'.
        position: Current position flag; 0 means no position is open.

    Returns:
        The premium collected (call price + put price) when no position is
        open, otherwise -10 as a penalty for an invalid action.
    """
    if position != 0:
        # Prices are irrelevant when the action itself is invalid.
        return -10

    # The seller's reward is the premium received for both legs.
    return curr_data['Call Option Price'] + curr_data['Put Option Price']

## Setting up the Trading Environment

In [None]:
import gym
from gym import spaces
import numpy as np

class StraddleEnvironment(gym.Env):
    """Gym environment skeleton for learning when to buy or sell a straddle.

    Actions: 0 = hold, 1 = buy straddle, 2 = sell straddle.

    Observations are random placeholders of length ``n_features`` (see
    ``get_observation``).  Rewards for actions 1 and 2 are produced by the
    module-level ``calculate_reward_buy_straddle`` and
    ``calculate_reward_sell_straddle`` helpers, which need market data.
    """

    def __init__(self, n_features, n_steps, initial_capital=10000, data=None):
        """Create the environment.

        Args:
            n_features: Length of the observation vector.
            n_steps: Maximum number of steps per episode.
            initial_capital: Capital available at the start of every episode.
            data: Optional pandas DataFrame with one row per time step that
                holds the columns the reward helpers read
                ('Call Option Price' and 'Put Option Price').  Without it,
                buy/sell actions cannot be priced and yield a reward of 0.
        """
        super().__init__()
        # Define action and observation spaces
        self.action_space = spaces.Discrete(3)  # 0: hold, 1: buy straddle, 2: sell straddle
        self.observation_space = spaces.Box(low=0, high=np.inf, shape=(n_features,), dtype=np.float32)

        # Market data and episode bookkeeping
        self.data = data
        self.initial_capital = initial_capital
        self.current_capital = initial_capital
        self.position = 0  # 0 means no open position
        self.current_step = 0
        self.max_steps = n_steps  # number of steps in each episode
        self.state = None

    @property
    def portfolio_value(self):
        """Capital currently available for trading."""
        return self.current_capital

    def reset(self):
        """Restore the start-of-episode state and return the first observation."""
        self.current_capital = self.initial_capital
        self.position = 0
        self.current_step = 0
        self.state = self.get_observation()
        return self.state

    def step(self, action):
        """Advance one time step.

        Args:
            action: 0 (hold), 1 (buy straddle) or 2 (sell straddle).

        Returns:
            Tuple ``(observation, reward, done, info)``.  ``reward`` is 0 for
            hold, and also for buy/sell when no market data was supplied.
        """
        reward = 0
        info = {}

        row = self._current_row()
        if row is not None:
            if action == 1:
                reward = calculate_reward_buy_straddle(row, self.position, self.portfolio_value)
            elif action == 2:
                reward = calculate_reward_sell_straddle(row, self.position)
        # TODO: trade execution (updating self.position and
        # self.current_capital) is not implemented yet.

        # Update state
        self.current_step += 1
        self.state = self.get_observation()

        # The episode ends once the step budget is used up.
        done = self.current_step >= self.max_steps

        return self.state, reward, done, info

    def _current_row(self):
        """Return the data row for the current step, or None without data."""
        if self.data is None or len(self.data) == 0:
            return None
        # Clamp so episodes longer than the data reuse the final row
        # instead of raising an IndexError.
        row_idx = min(self.current_step, len(self.data) - 1)
        return self.data.iloc[row_idx]

    def get_observation(self):
        """Return the current observation.

        Placeholder: draws uniform random values in [0, 1).  Replace with
        real features once the observation layout is decided.
        """
        observation_data = np.random.uniform(low=0, high=1, size=self.observation_space.shape)
        # Match the dtype declared by observation_space.
        return observation_data.astype(np.float32)

    def render(self, mode='human'):
        """Rendering is optional and intentionally a no-op."""
        pass

# Build the environment: 5 observation features, 100-step episodes
n_features = 5
n_steps = 100
env = StraddleEnvironment(n_features=n_features, n_steps=n_steps)

## Data Gathering and Preprocessing

In [None]:
from nsepy import get_history
from datetime import date
import pandas as pd

# Collect historical data
# SBIN is an equity, so it is fetched from the stock archive; nsepy's
# ``index=True`` flag is only meant for index symbols such as "NIFTY".
start_date = date(2023, 1, 1)
end_date = date(2023, 12, 31)
underlying_data = get_history(symbol='SBIN', start=start_date, end=end_date)

# Preprocessing (e.g., handling missing values, outliers, feature engineering)
def preprocess_data(data):
    """Forward-fill gaps and append SMA and RSI indicator columns.

    Args:
        data: DataFrame with a 'Close' column.

    Returns:
        A new DataFrame (the input is left untouched) with missing values
        forward-filled and two extra columns: 'SMA', the 20-row rolling mean
        of 'Close' (NaN until 20 rows are available), and 'RSI', the
        14-period RSI of 'Close' as computed by ``calculate_rsi``.
    """
    # ``ffill()`` replaces ``fillna(method='ffill')``, which pandas has
    # deprecated.
    data = data.ffill()
    data['SMA'] = data['Close'].rolling(window=20).mean()  # simple moving average
    data['RSI'] = calculate_rsi(data['Close'], 14)
    return data

# Function for calculating RSI
def _rsi_from_averages(avg_gain, avg_loss):
    """Convert smoothed average gain/loss into an RSI value in [0, 100]."""
    if avg_loss == 0:
        # No losses: RSI saturates at 100.  With no gains either the series
        # is flat, which is reported as the neutral value 50 rather than NaN.
        return 100.0 if avg_gain > 0 else 50.0
    return 100.0 - 100.0 / (1.0 + avg_gain / avg_loss)


def calculate_rsi(prices, window):
    """Compute the Relative Strength Index with exponential smoothing.

    Args:
        prices: 1-D sequence of prices (list, NumPy array or pandas Series).
        window: Look-back period; must be a positive integer.

    Returns:
        A float NumPy array with one RSI value per price.  The first
        ``window`` entries all carry the RSI of the seed period.

    Raises:
        ValueError: If ``window`` is smaller than 1.
    """
    if window < 1:
        raise ValueError("window must be a positive integer")

    # Work in floating point so integer price series are not truncated.
    prices = np.asarray(prices, dtype=float)
    deltas = np.diff(prices)

    # Seed the averages from the first ``window`` price changes.
    seed = deltas[:window]
    up = seed[seed >= 0].sum() / window
    down = -seed[seed < 0].sum() / window

    rsi = np.zeros_like(prices)
    rsi[:window] = _rsi_from_averages(up, down)

    for i in range(window, len(prices)):
        delta = deltas[i - 1]  # deltas is one element shorter than prices
        if delta > 0:
            upval = delta
            downval = 0
        else:
            upval = 0
            downval = -delta

        # Smooth the running averages with the newest gain/loss.
        up = (up * (window - 1) + upval) / window
        down = (down * (window - 1) + downval) / window
        rsi[i] = _rsi_from_averages(up, down)

    return rsi

# Preprocess the data: forward-fill gaps and add the SMA / RSI columns
preprocessed_data = preprocess_data(underlying_data)

## Reinforcement Learning Model Design

In [None]:
import numpy as np
from collections import deque


# Define Q-learning algorithm
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy policy.

    The Q-table has ``state_size`` rows and ``action_size`` columns.  States
    may be passed either as integer row ids or as observation vectors;
    vectors are rounded and hashed into one of the ``state_size`` rows (see
    ``_state_index``), so a larger ``state_size`` means fewer collisions.
    """

    # Decimal places kept when bucketing vector observations.
    _OBS_DECIMALS = 1

    def __init__(self, state_size, action_size, learning_rate=0.1, discount_factor=0.99, epsilon=1.0, epsilon_decay=0.995, epsilon_min=0.01):
        """Store the hyper-parameters and create a zero-initialised Q-table.

        Args:
            state_size: Number of rows (state buckets) in the Q-table.
            action_size: Number of discrete actions.
            learning_rate: Step size of the Q-value update.
            discount_factor: Weight given to future rewards.
            epsilon: Initial probability of picking a random action.
            epsilon_decay: Factor applied to ``epsilon`` on every ``learn`` call.
            epsilon_min: Lower bound for ``epsilon``.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.learning_rate = learning_rate
        self.discount_factor = discount_factor
        self.epsilon = epsilon
        self.epsilon_decay = epsilon_decay
        self.epsilon_min = epsilon_min
        self.q_table = np.zeros((state_size, action_size))
        # Kept for callers that may use it; nothing in this class reads it.
        self.memory = deque(maxlen=2000)

    def _state_index(self, state):
        """Map a state to a Q-table row index.

        Integer states (Python/NumPy ints or single-element integer arrays)
        are used directly as row ids.  Any other observation is rounded to
        ``_OBS_DECIMALS`` places and hashed into ``[0, state_size)``; raw
        float vectors cannot index the table themselves.
        """
        obs = np.asarray(state)
        if obs.dtype.kind in "iu" and obs.size == 1:
            return int(obs.reshape(-1)[0])

        # NaN hashes are not stable, so neutralise them before hashing.
        cleaned = np.nan_to_num(obs.astype(float).ravel())
        key = tuple(np.round(cleaned, self._OBS_DECIMALS).tolist())
        return hash(key) % self.state_size

    def act(self, state):
        """Pick an action: random with probability ``epsilon``, else greedy."""
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_size)
        return np.argmax(self.q_table[self._state_index(state)])

    def learn(self, state, action, reward, next_state, done):
        """Apply one Q-learning update and decay ``epsilon``.

        Args:
            state: State in which ``action`` was taken.
            action: Index of the action taken.
            reward: Reward received for the transition.
            next_state: State reached after the action.
            done: True when ``next_state`` is terminal (no bootstrapping).
        """
        row = self._state_index(state)
        next_row = self._state_index(next_state)

        q_value = self.q_table[row, action]
        # Terminal transitions contribute no future value.
        future = 0.0 if done else self.discount_factor * np.max(self.q_table[next_row])
        target = reward + future
        self.q_table[row, action] += self.learning_rate * (target - q_value)
        self.epsilon = max(self.epsilon_decay * self.epsilon, self.epsilon_min)

    def train(self, env, episodes):
        """Run ``episodes`` full episodes on ``env``, learning after every step."""
        for _ in range(episodes):
            state = env.reset()
            done = False
            while not done:
                action = self.act(state)
                next_state, reward, done, _ = env.step(action)
                self.learn(state, action, reward, next_state, done)
                state = next_state

# Build the agent from the environment's dimensions
state_size = env.observation_space.shape[0]
action_size = len(action_space)
agent = QLearningAgent(state_size=state_size, action_size=action_size)

# Run the training loop for a fixed number of episodes
episodes = 1000
agent.train(env, episodes)

## Training the Agent

In [None]:
# Training the Q-learning agent
def train_agent():
    """Train the global ``agent`` on the global ``env``.

    Runs ``num_episodes`` episodes (module-level setting).  In each step the
    agent picks an action, the environment returns the transition, and the
    agent learns from it.
    """
    for _ in range(num_episodes):
        state = env.reset()
        done = False
        while not done:
            action = agent.act(state)
            # The reward from env.step is used as-is.  Recomputing it here
            # would reach into environment internals and read the row
            # *after* current_step has advanced, i.e. the wrong time step
            # (and one past the end on the final step).
            next_state, reward, done, _info = env.step(action)

            agent.learn(state, action, reward, next_state, done)
            state = next_state

# How many episodes the training run lasts
num_episodes = 1_000

# Kick off training
train_agent()