## Setup Environment
Python Environment: Use an environment with Python and necessary libraries installed (e.g., numpy, pandas, matplotlib for data manipulation and visualization; TensorFlow or PyTorch for neural network modeling).

In [180]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import pandas as pd



## prepare data

In [181]:
# Assuming 'stock_data' is a DataFrame containing the stock data
file_path = 'TSLA_stock_data_2023.csv'
stock_data = pd.read_csv(file_path)


In [182]:
# Normalize the data using min-max scaling

# Convert date column to datetime if it exists
if 'Date' in stock_data.columns:
    stock_data['Date'] = pd.to_datetime(stock_data['Date'])
    stock_data['Year'] = stock_data['Date'].dt.year
    stock_data['Month'] = stock_data['Date'].dt.month
    stock_data['Day'] = stock_data['Date'].dt.day
    # Optionally, drop the original date column if no longer needed
    # data.drop('Date', axis=1, inplace=True)

# If there are categorical columns, consider converting them to a one-hot encoded format
if 'CategoryColumn' in stock_data.columns:
    # This is an example; replace 'CategoryColumn' with the name of your actual column
    dummies = pd.get_dummies(stock_data['CategoryColumn'], prefix='Category')
    stock_data = pd.concat([stock_data, dummies], axis=1)
    stock_data.drop('CategoryColumn', axis=1, inplace=True)



In [183]:
# Select only the numeric columns for normalization
numeric_cols = stock_data.select_dtypes(include=['float64', 'int64']).columns
stock_data[numeric_cols] = (stock_data[numeric_cols] - stock_data[numeric_cols].min()) / (stock_data[numeric_cols].max() - stock_data[numeric_cols].min())

stock_data.fillna(method='ffill', inplace=True)  # forward fill to propagate last valid observation forward

## Define State and Reward
State Definition: Define the state as a vector of features like the day's opening price, high, low, close, and volume.
Reward Calculation: Calculate rewards based on the change in stock price, as described in the paper.


In [184]:
def prepare_state(data):
    # Assuming 'data' is a DataFrame with columns for open, high, low, close, volume
    # Normalizing data
    max_vals = data.max()
    min_vals = data.min()
    state_vector = (data - min_vals) / (max_vals - min_vals)
    return state_vector.to_numpy()


In [185]:
def calculate_reward(close_prices):
    # Assuming 'close_prices' is a list or array of closing prices
    rewards = np.diff(close_prices) / close_prices[:-1]  # Percentage change between consecutive days
    return rewards


## Reinforcement Learning Model
Model Initialization: Initialize the parameters for the TD(0) algorithm, including the discount factor (γ) and learning rate (α).
Network Setup: Set up a neural network for function approximation. A simple multi-layer perceptron (MLP) can be used initially.

In [186]:
# Define the neural network for the agent
def create_model(input_dim):
    model = Sequential([
        Dense(64, activation='relu', input_shape=(input_dim,)),
        Dense(32, activation='relu'),
        Dense(3, activation='linear')  # Assuming three actions: buy, hold, sell
    ])
    model.compile(optimizer='adam', loss='mse')
    return model

# Initialize the model
num_features = 5  # e.g., open, high, low, close, volume
model = create_model(num_features)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [187]:
# For each step within the episode, decides an action, executes it, and then observes the outcome.
def take_action(state, action, data, t):
    # This function should define how to take an action
    # For simplicity, we're not really trading but simulating action effects
    next_state = data.iloc[t + 1]
    reward = calculate_reward(state['close'], next_state['close'])
    return next_state, reward

def calculate_reward(current_price, next_price):
    return (next_price - current_price) / current_price  # Percentage change

def update_model(model, state, action, reward, next_state):
    # Perform a TD update on the model
    target = reward + 0.95 * np.amax(model.predict(np.array([next_state]))[0])  # Discount factor gamma = 0.95
    target_vec = model.predict(np.array([state]))[0]
    target_vec[action] = target
    model.fit(np.array([state]), np.array([target_vec]), epochs=1, verbose=0)


In [188]:
#Simulate each trading period as an episode. For each episode, reset the environment to an initial state

def run_episode(data, model):
    total_reward = 0
    state = get_initial_state(data)

    for t in range(len(data) - 1):
        action = choose_action(state, model)
        next_state, reward = take_action(state, action, data, t)
        update_model(model, state, action, reward, next_state)
        state = next_state
        total_reward += reward
    return total_reward

def get_initial_state(data):
    # Normalize data and prepare the initial state
    return data.iloc[0]

def choose_action(state, model):
    if isinstance(state, pd.Series):
        state = state.values 
    # state = state.astype('float32')
    # Use the model to predict the action from the current state
    state = np.reshape(state, (1, -1))
    q_values = model.predict(state)
    return np.argmax(q_values[0])  # Choosing the action with the highest Q-value




In [189]:
# Run multiple episodes to train the model effectively.

def train_model(data, model, episodes):
    for e in range(episodes):
        total_reward = run_episode(data, model)
        print(f'Episode {e+1}/{episodes}, Total Reward: {total_reward}')
# Initialize the model
print(model.input_shape) 
num_features = len(stock_data[numeric_cols].columns)  # Update to match the number of input features after preprocessing
model = create_model(num_features)
train_model(stock_data[numeric_cols], model, 10000)
print(stock_data.columns) 

(None, 5)
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step


KeyError: 'close'

## Training the Model
Algorithm: Implement the TD(0) learning algorithm to update the value function based on the state and reward observed from the data.
Iteration: Iterate over episodes (each episode can be a sequence of stock price data), updating the model with each step.

## Evaluation
Testing: After training, test the model on unseen data to assess its predictive accuracy.
Performance Metrics: Use metrics like RMSE or predictive accuracy grades as used in the paper to evaluate performance.

## Monitoring and Adjustment
Continuous Monitoring: Set up scripts to monitor the model’s performance over time.
Adjustment: Tune parameters and refine the model as needed based on performance metrics.