# Prepare Stock Price Data

In [4]:
import numpy as np
import tensorflow as tf
from transformers import AutoTokenizer, AutoModel
from sklearn.preprocessing import StandardScaler
from tensorflow.keras import layers, models


In [5]:
# Example stock price data for 5 days, 10 years of data (mocked)
stock_prices = np.random.rand(10, 5) * 100  # 10 years, 5 days each week
textual_data = ["Apple stock trends", "Stock forecast for Apple", "Financial results of Apple", 
                "Apple market predictions", "Growth potential of Apple"] * 2  # Simple textual data

# Normalize stock price data
scaler = StandardScaler()
stock_prices = scaler.fit_transform(stock_prices)

# LSTM input needs to be reshaped
stock_prices = np.expand_dims(stock_prices, axis=-1)  # Shape (10, 5, 1)


In [6]:
def create_lstm_encoder(input_shape):
    model = models.Sequential()
    model.add(layers.LSTM(64, return_sequences=False, input_shape=input_shape))
    model.add(layers.Dense(32, activation='relu'))
    return model

# LSTM encoder for stock prices
lstm_encoder = create_lstm_encoder((5, 1))  # 5 days of stock prices

# Pass stock prices through LSTM encoder to get embeddings
stock_price_embeddings = lstm_encoder(stock_prices)


  super().__init__(**kwargs)


In [7]:
# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
text_model = AutoModel.from_pretrained("bert-base-uncased")

# Tokenize textual data
inputs = tokenizer(textual_data, padding=True, truncation=True, return_tensors="pt", max_length=32)

# Get embeddings from the model
textual_embeddings = text_model(**inputs).last_hidden_state.mean(dim=1)  # Average across token embeddings


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [8]:
# Ensure that both embeddings have the same dimension before concatenation
combined_embeddings = tf.concat([stock_price_embeddings, textual_embeddings.numpy()], axis=-1)


RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [9]:
# Detach the PyTorch tensor from the computation graph before converting to numpy
textual_embeddings_numpy = textual_embeddings.detach().numpy()

# Ensure both embeddings have the same dimension before concatenation
combined_embeddings = tf.concat([stock_price_embeddings, textual_embeddings_numpy], axis=-1)


In [10]:
def create_prediction_model(embedding_size):
    model = models.Sequential()
    model.add(layers.Dense(64, activation='relu', input_shape=(embedding_size,)))
    model.add(layers.Dense(1))  # Predict next day's stock price
    return model

# Size of combined embedding
embedding_size = combined_embeddings.shape[1]

# Create prediction model
prediction_model = create_prediction_model(embedding_size)

# Compile model
prediction_model.compile(optimizer='adam', loss='mse')

# Train the model (for demonstration, we use random data for training)
prediction_model.fit(combined_embeddings, np.random.rand(10), epochs=10)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 692ms/step - loss: 0.1789
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.2686
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.1185
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step - loss: 0.0650
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step - loss: 0.1082
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - loss: 0.0957
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 0.0604
Epoch 8/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step - loss: 0.0601
Epoch 9/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - loss: 0.0854
Epoch 10/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step - loss: 0.0917


<keras.src.callbacks.history.History at 0x143181aa4e0>

In [11]:
# Prediction for next day using combined embeddings
predicted_price = prediction_model.predict(combined_embeddings)
print(predicted_price)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
[[0.35100737]
 [0.74489474]
 [0.81454253]
 [1.2383987 ]
 [0.6836066 ]
 [0.3511337 ]
 [0.74680614]
 [0.8170768 ]
 [1.2428701 ]
 [0.67568296]]


In [12]:
# Given mean and standard deviation
mean = 150
std_dev = 10

# Normalized predictions (output from the model)
normalized_predictions = [
    0.35100737, 0.74489474, 0.81454253, 1.2383987, 
    0.6836066, 0.3511337, 0.74680614, 0.8170768, 
    1.2428701, 0.67568296
]

# Denormalizing the predictions
denormalized_predictions = [value * std_dev + mean for value in normalized_predictions]

print(denormalized_predictions)


[153.5100737, 157.4489474, 158.1454253, 162.383987, 156.836066, 153.511337, 157.4680614, 158.170768, 162.428701, 156.7568296]


## Preparing Stock Price Data

In [83]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Load the dataset
df = pd.read_csv(r'C:\Users\Tammy\Documents\GitHub\multimodal_stockprice_prediction\data\clean\baseline_transformed_dataset_LSTM.csv')

# Convert 'Date' column to datetime format if it's not already in datetime format
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')  # This will convert invalid dates to NaT

# Set the 'Date' column as the index
df.set_index('Date', inplace=True)

# Slice the DataFrame to the desired date range (check if these dates are in the DataFrame)
df = df.loc['2015-01-05':'2015-01-30']

# Normalize the prices using MinMaxScaler to scale between 0 and 1
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_prices = scaler.fit_transform(df[['AAPL(t)']])

print(scaled_prices)
# Define lookback window (e.g., 5 days)
lookback_window = 5

# Prepare data for LSTM (X = input, y = target)
X = []
y = []

# Non-overlapping window creation (step size = lookback_window)
for i in range(0, len(scaled_prices) - lookback_window, lookback_window):
    X.append(scaled_prices[i:i+lookback_window, 0])  # Use 5 days of prices
    if i + lookback_window < len(scaled_prices):
        y.append(scaled_prices[i + lookback_window, 0])  # Next day's price as target

# Convert to numpy arrays for LSTM model
X = np.array(X)
y = np.array(y)

# Reshape X to fit LSTM input shape: (samples, timesteps, features)
X = X.reshape(X.shape[0], X.shape[1], 1)  # LSTM expects 3D input

# Check the shape of X and y
print(f"Shape of X: {X.shape}")  # Should be (num_samples, 5, 1)
print(f"Shape of y: {y.shape}")  # Should be (num_samples,)

# Now X is ready for LSTM input, and y contains the corresponding targets (next day's price)


[[0.02013732]
 [0.0209124 ]
 [0.13632675]
 [0.45700961]
 [0.46630452]
 [0.2525155 ]
 [0.32765255]
 [0.29512068]
 [0.06429132]
 [0.        ]
 [0.21146247]
 [0.27575512]
 [0.49651316]
 [0.54144091]
 [0.55073383]
 [0.243995  ]
 [0.72192099]
 [1.        ]
 [0.86522141]]
Shape of X: (3, 5, 1)
Shape of y: (3,)


In [45]:
#import necessary libraries
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout


In [97]:
def build_autoregressive_encoder(input_shape, embedding_dim):
    model = Sequential()
    
    # LSTM Layer
    model.add(LSTM(units=50, return_sequences=False, input_shape=input_shape))
    
    # Optional: Dropout layer for regularization
    model.add(Dropout(0.2))
    
    # Fully connected layer to project to embedding space
    model.add(Dense(units=embedding_dim))  # Output dimension = embedding dimension
    
    return model


In [98]:
# Define lookback window (e.g., 5 days)
lookback_window = 5
embedding_dim = 10  # Dimension of the stock price embeddings

# X should be your sequence data prepared previously (5-day lookback)
X = np.array(X)

# Define the LSTM Encoder model
encoder_model = build_autoregressive_encoder(input_shape=(X.shape[1], X.shape[2]), embedding_dim=embedding_dim)

# Compile the model
encoder_model.compile(optimizer='adam', loss='mean_squared_error')

# Get the stock price embeddings
stock_price_embeddings = encoder_model.predict(X)

# Print the shape of the embeddings
print(f"Shape of stock price embeddings: {stock_price_embeddings.shape}")

  super().__init__(**kwargs)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 215ms/step
Shape of stock price embeddings: (3, 10)


## Text Data Preparation

In [None]:
def create_text_template(df, start_idx, lookback_window):
    end_idx = start_idx + lookback_window
    price_window = df.iloc[start_idx:end_idx]
    
    # Calculate statistical details
    min_price = price_window['AAPL(t)'].min()
    max_price = price_window['AAPL(t)'].max()
    avg_price = price_window['AAPL(t)'].mean()
    rate_of_change = (price_window['AAPL(t)'].iloc[-1] - price_window['AAPL(t)'].iloc[0]) / price_window['AAPL(t)'].iloc[0] * 100
    
    # Construct the textual template according to the paper
    # Here we include frequency, timestamps, min, max, moving average, and rate of change
    template = f"Context: This is a daily stock from {price_window.index[0].strftime('%Y-%m-%d')} to {price_window.index[-1].strftime('%Y-%m-%d')}.\n"
    template += f"Trend Analysis: The input data features a minimum price of {min_price:.2f}, a maximum price of {max_price:.2f}, "
    template += f"and a moving average of {avg_price:.2f}, with a change rate of {rate_of_change:.2f}%.\n"
    #add in news sentiment portion
    return template




In [None]:
# Generate textual templates for each lookback window
text_templates = []

for i in range(0, len(df) - lookback_window, lookback_window):
    text_template = create_text_template(df, i, lookback_window)
    text_templates.append(text_template)

# Example: Print the first textual template
print(text_templates[2])
#Add in headlines from the past n days.

Context: This is a daily stock from 2015-01-20 to 2015-01-26.
Trend Analysis: The input data features a minimum price of 24.18, a maximum price of 25.16, and a moving average of 24.77, with a change rate of 4.03%.



In [6]:
#set up gemini to process textual templates
from google import genai
from dotenv import load_dotenv
import os
load_dotenv()
# Load environment variables from the .env file
API_KEY = os.getenv("API_KEY")
client = genai.Client(api_key='API_KEY')#insert api key

In [None]:
# Function to get embeddings for the textual templates using Gemini
def get_text_embeddings(text_templates):
    text_embeddings = []
    for text in text_templates:
        response = client.models.embed_content(
            model='text-embedding-004',
            contents=text
        )
        
        # Print the full response to inspect the structure
        print(response)  # Inspect the response structure
        
        # Assuming 'embeddings' contains the list of embeddings
        text_embeddings.append(response.embeddings[0].values)  # Extract the first embedding
        
    return np.array(text_embeddings)

# Get textual embeddings using Gemini
text_embeddings = get_text_embeddings(text_templates)

# Check the shape and content of the text embeddings
print("Text Embeddings Shape:", text_embeddings.shape)
print("First Text Embedding:", text_embeddings[0])


embeddings=[ContentEmbedding(values=[0.021432204, 0.0034491306, -0.047928806, -0.0014327348, 0.076519735, 0.07855707, 0.018735254, 0.016345846, -0.033751275, -0.049704242, 0.0546686, 0.07639537, 0.02208628, -0.02318319, 0.07894865, -0.04472769, 0.04658384, 0.08251568, -0.110299416, -0.041863784, -0.00088288425, -0.037681717, 0.0056819455, 0.006976226, -0.011825026, -2.5338162e-05, -0.013472994, -0.006838009, 0.0064677116, -0.0439194, 0.004083298, 0.019617915, -0.0035753953, -0.04783024, 0.03045482, 0.050396923, -0.037493587, 0.015442244, -0.022750393, -0.05335065, -0.029024454, 0.011744301, -0.06472459, 0.07409858, -0.016773641, 0.0059609697, -0.014900113, -0.010763331, -0.028637035, 0.030925492, -0.010203131, 0.0013299796, -0.018675653, 0.03266812, -0.0333182, -0.04669849, -0.04119147, -0.04096307, 0.09186705, -0.0012584182, -0.018410187, -0.021066485, 0.020277837, 0.003114554, -0.011072706, -0.069106475, -0.046542104, -0.03470667, -0.06826677, -0.0021638414, -0.012863896, 0.014582863

In [None]:
# Ensure that the stock price embeddings and textual embeddings have the same number of samples
# They should align, meaning the number of price embeddings and text embeddings should be the same.

# Check the shape of your stock price embeddings and text embeddings
print(f"Stock Price Embeddings Shape: {stock_price_embeddings.shape}")
print(f"Textual Embeddings Shape: {text_embeddings.shape}")

# Concatenate the embeddings along the feature axis (axis=1)
combined_embeddings = np.concatenate((stock_price_embeddings, text_embeddings), axis=1)

# Check the shape of the combined embeddings
print(f"Combined Embeddings Shape: {combined_embeddings.shape}")


Stock Price Embeddings Shape: (3, 10)
Textual Embeddings Shape: (3, 768)
Combined Embeddings Shape: (3, 778)


In [100]:
print(combined_embeddings[0])

[ 7.25398771e-04 -6.16966048e-03  5.20680919e-02  2.46182382e-02
  9.38585028e-03  1.38711371e-03  6.59828354e-03 -1.97280105e-02
  8.36381130e-03  1.35322157e-02  2.14322040e-02  3.44913060e-03
 -4.79288060e-02 -1.43273480e-03  7.65197350e-02  7.85570700e-02
  1.87352540e-02  1.63458460e-02 -3.37512750e-02 -4.97042420e-02
  5.46686000e-02  7.63953700e-02  2.20862800e-02 -2.31831900e-02
  7.89486500e-02 -4.47276900e-02  4.65838400e-02  8.25156800e-02
 -1.10299416e-01 -4.18637840e-02 -8.82884250e-04 -3.76817170e-02
  5.68194550e-03  6.97622600e-03 -1.18250260e-02 -2.53381620e-05
 -1.34729940e-02 -6.83800900e-03  6.46771160e-03 -4.39194000e-02
  4.08329800e-03  1.96179150e-02 -3.57539530e-03 -4.78302400e-02
  3.04548200e-02  5.03969230e-02 -3.74935870e-02  1.54422440e-02
 -2.27503930e-02 -5.33506500e-02 -2.90244540e-02  1.17443010e-02
 -6.47245900e-02  7.40985800e-02 -1.67736410e-02  5.96096970e-03
 -1.49001130e-02 -1.07633310e-02 -2.86370350e-02  3.09254920e-02
 -1.02031310e-02  1.32997

In [None]:
#prompt the LLM to return embeddings?????????

In [80]:
import json
from google.genai import types
import numpy as np

# Assuming you already have the combined embeddings
# Define a function to construct a prompt from the combined embeddings
def construct_llm_prompt(embedding):
    # Convert the embedding to a text-based format
    # You could, for example, transform the embedding into a series of values
    embedding_text = ", ".join([str(value) for value in embedding])  # Convert each embedding value to a string
    
    # Create the prompt for Gemini
    prompt = f"Given the following 5-day stock price embeddings: {embedding_text}, predict the next day's stock price embedding."
    return prompt

# Define the prediction function using Gemini
def gemini_predict(prompt):
    try:
        # Generate content from the model
        response = client.models.generate_content(
            model='gemini-2.0-flash',  # Or use your specific model
            contents=prompt,
            config=types.GenerateContentConfig(
                system_instruction="Predict the next token based on the historical data.",  # You can fine-tune this instruction
                max_output_tokens=60,  # Assuming you're predicting just one token (the 6th day embedding)
                temperature=0.5,      # Adjust flexibility
                top_k=5,              # Limit to top 5 choices
                top_p=0.7,            # Consider tokens covering 70% probability mass
                response_mime_type='application/json',
                stop_sequences=['}']  # No premature stop
            )
        )
        
        response_text = response.text.strip()
        
        # Try to fix incomplete JSON by appending a missing closing brace if needed
        if not response_text.endswith('}'):
            response_text += '}'
        
        # Parse JSON response to extract predicted embedding (assuming response is in JSON format)
        try:
            response_json = json.loads(response_text)
            predicted_embedding = response_json.get("predicted_embedding", None)
            if predicted_embedding is None:
                print("Predicted embedding not found.")
                return None
            return np.array(predicted_embedding)  # Convert to numpy array for further use
        
        except json.JSONDecodeError:
            print(f"Error parsing response JSON: {response_text}")
            return None
    
    except Exception as e:
        print(f"API Error: {e}")
        return None

# Example usage:

# For each combined embedding (e.g., a 5-day window), create a prompt and get the prediction
predicted_embeddings = []

for embedding in combined_embeddings:
    prompt = construct_llm_prompt(embedding)
    predicted_embedding = gemini_predict(prompt)  # Get the predicted embedding from Gemini
    
    if predicted_embedding is not None:
        predicted_embeddings.append(predicted_embedding)

# Convert the predicted embeddings back to stock prices using the MinMaxScaler's inverse transform
predicted_prices = scaler.inverse_transform(np.array(predicted_embeddings))

# Output the predicted stock prices for the next day (6th day)
print("Predicted stock prices:", predicted_prices)


Error parsing response JSON: [0.006336182]}
Error parsing response JSON: [0.003919447]}
Error parsing response JSON: [0.012603521]}
Error parsing response JSON: [0.004591953]}
Error parsing response JSON: [0.008591235]}
Error parsing response JSON: [0.01545449]}
Error parsing response JSON: [0.020736814]}


KeyboardInterrupt: 

In [None]:
# Assuming you have a combined embeddings array for the patches
# This array will be your input to the LLM model

def predict_next_embedding(combined_embeddings):
    # Simulate predicting the next embedding from the LLM
    predicted_embeddings = []
    
    for embedding in combined_embeddings:
        # Make sure the embedding is correctly formatted for the LLM input
        # This could be a call to an API like GPT, or any other LLM you're using
        # For simplicity, we'll assume a placeholder LLM function
        
        # LLM (Gemini, GPT-3, etc.) would predict the next embedding here
        # Here we're just appending a placeholder prediction
        predicted_embedding = llm_predict(embedding)  # Replace with actual LLM call
        
        predicted_embeddings.append(predicted_embedding)
    
    return np.array(predicted_embeddings)

# Predict the next embeddings (6th day stock price)
predicted_embeddings = predict_next_embedding(combined_embeddings)

# Convert the predicted embeddings back to stock prices
predicted_prices = scaler.inverse_transform(predicted_embeddings)

# Output the predicted stock prices for the 6th day
print(predicted_prices)



In [None]:
#Account for news on the weekends
#Rolling window