In [48]:
import pandas as pd
import numpy as np
import yfinance as yf

def pull_stock_data(ticker, period='6mo', interval='1d'):
    stock = yf.Ticker(ticker)
    df = stock.history(period=period, interval=interval)
    df = df[['Close']].rename(columns={'Close': 'close'})
    df.index = pd.to_datetime(df.index)
    return df

def pull_vix_data(period='6mo', interval='1d'):
    vix = yf.Ticker('^VIX')
    df = vix.history(period=period, interval=interval)
    df = df[['Close']].rename(columns={'Close': 'vix_close'})
    df.index = pd.to_datetime(df.index)
    return df

def calculate_realized_volatility(df, window=20):
    df['log_return'] = np.log(df['close'] / df['close'].shift(1))
    df['rolling_std'] = df['log_return'].rolling(window=window, min_periods=window).std()
    df['realized_volatility'] = df['rolling_std'] * np.sqrt(252)
    df = df.dropna(subset=['realized_volatility']).copy()
    return df

def generate_stock_price_prediction_dataset(
    df,
    vix_df,
    days_to_predict=5,
    lookback_days=10  # how many past closes you use
):
    features = []
    labels = []
    
    vix_df = vix_df[['vix_close']]  # Ensure only relevant column is used
    df = df.merge(vix_df, left_index=True, right_index=True, how='inner')
    df = df.dropna()  # Drop rows where VIX or volatility missing

    trading_days = df.index

    for i in range(lookback_days, len(df) - days_to_predict):
        buy_date = trading_days[i]
        S_buy = df.loc[buy_date, 'close']
        realized_vol = df.loc[buy_date, 'realized_volatility']
        vix_value = df.loc[buy_date, 'vix_close']

        # Get the next 5 closes
        S_predict_array = df.loc[trading_days[i+1:i+1+days_to_predict], 'close'].values

        # Skip if not enough future data
        if len(S_predict_array) < days_to_predict:
            continue

        if np.isnan(S_buy) or np.isnan(realized_vol) or np.isnan(vix_value) or np.any(np.isnan(S_predict_array)):
            continue

        past_closes = df.loc[trading_days[i - lookback_days:i - 1], 'close'].values

        feature_row = {
            'buy_date': buy_date,
            'current_stock_price': S_buy,
            'realized_volatility': realized_vol,
            'vix_value': vix_value,
        }

        # Add past close prices
        for j in range(lookback_days-1):
            feature_row[f'close_lag_{lookback_days-j}'] = past_closes[j]

        features.append(feature_row)
        labels.append(S_predict_array)  # target is now an array of 5 prices!

    feature_df = pd.DataFrame(features)
    label_array = np.array(labels)  # Numpy array: shape (n_samples, 5)

    return feature_df, label_array


# Full pipeline
def create_stock_price_prediction_dataset(ticker, days_to_predict=5):
    df = pull_stock_data(ticker, period='6mo', interval='1d')
    vix_df = pull_vix_data(period='6mo', interval='1d')
    df = calculate_realized_volatility(df)

    df.index = df.index.tz_localize(None)
    vix_df.index = vix_df.index.tz_localize(None)

    print(df.head())
    print(vix_df.head())
    feature_df, label_series = generate_stock_price_prediction_dataset(
        df,
        vix_df,
        days_to_predict=days_to_predict
    )
    return feature_df, label_series

# Example usage
ticker = "AAPL"
features, labels = create_stock_price_prediction_dataset(ticker)
print(features.head())


                 close  log_return  rolling_std  realized_volatility
Date                                                                
2024-11-25  232.614243    0.012966     0.010808             0.171575
2024-11-26  234.801834    0.009360     0.011011             0.174787
2024-11-27  234.671982   -0.000553     0.010375             0.164701
2024-11-29  237.069351    0.010164     0.009480             0.150496
2024-12-02  239.326859    0.009478     0.008818             0.139986
            vix_close
Date                 
2024-10-25  20.330000
2024-10-28  19.799999
2024-10-29  19.340000
2024-10-30  20.350000
2024-10-31  23.160000
    buy_date  current_stock_price  realized_volatility  vix_value  \
0 2024-12-10           247.497879             0.116297      14.18   
1 2024-12-11           246.219284             0.120618      13.58   
2 2024-12-12           247.687683             0.120655      13.92   
3 2024-12-13           247.857483             0.116774      13.81   
4 2024-12-16      

In [47]:
labels

array([[246.21928406, 247.68768311, 247.85748291, 250.76428223,
        253.20159912],
       [247.68768311, 247.85748291, 250.76428223, 253.20159912,
        247.77757263],
       [247.85748291, 250.76428223, 253.20159912, 247.77757263,
        249.51565552],
       [250.76428223, 253.20159912, 247.77757263, 249.51565552,
        254.21051025],
       [253.20159912, 247.77757263, 249.51565552, 254.21051025,
        254.98965454],
       [247.77757263, 249.51565552, 254.21051025, 254.98965454,
        257.91644287],
       [249.51565552, 254.21051025, 254.98965454, 257.91644287,
        258.73550415],
       [254.21051025, 254.98965454, 257.91644287, 258.73550415,
        255.30929565],
       [254.98965454, 257.91644287, 258.73550415, 255.30929565,
        251.92301941],
       [257.91644287, 258.73550415, 255.30929565, 251.92301941,
        250.14497375],
       [258.73550415, 255.30929565, 251.92301941, 250.14497375,
        243.5821991 ],
       [255.30929565, 251.92301941, 250.144

In [51]:
import tensorflow as tf

def create_lstm_dataset(features_df, labels_array, batch_size=32, shuffle_buffer=1000):
    """
    Create a TensorFlow Dataset for LSTM input.
    
    Args:
        features_df (pd.DataFrame): Feature dataframe (n_samples, n_features).
        labels_array (np.ndarray): Label array (n_samples, 5).
        batch_size (int): Batch size for training.
        shuffle_buffer (int): Buffer size for shuffling.

    Returns:
        tf.data.Dataset: Dataset yielding (features, labels) tuples.
    """
    if len(features_df) != len(labels_array):
        raise ValueError("Features and labels must have the same number of samples.")
    
    # Take list of non-numeric columns
    non_numeric_columns = features_df.select_dtypes(exclude=[np.number]).columns.tolist()

    # Drop non-numeric columns
    features_df = features_df.drop(columns=non_numeric_columns)

    features = features_df.to_numpy().astype('float32')
    labels = labels_array.astype('float32')

    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    dataset = dataset.shuffle(buffer_size=shuffle_buffer)
    dataset = dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return dataset


dataset = create_lstm_dataset(features, labels)
dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(None, 12), dtype=tf.float32, name=None), TensorSpec(shape=(None, 5), dtype=tf.float32, name=None))>

In [56]:
# Train LSTM Model:

def create_lstm_model(input_shape, output_shape):
    model = tf.keras.Sequential([
        tf.keras.layers.LSTM(64, return_sequences=True, input_shape=input_shape),
        tf.keras.layers.LSTM(32),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(output_shape)
    ])

    model.compile(optimizer='adam', loss='mse')
    return model

def train_lstm_model(dataset, input_shape, output_shape, epochs=100):
    model = create_lstm_model(input_shape, output_shape)
    model.fit(dataset, epochs=epochs)
    return model

input_shape = (features.shape[1], 1)  # Number of features, 1 timestep
output_shape = labels.shape[1]  # Number of days to predict
lstm_model = train_lstm_model(dataset, input_shape, output_shape, epochs=120)

Epoch 1/120
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 13ms/step - loss: 52975.1172
Epoch 2/120
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 52422.9219
Epoch 3/120
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 52333.5156
Epoch 4/120
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - loss: 51859.5312 
Epoch 5/120
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - loss: 52031.1172
Epoch 6/120
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 51781.3828
Epoch 7/120
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 52094.1758
Epoch 8/120
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 51890.2344
Epoch 9/120
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 51856.1133
Epoch 10/120
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0