To start, we will load the data in a Pandas dataframe:

In [1]:
import pandas as pd

# Load the data into a DataFrame
data = pd.read_csv('tsla_2019_2022.csv')

# Remove leading and trailing whitespace from column names
data.columns = data.columns.str.strip()

# Display the column names
print("Column names:", data.columns)

# Display the first few rows of the DataFrame to ensure it loaded correctly
print(data.head())

# Split the data into training and testing sets (assuming the data is already sorted by date)
split_index = len(data) // 2
train_data = data.iloc[:split_index]
test_data = data.iloc[split_index:]

# Verify the shape of the training and testing sets
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)


  data = pd.read_csv('tsla_2019_2022.csv')


Column names: Index(['[QUOTE_UNIXTIME]', '[QUOTE_READTIME]', '[QUOTE_DATE]',
       '[QUOTE_TIME_HOURS]', '[UNDERLYING_LAST]', '[EXPIRE_DATE]',
       '[EXPIRE_UNIX]', '[DTE]', '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]',
       '[C_THETA]', '[C_RHO]', '[C_IV]', '[C_VOLUME]', '[C_LAST]', '[C_SIZE]',
       '[C_BID]', '[C_ASK]', '[STRIKE]', '[P_BID]', '[P_ASK]', '[P_SIZE]',
       '[P_LAST]', '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]', '[P_THETA]',
       '[P_RHO]', '[P_IV]', '[P_VOLUME]', '[STRIKE_DISTANCE]',
       '[STRIKE_DISTANCE_PCT]'],
      dtype='object')
   [QUOTE_UNIXTIME]   [QUOTE_READTIME] [QUOTE_DATE]  [QUOTE_TIME_HOURS]  \
0        1556740800   2019-05-01 16:00   2019-05-01                16.0   
1        1556740800   2019-05-01 16:00   2019-05-01                16.0   
2        1556740800   2019-05-01 16:00   2019-05-01                16.0   
3        1556740800   2019-05-01 16:00   2019-05-01                16.0   
4        1556740800   2019-05-01 16:00   2019-05-01                1

Now, I want to parse for ATM strikes. To do this, I will only be looking at options where the delta is between 45-55, or -45 to -55. 

In [18]:
# Convert '[C_DELTA]' and '[P_DELTA]' columns to numeric data types
data['[C_DELTA]'] = pd.to_numeric(data['[C_DELTA]'], errors='coerce')
data['[P_DELTA]'] = pd.to_numeric(data['[P_DELTA]'], errors='coerce')

# Select relevant columns
selected_columns = ['QUOTE_READTIME', 'DTE', '[C_DELTA]', '[C_IV]', 'STRIKE', '[P_DELTA]', '[P_IV]']

# Filter data to include only ATM options (where delta is closest to 0.5)
# Ensure column names match exactly what's in the DataFrame
atm_options = data[(data['[C_DELTA]'] >= 0.45) & (data['[C_DELTA]'] <= 0.55) & (data['[P_DELTA]'] >= -0.55) & (data['[P_DELTA]'] <= -0.45)]

# Display the first few rows of the filtered DataFrame
print(atm_options.head())
print("Number of rows in atm_options:", len(atm_options))

     [QUOTE_UNIXTIME]   [QUOTE_READTIME] [QUOTE_DATE]  [QUOTE_TIME_HOURS]  \
27         1556740800   2019-05-01 16:00   2019-05-01                16.0   
98         1556740800   2019-05-01 16:00   2019-05-01                16.0   
99         1556740800   2019-05-01 16:00   2019-05-01                16.0   
181        1556740800   2019-05-01 16:00   2019-05-01                16.0   
275        1556740800   2019-05-01 16:00   2019-05-01                16.0   

     [UNDERLYING_LAST] [EXPIRE_DATE]  [EXPIRE_UNIX]  [DTE]  [C_DELTA]  \
27              233.98    2019-05-03     1556913600    2.0    0.46754   
98              233.98    2019-05-10     1557518400    9.0    0.54695   
99              233.98    2019-05-10     1557518400    9.0    0.49653   
181             233.98    2019-05-17     1558123200   16.0    0.50681   
275             233.98    2019-05-24     1558728000   23.0    0.54600   

     [C_GAMMA]  ...    [P_LAST] [P_DELTA]  [P_GAMMA]   [P_VEGA]   [P_THETA]  \
27    0.040110  ...

We will now normalize the data and organize it for our training:

In [3]:
%pip install scikit-learn
import sys
sys.executable
from sklearn.preprocessing import MinMaxScaler

Note: you may need to restart the kernel to use updated packages.


You should consider upgrading via the 'c:\Users\ronje\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [19]:
# Remove square brackets from column names
atm_options.columns = atm_options.columns.str.strip('[]')

# Now you can access columns without square brackets
selected_features = ['DTE', 'C_DELTA', 'C_IV', 'P_DELTA', 'P_IV']


print("Shape of atm_options:", atm_options.shape)
print("Selected features:", selected_features)
print("Number of rows in atm_options after filtering:", len(atm_options))



# Normalize the data using MinMaxScaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(atm_options[selected_features])

# Organize the data into sequences with a variable length
X = []
y = []
sequence_length = 10

for i in range(len(scaled_data) - sequence_length):
    X.append(scaled_data[i:i+sequence_length])
    y.append(scaled_data[i+sequence_length])

X = np.array(X)
y = np.array(y)

# Print the shape of X and y to verify
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)


Shape of atm_options: (119965, 33)
Selected features: ['DTE', 'C_DELTA', 'C_IV', 'P_DELTA', 'P_IV']
Number of rows in atm_options after filtering: 119965
Shape of X: (119955, 10, 5)
Shape of y: (119955, 5)


We will now train a model to predict IV

In [25]:
%pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.16.1-cp310-cp310-win_amd64.whl (2.1 kB)
Collecting tensorflow-intel==2.16.1
  Downloading tensorflow_intel-2.16.1-cp310-cp310-win_amd64.whl (376.9 MB)
     -------------------------------------- 376.9/376.9 MB 2.8 MB/s eta 0:00:00
Collecting libclang>=13.0.0
  Downloading libclang-18.1.1-py2.py3-none-win_amd64.whl (26.4 MB)
     ---------------------------------------- 26.4/26.4 MB 6.1 MB/s eta 0:00:00
Collecting ml-dtypes~=0.3.1
  Downloading ml_dtypes-0.3.2-cp310-cp310-win_amd64.whl (127 kB)
     -------------------------------------- 127.8/127.8 KB 3.8 MB/s eta 0:00:00
Collecting wrapt>=1.11.0
  Downloading wrapt-1.16.0-cp310-cp310-win_amd64.whl (37 kB)
Collecting google-pasta>=0.1.1
  Downloading google_pasta-0.2.0-py3-none-any.whl (57 kB)
     ---------------------------------------- 57.5/57.5 KB ? eta 0:00:00
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1
  Downloading gast-0.5.4-py3-none-any.whl (19 kB)
Collecting astunparse>=1.6.

You should consider upgrading via the 'c:\Users\ronje\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Define the LSTM model
model = Sequential([
    LSTM(units=50, input_shape=(X.shape[1], X.shape[2])),
    Dense(units=5)  # 5 output nodes for predicting each feature
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)


# Verify column names in the testing dataset
print("Column names in test_data:", test_data.columns)

# Adjust selected_features if necessary to match column names in test_data
selected_features = ['DTE', '[C_DELTA]', '[C_IV]', '[P_DELTA]', '[P_IV]']

# Selecting the same features for X_test as used for training
X_test = test_data[selected_features].values

# Similarly, selecting the target variable (implied volatility) for y_test
y_test = test_data[['[C_IV]', '[P_IV]']].values  # Assuming you want to predict both C_IV and P_IV



# Evaluate the model on the testing set
loss = model.evaluate(X_test, y_test)
print("Test Loss:", loss)


# Make predictions on the testing set
predictions = model.predict(X_test)



Epoch 1/10


  super().__init__(**kwargs)


[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - loss: 0.0330 - val_loss: 0.0195
Epoch 2/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - loss: 0.0168 - val_loss: 0.0142
Epoch 3/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 5ms/step - loss: 0.0132 - val_loss: 0.0133
Epoch 4/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 6ms/step - loss: 0.0121 - val_loss: 0.0127
Epoch 5/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - loss: 0.0114 - val_loss: 0.0120
Epoch 6/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - loss: 0.0111 - val_loss: 0.0112
Epoch 7/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5ms/step - loss: 0.0105 - val_loss: 0.0108
Epoch 8/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 5ms/step - loss: 0.0101 - val_loss: 0.0105
Epoch 9/10
[1m2999/2999[0

KeyError: "None of [Index(['DTE', 'C_DELTA', 'C_IV', 'P_DELTA', 'P_IV'], dtype='object')] are in the [columns]"

In [23]:
import matplotlib.pyplot as plt

# Plot actual vs predicted implied volatility for each feature
for i in range(predictions.shape[1]):
    plt.figure(figsize=(10, 6))
    plt.plot(y_test[:, i], label='Actual')
    plt.plot(predictions[:, i], label='Predicted')
    plt.title(f'Implied Volatility Prediction for Feature {selected_features[i]}')
    plt.xlabel('Data Point')
    plt.ylabel('Implied Volatility')
    plt.legend()
    plt.show()


NameError: name 'X_test' is not defined

In [31]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

# Load the data into a DataFrame
data = pd.read_csv('tsla_2019_2022.csv')

# Remove leading and trailing whitespace from column names
data.columns = data.columns.str.strip()

# Convert '[C_DELTA]' and '[P_DELTA]' columns to numeric data types
data['[C_DELTA]'] = pd.to_numeric(data['[C_DELTA]'], errors='coerce')
data['[P_DELTA]'] = pd.to_numeric(data['[P_DELTA]'], errors='coerce')

# Select relevant columns
selected_columns = ['[DTE]', '[C_DELTA]', '[C_IV]', '[STRIKE]', '[P_DELTA]', '[P_IV]']

# Filter data to include only ATM options (where delta is closest to 0.5)
atm_options = data[(data['[C_DELTA]'] >= 0.45) & (data['[C_DELTA]'] <= 0.55) & (data['[P_DELTA]'] >= -0.55) & (data['[P_DELTA]'] <= -0.45)]

# Reset index of atm_options
atm_options.reset_index(drop=True, inplace=True)

# Normalize the data using MinMaxScaler
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(atm_options[selected_columns])

# Organize the data into sequences with a variable length
X = []
y = []
sequence_length = 10

for i in range(len(scaled_data) - sequence_length):
    X.append(scaled_data[i:i+sequence_length])
    y.append(scaled_data[i+sequence_length])

X = np.array(X)
y = np.array(y)

# Define the LSTM model
model = Sequential([
    LSTM(units=50, input_shape=(X.shape[1], X.shape[2])),
    Dense(units=6)  # 6 output nodes for predicting each feature
])

# Compile the model
model.compile(optimizer='adam', loss='mse')

# Train the model
model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2, verbose=1)


# Assuming you have loaded and preprocessed your test data similarly to the training data

# Define X_test and y_test using the same process as for training data
selected_features_test = ['[DTE]', '[C_DELTA]', '[C_IV]', '[STRIKE]', '[P_DELTA]', '[P_IV]']

# Selecting the same features for X_test as used for training
X_test = test_data[selected_features_test].values

# Similarly, selecting the target variable (implied volatility) for y_test
y_test = test_data[['[C_IV]', '[P_IV]']].values  # Assuming you want to predict both C_IV and P_IV

import tensorflow as tf

# Convert X_test and y_test to compatible data types
X_test = X_test.astype('float32')
y_test = y_test.astype('float32')

# Evaluate the model on the testing set
test_loss = model.evaluate(X_test, y_test)
print("Test Loss:", test_loss)

# Make predictions on the testing set
predictions = model.predict(X_test)

# Plot actual vs predicted implied volatility for each feature
for i in range(predictions.shape[1]):
    plt.figure(figsize=(10, 6))
    plt.plot(y_test[:, i], label='Actual')
    plt.plot(predictions[:, i], label='Predicted')
    plt.title(f'Implied Volatility Prediction for Feature {selected_features_test[i+2]}')
    plt.xlabel('Data Point')
    plt.ylabel('Implied Volatility')
    plt.legend()
    plt.show()


  data = pd.read_csv('tsla_2019_2022.csv')
  super().__init__(**kwargs)


Epoch 1/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 5ms/step - loss: 0.0270 - val_loss: 0.0160
Epoch 2/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5ms/step - loss: 0.0137 - val_loss: 0.0119
Epoch 3/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 5ms/step - loss: 0.0110 - val_loss: 0.0106
Epoch 4/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 5ms/step - loss: 0.0100 - val_loss: 0.0104
Epoch 5/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5ms/step - loss: 0.0095 - val_loss: 0.0099
Epoch 6/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5ms/step - loss: 0.0089 - val_loss: 0.0095
Epoch 7/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5ms/step - loss: 0.0086 - val_loss: 0.0091
Epoch 8/10
[1m2999/2999[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 5ms/step - loss: 0.0081 - val_loss: 0.0089
Epoch 9/10
[1m2

ValueError: Failed to convert a NumPy array to a Tensor (Unsupported object type float).

In [28]:
#debugging cell
print(atm_options.columns)

Index(['[QUOTE_UNIXTIME]', '[QUOTE_READTIME]', '[QUOTE_DATE]',
       '[QUOTE_TIME_HOURS]', '[UNDERLYING_LAST]', '[EXPIRE_DATE]',
       '[EXPIRE_UNIX]', '[DTE]', '[C_DELTA]', '[C_GAMMA]', '[C_VEGA]',
       '[C_THETA]', '[C_RHO]', '[C_IV]', '[C_VOLUME]', '[C_LAST]', '[C_SIZE]',
       '[C_BID]', '[C_ASK]', '[STRIKE]', '[P_BID]', '[P_ASK]', '[P_SIZE]',
       '[P_LAST]', '[P_DELTA]', '[P_GAMMA]', '[P_VEGA]', '[P_THETA]',
       '[P_RHO]', '[P_IV]', '[P_VOLUME]', '[STRIKE_DISTANCE]',
       '[STRIKE_DISTANCE_PCT]'],
      dtype='object')
