In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import torch

## Data Cleaning & Preprocessing

In [96]:
# Load the raw option-chain quotes into a DataFrame.
# Some missing values in the dataset are given as a single space, so ' ' is read as NaN.
# NOTE(review): parse_dates=True does not appear to convert the date columns
# (they still load as object dtype), so they need an explicit pd.to_datetime
# conversion afterwards - confirm whether date_format/parse_dates are needed here.
dateformat_str = "%Y-%m-%d %H:%M"
data = pd.read_csv(
    'data/tsla_2019_2022.csv',
    na_values=[' '],
    skipinitialspace=True,
    date_format=dateformat_str,
    parse_dates=True,
)

# Remove leading and trailing whitespace and square brackets from column names.
# str.strip() treats its argument as a *set of characters*, so the argument must be
# exactly " []"; a longer string would also strip any of its letters/punctuation
# from the ends of the column names.
data.columns = data.columns.str.strip(" []")

# Display the column names
print("Column names:", data.columns)

print("Shape of data:", data.shape)

Column names: Index(['QUOTE_UNIXTIME', 'QUOTE_READTIME', 'QUOTE_DATE', 'QUOTE_TIME_HOURS',
       'UNDERLYING_LAST', 'EXPIRE_DATE', 'EXPIRE_UNIX', 'DTE', 'C_DELTA',
       'C_GAMMA', 'C_VEGA', 'C_THETA', 'C_RHO', 'C_IV', 'C_VOLUME', 'C_LAST',
       'C_SIZE', 'C_BID', 'C_ASK', 'STRIKE', 'P_BID', 'P_ASK', 'P_SIZE',
       'P_LAST', 'P_DELTA', 'P_GAMMA', 'P_VEGA', 'P_THETA', 'P_RHO', 'P_IV',
       'P_VOLUME', 'STRIKE_DISTANCE', 'STRIKE_DISTANCE_PCT'],
      dtype='object')
Shape of data: (2659259, 33)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2659259 entries, 0 to 2659258
Data columns (total 33 columns):
 #   Column               Dtype  
---  ------               -----  
 0   QUOTE_UNIXTIME       int64  
 1   QUOTE_READTIME       object 
 2   QUOTE_DATE           object 
 3   QUOTE_TIME_HOURS     float64
 4   UNDERLYING_LAST      float64
 5   EXPIRE_DATE          object 
 6   EXPIRE_UNIX          int64  
 7   DTE                  float64
 8   C_DELTA              float64
 9   C_

In [98]:
# Convert the date/time columns to datetime dtype.
# Each column is parsed with its own strict format (exact=True requires a full match).
datetime_formats = {
    "QUOTE_READTIME": "%Y-%m-%d %H:%M",
    "QUOTE_DATE": "%Y-%m-%d",
    "EXPIRE_DATE": "%Y-%m-%d",
}
for column, fmt in datetime_formats.items():
    data[column] = pd.to_datetime(data[column], format=fmt, exact=True)
# can use .dt.tz_localize(tz="US/Eastern") to set to Eastern time then convert to local timezone if we want to

# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2659259 entries, 0 to 2659258
Data columns (total 33 columns):
 #   Column               Dtype         
---  ------               -----         
 0   QUOTE_UNIXTIME       int64         
 1   QUOTE_READTIME       datetime64[ns]
 2   QUOTE_DATE           datetime64[ns]
 3   QUOTE_TIME_HOURS     float64       
 4   UNDERLYING_LAST      float64       
 5   EXPIRE_DATE          datetime64[ns]
 6   EXPIRE_UNIX          int64         
 7   DTE                  float64       
 8   C_DELTA              float64       
 9   C_GAMMA              float64       
 10  C_VEGA               float64       
 11  C_THETA              float64       
 12  C_RHO                float64       
 13  C_IV                 float64       
 14  C_VOLUME             float64       
 15  C_LAST               float64       
 16  C_SIZE               object        
 17  C_BID                float64       
 18  C_ASK                float64       
 19  STRIKE               

In [100]:
# Flag every row that has a missing value in at least one column, then report
# the shape of that subset.
missing_mask = data.isna().any(axis="columns")
print(data.loc[missing_mask].shape)
# 897680/2659259 ≈ 0.338 of rows have a missing value in some column, so we shouldn't
# drop them since it's a large portion of the dataset

(897680, 33)


In [101]:
# Preview the top five rows as a sanity check that the data loaded correctly
data.head(n=5)

Unnamed: 0,QUOTE_UNIXTIME,QUOTE_READTIME,QUOTE_DATE,QUOTE_TIME_HOURS,UNDERLYING_LAST,EXPIRE_DATE,EXPIRE_UNIX,DTE,C_DELTA,C_GAMMA,...,P_LAST,P_DELTA,P_GAMMA,P_VEGA,P_THETA,P_RHO,P_IV,P_VOLUME,STRIKE_DISTANCE,STRIKE_DISTANCE_PCT
0,1556740800,2019-05-01 16:00:00,2019-05-01,16.0,233.98,2019-05-03,1556913600,2.0,0.98465,0.00055,...,0.01,-0.00071,3e-05,0.00046,-0.00975,0.0,2.22548,147.0,104.0,0.444
1,1556740800,2019-05-01 16:00:00,2019-05-01,16.0,233.98,2019-05-03,1556913600,2.0,0.98371,0.00067,...,0.02,-0.00109,9e-05,0.00058,-0.0101,-1e-05,2.08349,12.0,99.0,0.423
2,1556740800,2019-05-01 16:00:00,2019-05-01,16.0,233.98,2019-05-03,1556913600,2.0,0.98458,0.00069,...,0.02,-0.00122,0.00012,0.00113,-0.01435,0.0,2.02359,15.0,94.0,0.402
3,1556740800,2019-05-01 16:00:00,2019-05-01,16.0,233.98,2019-05-03,1556913600,2.0,0.99187,0.00049,...,0.01,-0.00134,9e-05,0.00139,-0.01465,0.0,1.89504,0.0,89.0,0.38
4,1556740800,2019-05-01 16:00:00,2019-05-01,16.0,233.98,2019-05-03,1556913600,2.0,0.99341,0.00039,...,0.01,-0.00176,8e-05,0.00105,-0.01467,0.0,1.76812,91.0,84.0,0.359


In [105]:
# Cut the rows at the midpoint: the first half is used for training, the remainder
# for testing (this assumes the data is already sorted by date).
split_index = data.shape[0] // 2
train_data, test_data = data.iloc[:split_index], data.iloc[split_index:]

# Sanity-check the sizes of the two partitions
print("Training data shape:", train_data.shape)
print("Testing data shape:", test_data.shape)

Training data shape: (1329629, 33)
Testing data shape: (1329630, 33)


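Next, I want to filter for at-the-money (ATM) strikes. To do this, I will only look at rows where the call delta is between 0.45 and 0.55 and the put delta is between -0.55 and -0.45 (i.e. roughly 45-55 delta for calls and -45 to -55 delta for puts).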

In [104]:
# Columns of interest for the ATM analysis
selected_columns = ['QUOTE_READTIME', 'DTE', 'C_DELTA', 'C_IV', 'STRIKE', 'P_DELTA', 'P_IV']

# Keep only near-ATM rows: call delta within [0.45, 0.55] AND put delta within
# [-0.55, -0.45], both bounds inclusive. Rows with a missing delta never match.
# Column names must match exactly what's in the DataFrame.
call_near_atm = data['C_DELTA'].between(0.45, 0.55)
put_near_atm = data['P_DELTA'].between(-0.55, -0.45)
atm_options = data[call_near_atm & put_near_atm]

# Report how many rows survived the filter and preview the first few
print("Number of rows in atm_options:", len(atm_options))
atm_options.head()


Number of rows in atm_options: 119965


Unnamed: 0,QUOTE_UNIXTIME,QUOTE_READTIME,QUOTE_DATE,QUOTE_TIME_HOURS,UNDERLYING_LAST,EXPIRE_DATE,EXPIRE_UNIX,DTE,C_DELTA,C_GAMMA,...,P_LAST,P_DELTA,P_GAMMA,P_VEGA,P_THETA,P_RHO,P_IV,P_VOLUME,STRIKE_DISTANCE,STRIKE_DISTANCE_PCT
27,1556740800,2019-05-01 16:00:00,2019-05-01,16.0,233.98,2019-05-03,1556913600,2.0,0.46754,0.04011,...,4.15,-0.53568,0.04294,0.08063,-0.74779,-0.00943,0.45695,,1.0,0.004
98,1556740800,2019-05-01 16:00:00,2019-05-01,16.0,233.98,2019-05-10,1557518400,9.0,0.54695,0.01962,...,6.89,-0.45314,0.02036,0.15112,-0.40764,-0.02923,0.51013,903.0,1.5,0.006
99,1556740800,2019-05-01 16:00:00,2019-05-01,16.0,233.98,2019-05-10,1557518400,9.0,0.49653,0.02035,...,7.9,-0.50511,0.02124,0.15272,-0.39408,-0.03208,0.49025,,1.0,0.004
181,1556740800,2019-05-01 16:00:00,2019-05-01,16.0,233.98,2019-05-17,1558123200,16.0,0.50681,0.01542,...,10.36,-0.49447,0.01591,0.19924,-0.30326,-0.05516,0.50007,,1.0,0.004
275,1556740800,2019-05-01 16:00:00,2019-05-01,16.0,233.98,2019-05-24,1558728000,23.0,0.546,0.01265,...,10.75,-0.45426,0.01311,0.23667,-0.25555,-0.07316,0.50871,101.0,1.5,0.006


### Data Normalization

In [107]:
from sklearn.preprocessing import MinMaxScaler

In [109]:
selected_features = ['DTE', 'C_DELTA', 'C_IV', 'P_DELTA', 'P_IV']

print("Shape of atm_options:", atm_options.shape)
print("Selected features:", selected_features)
print("Number of rows in atm_options after filtering:", len(atm_options))

# Normalize each selected feature to [0, 1] using MinMaxScaler.
# NOTE(review): the scaler is fitted on every ATM row, including rows that fall in
# the test half of the earlier train/test split; fitting on training rows only would
# avoid leaking test-set min/max into training - confirm intent.
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(atm_options[selected_features])

# Organize the data into fixed-length sliding windows:
#   X[i] = rows i .. i + sequence_length - 1   (the input sequence)
#   y[i] = row  i + sequence_length            (the row right after the window)
# NOTE(review): windows are taken over consecutive rows of atm_options, so a single
# window may mix different quote times / expiries - verify this is intended.
sequence_length = 10

# Clamp at zero so that too-few rows yield empty arrays that still carry the
# (sequence_length, n_features) trailing dimensions instead of a bare shape (0,).
n_windows = max(len(scaled_data) - sequence_length, 0)

# Row i of window_index holds the indices i, i+1, ..., i+sequence_length-1;
# fancy indexing then gathers all windows at once (no Python-level loop).
window_index = np.arange(n_windows)[:, None] + np.arange(sequence_length)
X = scaled_data[window_index]
y = scaled_data[sequence_length:sequence_length + n_windows]

# Print the shape of X and y to verify
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of atm_options: (119965, 33)
Selected features: ['DTE', 'C_DELTA', 'C_IV', 'P_DELTA', 'P_IV']
Number of rows in atm_options after filtering: 119965
Shape of X: (119955, 10, 5)
Shape of y: (119955, 5)
