In [1]:
# Vol Surface Constructor

# 1/ Collect Data from YF
# 2/ Process and clean data
# 3/ Calculate IV
# 4/ Contsruct Vol Surface

In [2]:
import yfinance as yf
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
import plotly.express as px
from sklearn.neighbors import NearestNeighbors
from scipy.stats import zscore



In [3]:
# Define the ticker symbol (e.g., SPY)
ticker = 'SPY'

# Get the data for the ticker
stock_data = yf.Ticker(ticker)

# Fetch expiration dates
expiration_dates = stock_data.options

# Initialize an empty list to store the option data
option_data = []

# Loop through all expiration dates and collect the data
for exp in expiration_dates:
    # Fetch the option chain for each expiration date
    option_chain = stock_data.option_chain(exp)
    
    # Extract calls and puts DataFrames
    calls = option_chain.calls
    puts = option_chain.puts
    
    # Add expiration date and option type (call/put) columns to both DataFrames
    calls['expiration_date'] = exp
    calls['type'] = 'call'
    puts['expiration_date'] = exp
    puts['type'] = 'put'
    
    # Append both calls and puts to the option_data list
    option_data.append(calls)
    option_data.append(puts)

# Combine the data for all expiration dates into a single DataFrame
options_df = pd.DataFrame(pd.concat(option_data, ignore_index=True))

# Check the resulting DataFrame
options_df


Unnamed: 0,contractSymbol,lastTradeDate,strike,lastPrice,bid,ask,change,percentChange,volume,openInterest,impliedVolatility,inTheMoney,contractSize,currency,expiration_date,type
0,SPY241231C00350000,2024-12-31 14:30:08+00:00,350.0,238.13,236.80,238.32,-4.769989,-1.963767,9.0,14,4.685551,True,REGULAR,USD,2024-12-31,call
1,SPY241231C00355000,2024-12-31 15:44:33+00:00,355.0,233.98,232.25,233.42,-10.840012,-4.427748,12.0,42,3.898438,True,REGULAR,USD,2024-12-31,call
2,SPY241231C00360000,2024-12-19 20:43:08+00:00,360.0,229.00,227.33,228.50,0.000000,0.000000,3.0,0,3.957031,True,REGULAR,USD,2024-12-31,call
3,SPY241231C00365000,2024-12-19 20:46:26+00:00,365.0,223.51,222.16,224.23,0.000000,0.000000,98.0,0,4.220708,True,REGULAR,USD,2024-12-31,call
4,SPY241231C00370000,2024-12-20 17:37:39+00:00,370.0,225.52,217.05,218.24,0.000000,0.000000,8.0,1,2.812503,True,REGULAR,USD,2024-12-31,call
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7687,SPY270115P00745000,2024-12-26 19:33:13+00:00,745.0,143.80,154.74,159.86,0.000000,0.000000,1.0,2,0.106866,True,REGULAR,USD,2027-01-15,put
7688,SPY270115P00750000,2024-12-24 17:18:36+00:00,750.0,150.02,159.74,164.86,0.000000,0.000000,,0,0.109155,True,REGULAR,USD,2027-01-15,put
7689,SPY270115P00870000,2024-10-10 13:30:05+00:00,870.0,292.17,269.50,274.29,0.000000,0.000000,,0,0.000010,True,REGULAR,USD,2027-01-15,put
7690,SPY270115P00900000,2024-12-16 14:51:59+00:00,900.0,294.67,310.07,314.86,0.000000,0.000000,1.0,0,0.168099,True,REGULAR,USD,2027-01-15,put


In [4]:
# Data Cleaning

# Convert expiration date to datetime
options_df["expiration_date"] = pd.to_datetime(options_df["expiration_date"])

# remove rows with 'null' values
options_df = options_df[~options_df.isnull().any(axis=1)]

# drop duplicates
options_df = options_df.drop_duplicates(subset="contractSymbol")

# drop columns
options_df.drop(columns=["lastTradeDate","change","percentChange","inTheMoney","contractSize","currency"],inplace=True, errors="ignore")

# Normalise
current_price = yf.Ticker(ticker).history(period="1d").Close.iloc[-1]
options_df["normalised_price"] = options_df["strike"] / current_price
options_df = options_df[(options_df["normalised_price"] <= 1.2) & (options_df["normalised_price"] >= 0.8)]

# time to maturity (+ filtering out long date options)
options_df["time_to_maturity"] = (options_df.expiration_date - pd.to_datetime("today")).dt.days
options_df = options_df[(options_df["time_to_maturity"] <= 180) & (options_df["time_to_maturity"] > 0)]

# Filter out thinly traded options (low open interest)
options_df = options_df[options_df.openInterest > 1000] # may need to change this -> this would need to change for diff tickers

options_df


Unnamed: 0,contractSymbol,strike,lastPrice,bid,ask,volume,openInterest,impliedVolatility,expiration_date,type,normalised_price,time_to_maturity
524,SPY250102C00580000,580.0,8.51,8.46,8.55,34.0,1139,0.164925,2025-01-02,call,0.987049,1
529,SPY250102C00585000,585.0,4.29,4.42,4.46,516.0,1384,0.139657,2025-01-02,call,0.995558,1
530,SPY250102C00586000,586.0,3.82,3.77,3.81,396.0,1083,0.137704,2025-01-02,call,0.997260,1
531,SPY250102C00587000,587.0,3.27,3.25,3.28,1870.0,1486,0.139047,2025-01-02,call,0.998962,1
533,SPY250102C00589000,589.0,2.16,2.16,2.18,10760.0,1500,0.132150,2025-01-02,call,1.002366,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5134,SPY250630P00568000,568.0,14.79,14.96,15.04,1.0,1371,0.144952,2025-06-30,put,0.966628,180
5138,SPY250630P00580000,580.0,17.03,18.08,18.15,39.0,2325,0.132424,2025-06-30,put,0.987049,180
5139,SPY250630P00585000,585.0,19.58,19.49,19.56,3.0,1478,0.126382,2025-06-30,put,0.995558,180
5140,SPY250630P00590000,590.0,20.15,21.08,21.16,22.0,1170,0.120446,2025-06-30,put,1.004067,180


In [5]:
# Outlier Detection for Call Options

calls_df = options_df[options_df['type'] == 'call']

# Extract the relevant features (normalized_price, time_to_maturity, and impliedVolatility)
X = calls_df[['normalised_price', 'time_to_maturity']].values
IV = calls_df['impliedVolatility'].values

# Step 1: Fit a KNN model to the feature data
k = 5  # Number of nearest neighbors to consider
knn = NearestNeighbors(n_neighbors=k)
knn.fit(X)

# Step 2: Find the k nearest neighbors for each point
distances, indices = knn.kneighbors(X)

# Step 3: Calculate the average IV of the neighbors
avg_IV_neighbors = []
for i in range(len(calls_df)):
    # Get the IVs of the k nearest neighbors
    neighbor_IVs = IV[indices[i]]
    avg_IV_neighbors.append(np.mean(neighbor_IVs))

# Step 4: Compute the difference between the IV of each point and its neighbors' average IV
iv_diff = np.abs(IV - avg_IV_neighbors)

# Step 5: Define a threshold for outliers (e.g., 2 standard deviations away from the mean IV difference)
threshold = 3 * np.std(iv_diff)

# Step 6: Flag outliers
outliers = iv_diff > threshold

# Step 7: Add the 'Outlier' column to the DataFrame
calls_df['Outlier'] = outliers

calls_df = calls_df[calls_df.Outlier == False]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  calls_df['Outlier'] = outliers


In [6]:
# Outlier Detection for Put Options

puts_df = options_df[options_df['type'] == 'put']

# Extract the relevant features (normalized_price, time_to_maturity, and impliedVolatility)
X = puts_df[['normalised_price', 'time_to_maturity']].values
IV = puts_df['impliedVolatility'].values

# Step 1: Fit a KNN model to the feature data
k = 5  # Number of nearest neighbors to consider
knn = NearestNeighbors(n_neighbors=k)
knn.fit(X)

# Step 2: Find the k nearest neighbors for each point
distances, indices = knn.kneighbors(X)

# Step 3: Calculate the average IV of the neighbors
avg_IV_neighbors = []
for i in range(len(puts_df)):
    # Get the IVs of the k nearest neighbors
    neighbor_IVs = IV[indices[i]]
    avg_IV_neighbors.append(np.mean(neighbor_IVs))

# Step 4: Compute the difference between the IV of each point and its neighbors' average IV
iv_diff = np.abs(IV - avg_IV_neighbors)

# Step 5: Define a threshold for outliers (e.g., 2 standard deviations away from the mean IV difference)
threshold = 3 * np.std(iv_diff)

# Step 6: Flag outliers
outliers = iv_diff > threshold

# Step 7: Add the 'Outlier' column to the DataFrame
puts_df['Outlier'] = outliers

puts_df = puts_df[puts_df.Outlier == False]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  puts_df['Outlier'] = outliers


In [8]:

# Plot for calls
fig_calls = px.scatter_3d(
    calls_df,
    x="normalised_price", 
    y="time_to_maturity",   
    z="impliedVolatility",  
    color="impliedVolatility",  
    title="Call Options - Implied Volatility Surface",
)

fig_calls.update_traces(marker=dict(size=3))  # Adjust marker size
fig_calls.show()


# Plot for puts
fig_puts = px.scatter_3d(
    puts_df,
    x="normalised_price",  # x-axis (strike price)
    y="time_to_maturity",   # y-axis (time to maturity)
    z="impliedVolatility",  # z-axis (IV)
    color="impliedVolatility",  # Optional: Color by expiration date
    title="Put Options - Implied Volatility Surface"
)
fig_puts.update_traces(marker=dict(size=3))
fig_puts.show()



In [None]:
import numpy as np
import pandas as pd
from scipy.interpolate import griddata
import plotly.graph_objects as go
from scipy.ndimage import gaussian_filter

# Assume options_df is your DataFrame with columns: 'strike', 'time_to_maturity', and 'IV'

# Create a grid of strike prices and times to maturity (for interpolation)
strike_grid = np.linspace(calls_df['normalised_price'].min(), calls_df['normalised_price'].max(), 50)
maturity_grid = np.linspace(calls_df['time_to_maturity'].min(), calls_df['time_to_maturity'].max(), 50)

# Create a meshgrid of strike price and time to maturity
X, Y = np.meshgrid(strike_grid, maturity_grid)

# Perform cubic spline interpolation to get implied volatilities for the grid points
Z = griddata(
    (calls_df['normalised_price'], calls_df['time_to_maturity']),
    calls_df['impliedVolatility'],
    (X, Y),
    method='cubic'
)

Z_smooth = gaussian_filter(Z, sigma=1.5)

# Create the 3D surface plot using plotly.graph_objects
fig = go.Figure(data=[go.Surface(z=Z_smooth, x=X, y=Y)])

# Add title and labels
fig.update_layout(
    title="Implied Volatility Surface (Cubic Spline)",
    scene=dict(
        xaxis_title='Strike Price',
        yaxis_title='Time to Maturity',
        zaxis_title='Implied Volatility'
    )
)

# Show the plot
fig.show()





In [None]:
from scipy.stats import zscore

options_df['Z-Score'] = zscore(options_df['impliedVolatility'])
options_df
# options_df_cleaned = options_df[options_df['Z-Score'].abs() < 3]


In [None]:
import plotly.express as px
import numpy as np
import pandas as pd
from scipy.stats import zscore

# Sample DataFrame (replace with your data)
data = {
    'strike': [600, 610, 620, 630, 640, 650],
    'time_to_maturity': [30, 60, 90, 120, 150, 180],
    'impliedVolatility': [0.2, 0.25, 0.18, 0.22, 0.3, 0.28]
}
options_df = pd.DataFrame(data)

# Calculate Z-scores for implied volatility
options_df['Z-Score'] = zscore(options_df['impliedVolatility'])

# Create hover text including Z-score
options_df['hover_text'] = options_df.apply(lambda row: f"Strike: {row['strike']}, Maturity: {row['time_to_maturity']} days, IV: {row['impliedVolatility']*100}%, Z-Score: {row['Z-Score']:.2f}", axis=1)

# Plot the 3D scatter plot with hover text
fig = px.scatter_3d(options_df, x='strike', y='time_to_maturity', z='impliedVolatility',
                    hover_name='hover_text', title="3D Scatter Plot with Z-Scores")

# Show the plot
fig.show()


In [11]:
# Notes

# how to deal with outliers in my scatter plot