In [None]:
import yfinance as yf
import pandas as pd
import time
import numpy as np
# Define FAANG tickers and names
faang_tickers = {
    "META": "Meta (Facebook)",
    "AAPL": "Apple",
    "AMZN": "Amazon",
    "NFLX": "Netflix",
    "GOOGL": "Google"
}

# Container for all data
all_faang_options = []

# Loop through each FAANG company
for symbol, name in faang_tickers.items():
    print(f"Fetching options for {name} ({symbol})...")
    ticker = yf.Ticker(symbol)

    expirations = ticker.options

    for i, expiration in enumerate(expirations):
        try:
            option_chain = ticker.option_chain(expiration)

            # Process calls and puts
            for df, opt_type in [(option_chain.calls, 'call'), (option_chain.puts, 'put')]:
                df = df.copy()
                df['expiration'] = expiration
                df['option_type'] = opt_type
                df['ticker'] = symbol
                all_faang_options.append(df)

            print(f"  - [{i+1}/{len(expirations)}] {expiration} OK")
            time.sleep(1)

        except Exception as e:
            print(f"  - Failed for {expiration}: {e}")

# Combine all FAANG options into a single DataFrame
faang_df = pd.concat(all_faang_options, ignore_index=True)

# Show summary
print(f"\n Combined FAANG options shape: {faang_df.shape}")
print(faang_df.head())




Fetching options for Meta (Facebook) (META)...
  - [1/19] 2025-06-13 OK
  - [2/19] 2025-06-20 OK
  - [3/19] 2025-06-27 OK
  - [4/19] 2025-07-03 OK
  - [5/19] 2025-07-11 OK
  - [6/19] 2025-07-18 OK
  - [7/19] 2025-07-25 OK
  - [8/19] 2025-08-15 OK
  - [9/19] 2025-09-19 OK
  - [10/19] 2025-10-17 OK
  - [11/19] 2025-11-21 OK
  - [12/19] 2025-12-19 OK
  - [13/19] 2026-01-16 OK
  - [14/19] 2026-03-20 OK
  - [15/19] 2026-06-18 OK
  - [16/19] 2026-09-18 OK
  - [17/19] 2026-12-18 OK
  - [18/19] 2027-01-15 OK
  - [19/19] 2027-12-17 OK
Fetching options for Apple (AAPL)...
  - [1/21] 2025-06-13 OK
  - [2/21] 2025-06-20 OK
  - [3/21] 2025-06-27 OK
  - [4/21] 2025-07-03 OK
  - [5/21] 2025-07-11 OK
  - [6/21] 2025-07-18 OK
  - [7/21] 2025-07-25 OK
  - [8/21] 2025-08-15 OK
  - [9/21] 2025-09-19 OK
  - [10/21] 2025-10-17 OK
  - [11/21] 2025-11-21 OK
  - [12/21] 2025-12-19 OK
  - [13/21] 2026-01-16 OK
  - [14/21] 2026-02-20 OK
  - [15/21] 2026-03-20 OK
  - [16/21] 2026-06-18 OK
  - [17/21] 2026-09-18 O

In [2]:
faang_df.sample(10)

Unnamed: 0,contractSymbol,lastTradeDate,strike,lastPrice,bid,ask,change,percentChange,volume,openInterest,impliedVolatility,inTheMoney,contractSize,currency,expiration,option_type,ticker
5923,AAPL260320C00195000,2025-06-09 19:51:57+00:00,195.0,27.9,27.7,27.9,-2.34,-7.738096,45.0,1747.0,0.353156,True,REGULAR,USD,2026-03-20,call,AAPL
13355,GOOGL250620C00205000,2025-06-09 19:46:55+00:00,205.0,0.04,0.04,0.05,-0.01,-20.000002,77.0,7536.0,0.384772,False,REGULAR,USD,2025-06-20,call,GOOGL
7416,AMZN250919C00345000,2025-06-09 18:28:56+00:00,345.0,0.08,0.06,0.09,0.0,0.0,16.0,534.0,0.346198,False,REGULAR,USD,2025-09-19,call,AMZN
13029,NFLX270617P01260000,2025-06-05 14:12:18+00:00,1260.0,222.99,227.0,235.9,0.0,0.0,,1.0,0.311664,True,REGULAR,USD,2027-06-17,put,NFLX
13933,GOOGL251017C00180000,2025-06-09 19:45:38+00:00,180.0,12.2,12.2,12.35,1.099999,9.909904,187.0,2107.0,0.335944,False,REGULAR,USD,2025-10-17,call,GOOGL
10232,NFLX250815C00410000,2025-02-20 15:25:26+00:00,410.0,628.93,555.6,562.85,0.0,0.0,1.0,4.0,1e-05,True,REGULAR,USD,2025-08-15,call,NFLX
1223,META250718C00845000,2025-06-06 16:52:10+00:00,845.0,0.95,0.75,0.78,0.0,0.0,3.0,73.0,0.310798,False,REGULAR,USD,2025-07-18,call,META
8390,AMZN270617C00240000,2025-06-09 17:11:49+00:00,240.0,38.88,38.85,39.35,2.150002,5.85353,10.0,638.0,0.391089,False,REGULAR,USD,2027-06-17,call,AMZN
14686,GOOGL270115C00110000,2025-06-09 16:56:06+00:00,110.0,77.38,75.75,78.2,9.129997,13.377286,2.0,148.0,0.514501,True,REGULAR,USD,2027-01-15,call,GOOGL
8496,AMZN271217C00230000,2025-06-09 19:56:57+00:00,230.0,49.65,48.25,50.5,2.900002,6.203212,18.0,733.0,0.407843,False,REGULAR,USD,2027-12-17,call,AMZN


In [4]:
# drop empty values
import numpy as np
faang_df_clean = faang_df.dropna(subset=[
    'lastPrice', 'impliedVolatility', 'strike', 'bid', 'ask', 'openInterest'
])
# clean unreasonable data
faang_df_clean = faang_df_clean[
    (faang_df_clean['lastPrice'] > 0) &
    (faang_df_clean['impliedVolatility'] > 0) &
    (faang_df_clean['bid'] >= 0) &
    (faang_df_clean['ask'] >= 0) &
    (faang_df_clean['openInterest'] > 0)
]
faang_df_clean['midPrice'] = (faang_df_clean['bid'] + faang_df_clean['ask']) / 2
spread = faang_df_clean['ask'] - faang_df_clean['bid']
faang_df_clean['ask_bid_spread']=spread
faang_df_clean = faang_df_clean[spread / faang_df_clean['midPrice'] < 0.5]

from datetime import datetime

# Time to maturity in years
faang_df_clean['expiration'] = pd.to_datetime(faang_df_clean['expiration'])
faang_df_clean['days_to_exp'] = (faang_df_clean['expiration'] - datetime.today()).dt.days
faang_df_clean = faang_df_clean[faang_df_clean['days_to_exp'] > 0]
faang_df_clean['T'] = faang_df_clean['days_to_exp'] / 365

# Fetch spot prices for each ticker (if needed)
spot_prices = {ticker: yf.Ticker(ticker).history(period='1d')['Close'].iloc[-1]
               for ticker in faang_df_clean['ticker'].unique()}

faang_df_clean['spot'] = faang_df_clean['ticker'].map(spot_prices)

# Moneyness
faang_df_clean['moneyness'] = faang_df_clean['spot'] / faang_df_clean['strike']


faang_df_clean = faang_df_clean[
    (faang_df_clean['moneyness'] > 0.8) & (faang_df_clean['moneyness'] < 1.2) &
    (faang_df_clean['T'] > 1/365)  # > 1 day to maturity
]
faang_df_clean['log_moneyness']=np.log(faang_df_clean['moneyness'] )



#Black-Schole Price
import numpy as np
from scipy.stats import norm
def black_scholes_price(S, K, T, r, sigma, option_type):
    if T <= 0 or sigma <= 0 or S <= 0 or K <= 0:
        return np.nan  # skip invalid inputs

    d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / (sigma * np.sqrt(T))
    d2 = d1 - sigma * np.sqrt(T)

    if option_type == 'call':
        return S * norm.cdf(d1) - K * np.exp(-r * T) * norm.cdf(d2)
    elif option_type == 'put':
        return K * np.exp(-r * T) * norm.cdf(-d2) - S * norm.cdf(-d1)
    else:
        return np.nan
risk_free_rate = 0.0433
faang_df_clean['bs_price'] = faang_df_clean.apply(
    lambda row: black_scholes_price(
        S=row['spot'],
        K=row['strike'],
        T=row['T'],
        r=risk_free_rate,
        sigma=row['impliedVolatility'],
        option_type=row['option_type']
    ),
    axis=1
)

faang_df_clean.sample(10)

Unnamed: 0,contractSymbol,lastTradeDate,strike,lastPrice,bid,ask,change,percentChange,volume,openInterest,...,option_type,ticker,midPrice,ask_bid_spread,days_to_exp,T,spot,moneyness,log_moneyness,bs_price
5385,AAPL250919P00235000,2025-06-06 19:56:00+00:00,235.0,31.72,34.05,34.8,0.0,0.0,2.0,2144.0,...,put,AAPL,34.425,0.75,101,0.276712,201.449997,0.857234,-0.154044,32.259148
1600,META250815C00815000,2025-06-09 17:07:28+00:00,815.0,9.25,8.15,8.35,0.0,0.0,1.0,184.0,...,call,META,8.25,0.2,66,0.180822,694.059998,0.851607,-0.16063,8.991387
11393,NFLX251219P01200000,2025-06-09 16:05:18+00:00,1200.0,103.32,103.95,105.75,4.120003,4.153229,1.0,83.0,...,put,NFLX,104.85,1.8,192,0.526027,1224.459961,1.020383,0.020178,92.055023
7448,AMZN250919P00205000,2025-06-09 19:12:15+00:00,205.0,8.2,8.3,8.4,-1.400001,-14.583339,175.0,3540.0,...,put,AMZN,8.35,0.1,101,0.276712,216.979996,1.058439,0.056795,7.41689
13614,GOOGL250718C00210000,2025-06-09 19:54:00+00:00,210.0,0.3,0.29,0.32,0.03,11.111111,131.0,6255.0,...,call,GOOGL,0.305,0.03,38,0.10411,176.089996,0.838524,-0.176112,0.332371
2140,META250919P00610000,2025-06-09 19:09:42+00:00,610.0,16.9,17.6,17.85,-1.300001,-7.142864,114.0,533.0,...,put,META,17.725,0.25,101,0.276712,694.059998,1.137803,0.129099,15.740596
12280,NFLX260918P01100000,2025-06-05 15:15:38+00:00,1100.0,112.27,116.1,123.95,0.0,0.0,3.0,2.0,...,put,NFLX,120.025,7.85,465,1.273973,1224.459961,1.113145,0.10719,97.849546
6425,AAPL270115P00220000,2025-06-09 19:40:56+00:00,220.0,34.24,34.05,34.4,1.84,5.679013,7.0,4392.0,...,put,AAPL,34.225,0.35,584,1.6,201.449997,0.915682,-0.088086,25.135023
7137,AMZN250718C00210000,2025-06-09 19:59:40+00:00,210.0,12.75,12.55,12.65,2.17,20.510397,1728.0,14822.0,...,call,AMZN,12.6,0.1,38,0.10411,216.979996,1.033238,0.032698,13.123925
9005,NFLX250620C01205000,2025-06-09 19:44:56+00:00,1205.0,31.8,32.15,33.45,-17.39,-35.352715,7.0,291.0,...,call,NFLX,32.8,1.3,10,0.027397,1224.459961,1.016149,0.01602,33.383128


In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import norm
import yfinance as yf
from datetime import datetime
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_absolute_error as mae

# let us pick the data we want to focus on
df=faang_df_clean[(faang_df_clean['option_type']=='call')]

#df=faang_df_clean[(faang_df_clean['ticker']=='AAPL') & (faang_df_clean['option_type']=='call')]

df_train, df_test= train_test_split(df, test_size=0.2, shuffle=True,random_state=31)


In [6]:
# consider all possibilites of features combinations
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge
import itertools
features = ['bs_price', 'impliedVolatility', 'log_moneyness', 'ask_bid_spread']
n_splits=5
improved = 0
best_features = []

    
for i in range(1, len(features) + 1):
    for subset in itertools.combinations(features, i):
        mse_linear_new=0
        mse_ridge_new=0
        mse_bs=0
        mae_linear_new=0
        mae_ridge_new=0
        mae_bs=0
        
            #if mse_new < best_mse:
            #    best_mse = mse_new
            #    best_features = list(subset) 
        kfold=KFold(n_splits, shuffle=True)
        for train_index,test_index in kfold.split(df_train):
            df_tt=df_train.iloc[train_index]
            df_ho=df_train.iloc[test_index]
            
            
            linearmodel=LinearRegression()
            linearmodel.fit(df_tt[list(subset)],df_tt['lastPrice'])
            pred=linearmodel.predict(df_ho[list(subset)])

            ridgemodel=Ridge(alpha=1)
            ridgemodel.fit(df_tt[list(subset)],df_tt['lastPrice'])
            pred_ridge=ridgemodel.predict(df_ho[list(subset)])

            
            mse_linear_new+= mse(pred, df_ho['lastPrice'])
            mse_ridge_new+= mse(pred_ridge, df_ho['lastPrice'])
            mse_bs+=mse(df_ho['bs_price'],df_ho['lastPrice'])
            
            mae_linear_new+=mae(pred, df_ho['lastPrice'])
            mae_bs+=mae(df_ho['bs_price'],df_ho['lastPrice'])
            mae_ridge_new+= mae(pred_ridge, df_ho['lastPrice'])
        
        
        mse_linear_new/=n_splits
        mse_ridge_new/=n_splits
        mse_bs/=n_splits
        
        mae_linear_new/=n_splits
        mae_ridge_new/=n_splits
        mae_bs/=n_splits
        
        if (1-mse_linear_new/mse_bs+1-mae_linear_new/mae_bs)/2>improved:
            improved = (1-mse_linear_new/mse_bs+1-mae_linear_new/mae_bs)/2
            best_features = list(subset)
            linearwin=1
        if (1-mse_ridge_new/mse_bs+1-mae_ridge_new/mae_bs)/2>improved:
            improved = (1-mse_ridge_new/mse_bs+1-mae_ridge_new/mae_bs)/2
            best_features = list(subset)
            linearwin=0
            

print(best_features, 'with average improvement of', round(improved*100,2),'%')
print('original bs model has mse', mse_bs, 'and it has mae', mae_bs)
print('linear model has mse', mse_linear_new, 'and it has mae', mae_linear_new)
print('ridge regression model has mse', mse_ridge_new, 'and it has mae', mae_ridge_new)



                
print(linearwin)                



['bs_price', 'impliedVolatility', 'log_moneyness'] with average improvement of 46.15 %
original bs model has mse 151.14500437814144 and it has mae 6.126020624557827
linear model has mse 59.55201926979178 and it has mae 4.233481591127624
ridge regression model has mse 59.537743494350536 and it has mae 4.196872457513469
0


In [7]:
model=LinearRegression()
model.fit(df_train[features],df_train['lastPrice'])
predic=model.predict(df_test[features])
final=mse(predic, df_test['lastPrice'])
final


83.219304083581