**Before you run this file**, make sure you have changed **file_loc** to point to your local FAANG_data directory.


**This is where we perform data cleaning:**


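We drop rows with missing values (NA) and make sure that last price, implied volatility and open interest are positive and that bid and ask are non-negative (sanity check).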

We convert entries into convenient units.

We filter the data so that<br> 1. 0.8 < moneyness < 1.2 (moneyness = spot / strike) <br> 2. spread / mid_price < 0.5 (spread = ask - bid, mid_price = (bid + ask) / 2) <br> 3. Time to expiration > 1 day

In [11]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import datetime
from scipy.stats import norm
from sklearn.metrics import mean_squared_error as mse

# NOTE: machine-specific absolute path -- point this at your local FAANG_data
# directory (with a trailing slash) before running.
file_loc = "/Users/zouruiqi2020/Option-Price-Prediction/Final/Data Collection/FAANG_data/"

# Raw option data; every column referenced below is expected to be in this CSV.
faang_df = pd.read_csv(file_loc + "FAANG_data.csv")

# Drop rows that are missing any of the fields the filters below rely on.
faang_df_clean = faang_df.dropna(subset=[
    'lastPrice', 'impliedVolatility', 'strike', 'bid', 'ask', 'openInterest'
])
# Sanity filter: strictly positive price / volatility / open interest,
# non-negative quotes.
# .copy() makes the filtered frame own its data, so the column assignments
# below do not write into a slice of another frame (SettingWithCopyWarning).
faang_df_clean = faang_df_clean[
    (faang_df_clean['lastPrice'] > 0) &
    (faang_df_clean['impliedVolatility'] > 0) &
    (faang_df_clean['bid'] >= 0) &
    (faang_df_clean['ask'] >= 0) &
    (faang_df_clean['openInterest'] > 0)
].copy()
faang_df_clean['midPrice'] = (faang_df_clean['bid'] + faang_df_clean['ask']) / 2
spread = faang_df_clean['ask'] - faang_df_clean['bid']
faang_df_clean['ask_bid_spread'] = spread
# Keep quotes whose spread is below half of the mid price. Rows with
# bid == ask == 0 give 0/0 = NaN, which compares False and is dropped too.
faang_df_clean = faang_df_clean[spread / faang_df_clean['midPrice'] < 0.5].copy()

# Time to maturity in years
faang_df_clean['expiration'] = pd.to_datetime(faang_df_clean['expirationDate'])
# NOTE(review): days are counted from the moment this script runs
# (datetime.today()), not from when the CSV was collected, so results change
# with the run date -- confirm this is intended.
faang_df_clean['days_to_exp'] = (faang_df_clean['expiration'] - datetime.today()).dt.days
faang_df_clean = faang_df_clean[faang_df_clean['days_to_exp'] > 0].copy()
faang_df_clean['T'] = faang_df_clean['days_to_exp'] / 365

# Fetch the most recent close for each ticker (one network request per ticker)
# and use it as the spot price for every contract on that ticker.
spot_prices = {ticker: yf.Ticker(ticker).history(period='1d')['Close'].iloc[-1]
               for ticker in faang_df_clean['ticker'].unique()}

faang_df_clean['spot'] = faang_df_clean['ticker'].map(spot_prices)

# Moneyness, defined here as spot / strike
faang_df_clean['moneyness'] = faang_df_clean['spot'] / faang_df_clean['strike']

# Keep near-the-money contracts with more than one day to maturity.
faang_df_clean = faang_df_clean[
    (faang_df_clean['moneyness'] > 0.8) & (faang_df_clean['moneyness'] < 1.2) &
    (faang_df_clean['T'] > 1/365)  # > 1 day to maturity
].copy()
faang_df_clean['log_moneyness'] = np.log(faang_df_clean['moneyness'])


# Black-Scholes price

def black_scholes_price(S, K, T, r, sigma, option_type):
    """Return the Black-Scholes price of a call or put for scalar inputs.

    Args:
        S: Spot price of the underlying.
        K: Strike price.
        T: Time to maturity.
        r: Risk-free rate.
        sigma: Volatility.
        option_type: ``'call'`` or ``'put'`` (exact, case-sensitive match).

    Returns:
        The option price, or ``np.nan`` when any of ``T``, ``sigma``, ``S``
        or ``K`` is not strictly positive, or when ``option_type`` is
        neither ``'call'`` nor ``'put'``.
    """
    # Non-positive inputs make log(S / K) or the division by
    # sigma * sqrt(T) meaningless, so bail out before computing anything.
    if any(value <= 0 for value in (T, sigma, S, K)):
        return np.nan

    vol_sqrt_t = sigma * np.sqrt(T)
    d1 = (np.log(S / K) + (r + 0.5 * sigma**2) * T) / vol_sqrt_t
    d2 = d1 - vol_sqrt_t
    discounted_strike = K * np.exp(-r * T)

    if option_type == 'call':
        return S * norm.cdf(d1) - discounted_strike * norm.cdf(d2)
    if option_type == 'put':
        return discounted_strike * norm.cdf(-d2) - S * norm.cdf(-d1)
    # Unknown option type
    return np.nan
# Risk-free rate passed to Black-Scholes as r (hard-coded).
# NOTE(review): presumably an annualised rate, matching T in years -- confirm.
risk_free_rate = 0.0433
# Row-by-row Black-Scholes price using each contract's own implied volatility.
faang_df_clean['bs_price'] = faang_df_clean.apply(
    lambda row: black_scholes_price(
        S=row['spot'],
        K=row['strike'],
        T=row['T'],
        r=risk_free_rate,
        sigma=row['impliedVolatility'],
        option_type=row['optionType']
    ),
    axis=1
)

# Preview up to 10 random rows. The size is capped at the row count because
# sample(10) raises ValueError on a frame with fewer than 10 rows, which would
# abort the script before the CSV below is written.
print(faang_df_clean.sample(min(10, len(faang_df_clean))))

# Save to CSV (overwrite if file already exists)
faang_df_clean.to_csv(file_loc + "FAANG_data_processed.csv", index=False)


            contractSymbol              lastTradeDate  strike  lastPrice  \
5026   AAPL271217C00235000  2025-06-23 16:04:15+00:00   235.0      28.22   
7772   META260116C00720000  2025-06-23 19:40:32+00:00   720.0      68.95   
9677   NFLX250703C01212500  2025-06-23 13:50:34+00:00  1212.5      37.83   
12557  NFLX260618C01220000  2025-06-18 17:24:33+00:00  1220.0     208.00   
4516   AAPL260618P00220000  2025-06-23 13:55:02+00:00   220.0      28.90   
2024   AMZN251017C00260000  2025-06-23 19:51:00+00:00   260.0       1.55   
3318   AAPL250711P00185000  2025-06-23 19:59:44+00:00   185.0       0.55   
3643   AAPL250815P00250000  2025-06-23 19:56:04+00:00   250.0      48.40   
11656  NFLX251219C01220000  2025-06-23 14:47:24+00:00  1220.0     155.15   
10500  NFLX250725P01135000  2025-06-23 16:09:41+00:00  1135.0      22.00   

          bid     ask    change  percentChange  volume  openInterest  ...  \
5026    27.65   29.95  1.469999       5.495325     1.0         376.0  ...   
7772    6