In [12]:
# If needed, install the dependencies listed in requirements.txt before running this notebook.


In [13]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import ta
import yfinance as yf

In [14]:
# Universe of ticker symbols that every later cell downloads and processes.
tickers = [
    "AAPL", "MSFT", "AMZN", "GOOGL", "NVDA", "TSLA", "JPM",
    "WMT", "DAL", "UAL", "LMT", "RTX", "NOC", "XOM",
]

In [15]:
# Price Data
# Download daily price/volume bars for every symbol in `tickers` (requires network access).
# auto_adjust is pinned explicitly so the result does not depend on the default of the
# installed yfinance version; the rest of the notebook works with "Close" only and
# expects no separate "Adj Close" column.
data = yf.download(
    tickers,
    start="2015-01-01",
    end="2025-01-01",
    auto_adjust=True,
)

  data = yf.download(tickers, start="2015-01-01", end="2025-01-01")
[*********************100%***********************]  14 of 14 completed


In [16]:
# Multi Index
# Move the second column level (level 1, the ticker) from the columns into the rows,
# then turn the resulting row index into ordinary columns, giving one row per
# (Date, Ticker) pair.
# future_stack=True opts into pandas' new stack implementation (pandas >= 2.1) and
# avoids the deprecation warning of the legacy one. The new implementation keeps
# rows whose values are all NaN, so they are dropped explicitly to retain the
# result the legacy call produced.
data_flat = (
    data.stack(level=1, future_stack=True)
    .dropna(how="all")
    .reset_index()
)

  data_flat = data.stack(level=1).reset_index()


In [17]:
# Column Naming
# Normalise the labels produced by reset_index(): an unnamed stacked level comes back
# as "level_1", and "Adj Close" (if present) loses its space. rename() silently ignores
# keys that are not in the frame, so the mapping is safe when either label is absent.
# The frame is rebound instead of mutated in place so that re-running the cell is harmless.
data_flat = data_flat.rename(columns={"level_1": "Ticker", "Adj Close": "AdjClose"})

print(data_flat.columns)  # Check
data_flat.head()

Index(['Date', 'Ticker', 'Close', 'High', 'Low', 'Open', 'Volume'], dtype='object', name='Price')


Price,Date,Ticker,Close,High,Low,Open,Volume
0,2015-01-02,AAPL,24.237551,24.70532,23.7986,24.694235,212818400
1,2015-01-02,AMZN,15.426,15.7375,15.348,15.629,55664000
2,2015-01-02,DAL,43.064991,43.791788,42.653429,43.712978,8637300
3,2015-01-02,GOOGL,26.296135,26.606494,26.213205,26.447589,26480000
4,2015-01-02,JPM,46.720936,47.072332,46.406919,46.489162,12600000


In [18]:
# Feature Build
# Every feature is computed within a ticker's own series, so no rolling window
# ever mixes rows of two different symbols.
close_by_ticker = data_flat.groupby("Ticker")["Close"]


def _rsi_14(close):
    """Return the 14-period RSI of one ticker's close series."""
    return ta.momentum.rsi(close, window=14)


def _volume_zscore_20(volume):
    """Return (volume - 20-row rolling mean) / 20-row rolling std for one ticker."""
    return (volume - volume.rolling(20).mean()) / volume.rolling(20).std()


# Daily Return: percentage change of Close from the previous row of the same ticker
data_flat["Return"] = close_by_ticker.pct_change()

# Rolling 10D Vol: standard deviation of the last 10 returns of the same ticker
return_std_10d = data_flat.groupby("Ticker")["Return"].rolling(window=10).std()
# The grouped rolling result is indexed by (Ticker, original row label); dropping the
# Ticker level lets the assignment align on data_flat's own index.
data_flat["RollingVol"] = return_std_10d.droplevel(0)

# RSI
data_flat["RSI"] = close_by_ticker.transform(_rsi_14)

# Moving Avg: simple means of Close over the last 20 and 50 rows
data_flat["SMA_20"] = close_by_ticker.transform(lambda close: close.rolling(20).mean())
data_flat["SMA_50"] = close_by_ticker.transform(lambda close: close.rolling(50).mean())

# Volume Anomaly: z-score of Volume against its own trailing 20-row window
data_flat["Volume_Z"] = data_flat.groupby("Ticker")["Volume"].transform(_volume_zscore_20)

In [19]:
# Macro Sentiment
# VIX Index: download ^VIX over the same date range as the equity prices
# (requires network access). auto_adjust is pinned explicitly so the result does not
# depend on the default of the installed yfinance version.
macro = yf.download(["^VIX"], start="2015-01-01", end="2025-01-01", auto_adjust=True)

# Flatten Columns: keep just the first level (Open, Close, High, Low, Volume).
# Guarded because on already-flat string labels `col[0]` would keep only the first
# character of each name.
if isinstance(macro.columns, pd.MultiIndex):
    macro.columns = [col[0] for col in macro.columns]

# Check
print(macro.columns.tolist())

# Close for VIX Feature
macro = macro.reset_index().rename(columns={"Close": "VIX"})

# Merge in Main Data
# Any VIX column left over from a previous run of this cell is dropped first;
# otherwise a re-run would produce VIX_x / VIX_y instead of a single VIX column.
# validate="m:1" raises if `macro` ever contains duplicate dates, which would
# otherwise silently duplicate price rows.
data_flat = data_flat.drop(columns="VIX", errors="ignore").merge(
    macro[["Date", "VIX"]],
    on="Date",
    how="left",
    validate="m:1",
)

  macro = yf.download(["^VIX"], start="2015-01-01", end="2025-01-01")
[*********************100%***********************]  1 of 1 completed

['Close', 'High', 'Low', 'Open', 'Volume']





In [20]:
# Labels
# Direction: 1 when the row's Return is positive, otherwise 0. A NaN Return compares
# as False and therefore also yields 0; such rows are removed by the dropna() in the
# export cell.
data_flat["Direction"] = (data_flat["Return"] > 0).astype(int)

# Create Volatility Spike Label
# VolSpike is 1 when RollingVol exceeds the 80% quantile of the same ticker's
# RollingVol values seen up to and including that row.
# WHY expanding(): a whole-sample quantile would use data from 2025 to judge
# volatility in 2016 (look-ahead bias). An expanding window defines "high volatility"
# only from what was known AT THAT TIME.
# WHY transform(): it returns a result aligned to data_flat's index, so no index
# reshaping is needed and the outcome does not depend on how groupby.apply handles
# group keys in the installed pandas version.
data_flat["VolSpike"] = (
    data_flat.groupby("Ticker")["RollingVol"]
    .transform(lambda vol: vol > vol.expanding().quantile(0.8))
    .astype(int)
)

In [21]:
# Make sure the output folder exists before the export cell writes into it.
# `Path` comes from the notebook's import cell.
Path('data').mkdir(exist_ok=True)

In [22]:
# Export
# Keep only rows in which every column is populated (e.g. the leading rows of each
# ticker, where the rolling windows are not yet full, are removed).
data_flat_clean = data_flat.dropna()

# Saved
data_flat_clean.to_csv("data/merged_features.csv", index=False)

# info() prints its summary directly; head() is kept as the cell's final expression
# so the notebook renders it as well (a bare expression earlier in a cell is discarded).
data_flat_clean.info()
data_flat_clean.head()

<class 'pandas.core.frame.DataFrame'>
Index: 34538 entries, 686 to 35223
Data columns (total 16 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   Date        34538 non-null  datetime64[ns]
 1   Ticker      34538 non-null  object        
 2   Close       34538 non-null  float64       
 3   High        34538 non-null  float64       
 4   Low         34538 non-null  float64       
 5   Open        34538 non-null  float64       
 6   Volume      34538 non-null  int64         
 7   Return      34538 non-null  float64       
 8   RollingVol  34538 non-null  float64       
 9   RSI         34538 non-null  float64       
 10  SMA_20      34538 non-null  float64       
 11  SMA_50      34538 non-null  float64       
 12  Volume_Z    34538 non-null  float64       
 13  VIX         34538 non-null  float64       
 14  Direction   34538 non-null  int64         
 15  VolSpike    34538 non-null  int64         
dtypes: datetime64[ns](1), flo