In [270]:
import pandas as pd
import numpy as np
import os, pickle
import warnings
import datetime
warnings.filterwarnings('ignore')

In [271]:
# Relative path of the folder that holds the raw daily stock CSV files
path_to_data = r"../../data/raw/daily stock data"

In [272]:
# Recursively collect the full path of every .csv file found under path_to_data
csv_files = [
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk(path_to_data)
    for name in names
    if name.endswith('.csv')
]

In [273]:
# Read every CSV and stack the rows into one long frame, tagging each row with its ticker.
# The ticker is the part of the file's *base name* before the first underscore, upper-cased.
# os.path.basename is used so the parsing does not depend on the OS path separator
# or on underscores appearing in directory names.
frames = []
for file in csv_files:
    ticker = os.path.basename(file).split('_')[0].upper()
    df_temp = pd.read_csv(file)
    df_temp['ticker'] = ticker
    frames.append(df_temp)

# Fail loudly instead of leaving df_main undefined (or stale from an earlier run).
if not frames:
    raise FileNotFoundError("No CSV files were found; check path_to_data")

# One concat at the end avoids re-copying the accumulated frame for every file.
df_main = pd.concat(frames, ignore_index=True)
df_main['ticker'].value_counts()

ticker
AAL     2115
ALGT    2115
ALK     2115
BNO     2115
DAL     2115
ITA     2115
IYT     2115
JBLU    2115
JETS    2115
LUV     2115
UAL     2115
Name: count, dtype: int64

In [274]:
# Remove exact duplicate rows, then any row that has a missing value
df_main = df_main.drop_duplicates().dropna()

In [275]:
# Strip the percent sign from '%Chg' and store it as a float
pct_text = df_main['%Chg'].str.replace('%', '')
df_main['%Chg'] = pct_text.astype(float)
# Parse 'Time' (YYYY-MM-DD strings) into datetimes
df_main['Time'] = pd.to_datetime(df_main['Time'], format='%Y-%m-%d')

In [276]:
# Finance variables: intraday range and open-vs-last features.
# NOTE(review): the column names do not match the formulas literally —
# 'High-Low%' is High/Open - 1, 'Last-Open' is Open - Last and
# 'Last-Open%' is Open/Last - 1. Confirm this is intended.
df_main['High-Low'] = df_main['High'].sub(df_main['Low'])
df_main['High-Low%'] = df_main['High'].div(df_main['Open']).sub(1)
df_main['Last-Open'] = df_main['Open'].sub(df_main['Last'])
df_main['Last-Open%'] = df_main['Open'].div(df_main['Last']).sub(1)

df_main['Log_Volume'] = np.log(df_main['Volume'])

# Order rows by time within each ticker so rolling windows and diffs follow the calendar
df_main = df_main.sort_values(by=['ticker', 'Time'])

# 10-row rolling standard deviation per ticker ('Roll_SD_*'), and the 10-row rolling
# standard deviation of that series ('Roll_SDSD_*'), for both 'Last' and 'Volume'
for base in ('Last', 'Volume'):
    first_pass = f'Roll_SD_{base}_10'
    second_pass = f'Roll_SDSD_{base}_10'
    for src, dst in ((base, first_pass), (first_pass, second_pass)):
        rolled = df_main.groupby('ticker')[src].rolling(window=10).std()
        # Drop the ticker level so the result aligns with df_main's own index
        df_main[dst] = rolled.reset_index(level=0, drop=True)

# Row-over-row change of each feature within a ticker
diff_sources = [
    'Last', 'Volume', 'High-Low', 'Last-Open',
    'Roll_SD_Last_10', 'Roll_SDSD_Last_10',
    'Roll_SD_Volume_10', 'Roll_SDSD_Volume_10',
]
for col in diff_sources:
    df_main[f'Change_{col}'] = df_main.groupby('ticker')[col].diff()

In [277]:
# Inputs for the log dollar-volume shock (Goyenko et al. 2025):
# log of (mean of Open/High/Low/Last times Volume) and its 5-row rolling mean per ticker.
# The average price and raw dollar volume are kept as local Series, not as columns.
mean_price = df_main[['Open', 'High', 'Low', 'Last']].mean(axis=1)
dollar_volume = mean_price * df_main['Volume']
df_main['Log_Dollar_Volume'] = np.log(dollar_volume)
df_main['Log_Dollar_Volume_ma05'] = (
    df_main.groupby('ticker')['Log_Dollar_Volume']
    .transform(lambda s: s.rolling(window=5).mean())
)

In [278]:
# Display df_main to inspect the columns built so far
df_main

Unnamed: 0,Time,Open,High,Low,Last,Change,%Chg,Volume,ticker,High-Low,...,Change_Last,Change_Volume,Change_High-Low,Change_Last-Open,Change_Roll_SD_Last_10,Change_Roll_SDSD_Last_10,Change_Roll_SD_Volume_10,Change_Roll_SDSD_Volume_10,Log_Dollar_Volume,Log_Dollar_Volume_ma05
0,2017-01-03,47.28,47.3400,46.13,46.300,-0.390,-0.84,6737700.0,AAL,1.2100,...,,,,,,,,,19.568311,
1,2017-01-04,46.63,47.4400,46.35,46.700,0.400,0.86,5859600.0,AAL,1.0900,...,0.400,-878100.0,-0.1200,-1.050,,,,,19.429048,
2,2017-01-05,46.52,46.9300,45.61,45.890,-0.810,-1.73,6825300.0,AAL,1.3200,...,-0.810,965700.0,0.2300,0.700,,,,,19.569938,
3,2017-01-06,45.85,46.7200,45.47,46.210,0.320,0.70,7260100.0,AAL,1.2500,...,0.320,434800.0,-0.0700,-0.990,,,,,19.627903,
4,2017-01-09,46.01,47.3400,45.78,47.080,0.870,1.88,4739100.0,AAL,1.5600,...,0.870,-2521000.0,0.3100,-0.710,,,,,19.211939,19.481428
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23259,2025-05-23,73.83,75.4150,73.77,74.650,-1.350,-1.78,4806500.0,UAL,1.6450,...,-1.350,-3487000.0,-1.8200,2.070,0.177687,-0.041326,377671.289594,-4763.069495,19.695154,20.086270
23260,2025-05-27,76.02,79.4900,75.90,78.180,3.530,4.73,7898200.0,UAL,3.5900,...,3.530,3091700.0,1.9450,-1.340,-0.020037,-0.078567,-498335.077964,14161.633804,20.231100,20.131727
23261,2025-05-28,77.87,78.4100,77.04,77.500,-0.680,-0.87,3911800.0,UAL,1.3700,...,-0.680,-3986400.0,-2.2200,2.530,-0.772227,-0.179446,193546.373682,-25006.832360,19.532428,19.986758
23262,2025-05-29,79.04,80.7600,77.01,78.570,1.070,1.38,10090000.0,UAL,3.7500,...,1.070,6178200.0,2.3800,0.100,-0.052266,-0.163724,178541.112008,-25352.166768,20.494539,20.039353


In [279]:
# Raw price fields and intermediate features that are not carried forward
cols_to_drop = [
    'Open', 'High', 'Low', '%Chg', 'High-Low', 'High-Low%', 'Last-Open', 'Last-Open%',
    'Roll_SD_Last_10', 'Roll_SDSD_Last_10', 'Roll_SD_Volume_10', 'Change', 'Roll_SDSD_Volume_10',
]
df_main = df_main.drop(columns=cols_to_drop)

In [280]:
# Rolling-mean window lengths (in rows per ticker) and the columns to smooth.
# The list is named ma_vars so it does not shadow the builtin vars().
windows = [4, 8, 16, 26]
ma_vars = ['Last', 'Log_Volume', 'Volume', 'Change_Last', 'Change_Volume',
           'Change_High-Low', 'Change_Last-Open', 'Change_Roll_SD_Last_10',
           'Change_Roll_SDSD_Last_10', 'Change_Roll_SD_Volume_10',
           'Change_Roll_SDSD_Volume_10']

# Moving averages: one '<var>_maWW' column per (window, variable) pair, computed per ticker.
# The window is bound as a lambda default so each function keeps its own value.
by_ticker = df_main.groupby('ticker')
ma_columns = {
    f'{var}_ma{window:02d}': by_ticker[var].transform(lambda s, w=window: s.rolling(window=w).mean())
    for window in windows
    for var in ma_vars
}

# Attach all new columns with one concat instead of inserting them one at a time,
# which fragments the frame. Dropping any existing copies first keeps the cell re-runnable.
df_main = pd.concat(
    [df_main.drop(columns=list(ma_columns), errors='ignore'), pd.DataFrame(ma_columns)],
    axis=1,
)

In [281]:
# Calculate lags (per ticker).
# Columns that only get a one-row lag ('<var>_lag01').
single_lag_vars = [
    'Change_High-Low', 'Change_High-Low_ma04', 'Change_High-Low_ma08', 'Change_High-Low_ma16', 'Change_High-Low_ma26',
    'Change_Last-Open', 'Change_Last-Open_ma04', 'Change_Last-Open_ma08', 'Change_Last-Open_ma16', 'Change_Last-Open_ma26',
    'Change_Last_ma04', 'Change_Last_ma08', 'Change_Last_ma16', 'Change_Last_ma26',
    'Change_Roll_SDSD_Last_10', 'Change_Roll_SDSD_Last_10_ma04', 'Change_Roll_SDSD_Last_10_ma08', 'Change_Roll_SDSD_Last_10_ma16', 'Change_Roll_SDSD_Last_10_ma26',
    'Change_Roll_SDSD_Volume_10', 'Change_Roll_SDSD_Volume_10_ma04', 'Change_Roll_SDSD_Volume_10_ma08', 'Change_Roll_SDSD_Volume_10_ma16', 'Change_Roll_SDSD_Volume_10_ma26',
    'Change_Roll_SD_Last_10', 'Change_Roll_SD_Last_10_ma04', 'Change_Roll_SD_Last_10_ma08', 'Change_Roll_SD_Last_10_ma16', 'Change_Roll_SD_Last_10_ma26',
    'Change_Roll_SD_Volume_10', 'Change_Roll_SD_Volume_10_ma04', 'Change_Roll_SD_Volume_10_ma08', 'Change_Roll_SD_Volume_10_ma16', 'Change_Roll_SD_Volume_10_ma26',
    'Change_Volume_ma04', 'Change_Volume_ma08', 'Change_Volume_ma16', 'Change_Volume_ma26',
    'Last_ma04', 'Last_ma08', 'Last_ma16', 'Last_ma26',
    'Volume_ma04', 'Volume_ma08', 'Volume_ma16', 'Volume_ma26',
    'Log_Dollar_Volume_ma05',
]
# Columns that get every lag from 1 to max_lag rows ('<var>_lag01' ... '<var>_lag26').
multi_lag_vars = ['Last', 'Log_Volume', 'Volume', 'Change_Last', 'Change_Volume', 'Log_Dollar_Volume']
max_lag = 26

by_ticker = df_main.groupby('ticker')
lag_columns = {f'{var}_lag01': by_ticker[var].shift(1) for var in single_lag_vars}
for lag in range(1, max_lag + 1):
    for var in multi_lag_vars:
        lag_columns[f'{var}_lag{lag:02d}'] = by_ticker[var].shift(lag)

# Attach all lag columns with one concat instead of ~200 single-column insertions,
# which fragment the frame. Dropping any existing copies first keeps the cell re-runnable.
df_main = pd.concat(
    [df_main.drop(columns=list(lag_columns), errors='ignore'), pd.DataFrame(lag_columns)],
    axis=1,
)

In [282]:
# Shock = current log dollar volume minus the previous row's 5-row moving average
df_main['Log_Dollar_Volume_shock'] = df_main['Log_Dollar_Volume'].sub(
    df_main['Log_Dollar_Volume_ma05_lag01']
)

In [283]:
# Reshape to wide format: one row per 'Time', one (feature, ticker) column per value column.
# Every column other than 'Time' and 'ticker' is pivoted.
to_reshape = df_main.columns.difference(['Time', 'ticker'])
df_wide = (
    df_main
    .pivot_table(index='Time', columns='ticker', values=to_reshape, aggfunc='first')
    .ffill()  # carry the last available value forward over missing entries
)

In [284]:
# Collapse the (feature, ticker) column pairs into single 'TICKER_feature' names
df_wide.columns = [f'{symbol}_{feature}' for feature, symbol in df_wide.columns]
# Keep only rows dated 2018-01-01 or later
sample_start = datetime.datetime(2018, 1, 1)
df_wide = df_wide.loc[df_wide.index >= sample_start]

In [285]:
# Put the columns in alphabetical order
ordered_columns = sorted(df_wide.columns)
df_wide = df_wide.reindex(columns=ordered_columns)

In [286]:
# Calendar features taken from the date index: weekday number and month number
df_wide = df_wide.assign(
    day_of_week=df_wide.index.dayofweek,
    month_of_year=df_wide.index.month,
)

# One-hot encode both calendar features, keeping every level
calendar_cols = ['day_of_week', 'month_of_year']
df_wide = pd.get_dummies(df_wide, columns=calendar_cols, drop_first=False)

In [287]:
# Persist the wide-format frame as a pickle file
output_path = r"../../data/processed/stock_data_daily.pkl"
with open(output_path, mode='wb') as pickle_file:
    pickle.dump(df_wide, pickle_file)

In [288]:
# Long-format frame: keep only rows with no missing values
df_main = df_main.dropna()

# Calendar features taken from 'Time': weekday number and month number
df_main = df_main.assign(
    day_of_week=df_main['Time'].dt.dayofweek,
    month_of_year=df_main['Time'].dt.month,
)

# One-hot encode both calendar features, keeping every level
df_main = pd.get_dummies(df_main, columns=['day_of_week', 'month_of_year'], drop_first=False)
df_main = df_main.sort_values(by=['Time', 'ticker'])

# Remove the BNO, JETS, IYT and ITA rows from the long frame
is_excluded = df_main['ticker'].isin(['BNO', 'JETS', 'IYT', 'ITA'])
df_main = df_main[~is_excluded]

In [289]:
# Keep only the wide-format columns that belong to BNO, JETS, IYT or ITA.
# Wide columns are named 'TICKER_feature', so match on the 'TICKER_' prefix; testing for the
# ticker anywhere in the name would also pick up unrelated columns that contain those letters.
context_prefixes = tuple(f'{symbol}_' for symbol in ['BNO', 'JETS', 'IYT', 'ITA'])
df_wide = df_wide[[col for col in df_wide.columns if col.startswith(context_prefixes)]]

In [290]:
# Join the columns of df_wide onto every long-format row with the same 'Time',
# keeping only dates present in both frames, then order by date and ticker
df_main = (
    pd.merge(df_main, df_wide, on='Time', how='inner')
    .sort_values(by=['Time', 'ticker'])
)
# Label each row by its ('Time', 'ticker') pair
df_main.index = df_main[['Time','ticker']]
# Store the timestamp under 'date' (appended as the last column) and remove 'Time'
df_main['date'] = df_main['Time']
df_main = df_main.drop(columns=['Time'])

In [291]:
# Display df_main to inspect the merged long-format result
df_main

Unnamed: 0,Last,Volume,ticker,Log_Volume,Change_Last,Change_Volume,Change_High-Low,Change_Last-Open,Change_Roll_SD_Last_10,Change_Roll_SDSD_Last_10,...,JETS_Volume_lag26,JETS_Volume_ma04,JETS_Volume_ma04_lag01,JETS_Volume_ma08,JETS_Volume_ma08_lag01,JETS_Volume_ma16,JETS_Volume_ma16_lag01,JETS_Volume_ma26,JETS_Volume_ma26_lag01,date
"(2018-01-02 00:00:00, AAL)",52.990,4084700.0,AAL,15.222759,0.960,1387700.0,0.3900,-1.050,-0.063979,0.010258,...,15800.0,139675.0,124125.0,78775.0,70650.0,49393.75,45550.0,4.627308e+04,4.392692e+04,2018-01-02
"(2018-01-02 00:00:00, ALGT)",154.050,171500.0,ALGT,12.052339,-0.700,80300.0,5.2200,4.100,-0.561770,-0.162288,...,15800.0,139675.0,124125.0,78775.0,70650.0,49393.75,45550.0,4.627308e+04,4.392692e+04,2018-01-02
"(2018-01-02 00:00:00, ALK)",75.010,1738000.0,ALK,14.368246,1.500,699400.0,-0.0800,-2.130,-0.541742,0.065670,...,15800.0,139675.0,124125.0,78775.0,70650.0,49393.75,45550.0,4.627308e+04,4.392692e+04,2018-01-02
"(2018-01-02 00:00:00, DAL)",56.740,5297600.0,DAL,15.482764,0.740,2177500.0,0.1700,-0.980,0.084709,0.043902,...,15800.0,139675.0,124125.0,78775.0,70650.0,49393.75,45550.0,4.627308e+04,4.392692e+04,2018-01-02
"(2018-01-02 00:00:00, JBLU)",22.380,3414600.0,JBLU,15.043571,0.040,285800.0,0.0300,-0.180,-0.112106,-0.001107,...,15800.0,139675.0,124125.0,78775.0,70650.0,49393.75,45550.0,4.627308e+04,4.392692e+04,2018-01-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"(2025-05-30 00:00:00, ALK)",50.930,3024500.0,ALK,14.922256,-1.100,-1241100.0,-0.3700,0.310,-0.021026,-0.138338,...,2275500.0,3504625.0,2637275.0,2844075.0,2409350.0,2792918.75,2581712.5,2.336377e+06,2.241846e+06,2025-05-30
"(2025-05-30 00:00:00, DAL)",48.390,9190700.0,DAL,16.033703,-0.110,127400.0,-0.3100,-0.820,-0.013260,-0.096059,...,2275500.0,3504625.0,2637275.0,2844075.0,2409350.0,2792918.75,2581712.5,2.336377e+06,2.241846e+06,2025-05-30
"(2025-05-30 00:00:00, JBLU)",5.050,35732500.0,JBLU,17.391571,-0.010,-5765100.0,-0.2204,-0.290,-0.005736,-0.001971,...,2275500.0,3504625.0,2637275.0,2844075.0,2409350.0,2792918.75,2581712.5,2.336377e+06,2.241846e+06,2025-05-30
"(2025-05-30 00:00:00, LUV)",33.380,19814500.0,LUV,16.801925,-0.330,6468900.0,-0.3489,0.060,0.051444,0.010843,...,2275500.0,3504625.0,2637275.0,2844075.0,2409350.0,2792918.75,2581712.5,2.336377e+06,2.241846e+06,2025-05-30


In [292]:
# Persist the long-format frame as a pickle file
output_path = r"../../data/processed/stock_data_daily_long.pkl"
with open(output_path, mode='wb') as pickle_file:
    pickle.dump(df_main, pickle_file)