In [26]:
import pandas as pd
import numpy as np
import os, pickle
import warnings
import datetime
warnings.filterwarnings('ignore')

In [27]:
# Folder (relative to this notebook) that is searched for the raw daily price CSVs.
path_to_data = "../../data/raw/daily stock data"

In [28]:
# Recursively collect the full path of every '.csv' file found under path_to_data.
csv_files = [
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk(path_to_data)
    for name in names
    if name.endswith('.csv')
]

In [29]:
# Read every CSV, tag its rows with the ticker encoded in the file name (the text
# before the first underscore, upper-cased), and stack everything into one long frame.
if not csv_files:
    # Fail with a clear message instead of a NameError on df_main further down.
    raise FileNotFoundError(f"No .csv files found under {path_to_data!r}")

frames = []
for file in csv_files:
    # Parse the ticker from the base name only, using the OS-aware path splitter, so
    # that '/' separators or underscores in directory names cannot corrupt it.
    ticker = os.path.basename(file).split('_')[0].upper()
    frames.append(pd.read_csv(file).assign(ticker=ticker))

# One concat at the end avoids re-copying the accumulated frame once per file.
df_main = pd.concat(frames, ignore_index=True)
df_main['ticker'].value_counts()

ticker
AAL     2115
ALGT    2115
ALK     2115
BNO     2115
DAL     2115
ITA     2115
IYT     2115
JBLU    2115
JETS    2115
LUV     2115
UAL     2115
Name: count, dtype: int64

In [30]:
# Remove exact duplicate rows, then discard any row that still contains a missing value.
df_main = df_main.drop_duplicates().dropna()

In [31]:
# Parse 'Time' (YYYY-MM-DD text) into datetimes and turn '%Chg' from
# percent-suffixed text into plain floats.
df_main = df_main.assign(**{
    'Time': pd.to_datetime(df_main['Time'], format='%Y-%m-%d'),
    '%Chg': df_main['%Chg'].str.replace('%', '').astype(float),
})

In [None]:
# Intraday range / open-vs-last spread features.
# NOTE(review): 'Last-Open' and 'Last-Open%' are computed as Open relative to Last, and
# 'High-Low%' as High relative to Open, which does not match what the column names
# suggest -- confirm the intended direction before relying on the sign.
df_main['High-Low'] = df_main['High'].sub(df_main['Low'])
df_main['High-Low%'] = df_main['High'].div(df_main['Open']).sub(1)
df_main['Last-Open'] = df_main['Open'].sub(df_main['Last'])
df_main['Last-Open%'] = df_main['Open'].div(df_main['Last']).sub(1)

df_main['Log_Volume'] = np.log(df_main['Volume'])

# Order rows by ticker and date so the rolling windows and diffs below run chronologically.
df_main = df_main.sort_values(by=['ticker', 'Time'])

# 10-row rolling standard deviation per ticker ('Roll_SD_*'), followed by the 10-row
# rolling standard deviation of that series ('Roll_SDSD_*').
for base in ('Last', 'Volume'):
    sd_name = f'Roll_SD_{base}_10'
    sdsd_name = f'Roll_SDSD_{base}_10'
    for target, source in ((sd_name, base), (sdsd_name, sd_name)):
        df_main[target] = (
            df_main.groupby('ticker')[source]
            .rolling(window=10)
            .std()
            # Drop the ticker level added by groupby so values align on the row index.
            .reset_index(level=0, drop=True)
        )

# Row-over-row change of each series, computed within each ticker.
diff_sources = [
    'Last', 'Volume', 'High-Low', 'Last-Open',
    'Roll_SD_Last_10', 'Roll_SDSD_Last_10',
    'Roll_SD_Volume_10', 'Roll_SDSD_Volume_10',
]
for col in diff_sources:
    df_main[f'Change_{col}'] = df_main.groupby('ticker')[col].diff()

In [None]:
# Log dollar-volume shock (Goyenko et al. 2025): log dollar volume minus its 5-row
# per-ticker moving average (the window includes the current row).
# Dollar volume is approximated as the mean of Open/High/Low/Last times Volume.
mean_price = df_main[['Open', 'High', 'Low', 'Last']].mean(axis=1)
df_main['Log_Dollar_Volume'] = np.log(mean_price * df_main['Volume'])

log_dv_ma05 = df_main.groupby('ticker')['Log_Dollar_Volume'].transform(
    lambda s: s.rolling(window=5).mean()
)
df_main['Log_Dollar_Volume_shock'] = df_main['Log_Dollar_Volume'] - log_dv_ma05

In [33]:
# Remove the raw price fields and the intermediate level features; the 'Change_*'
# series derived from them stay in the frame.
no_longer_needed = [
    'Open', 'High', 'Low', '%Chg',
    'High-Low', 'High-Low%', 'Last-Open', 'Last-Open%',
    'Roll_SD_Last_10', 'Roll_SDSD_Last_10',
    'Roll_SD_Volume_10', 'Change', 'Roll_SDSD_Volume_10',
]
df_main = df_main.drop(columns=no_longer_needed)

In [34]:
# Moving-average window lengths (in rows per ticker) and the series they are applied to.
# The list is named ma_vars rather than vars so the builtin vars() is not shadowed.
windows = [4, 8, 16, 26]
ma_vars = ['Last', 'Log_Volume', 'Volume', 'Change_Last', 'Change_Volume',
           'Change_High-Low', 'Change_Last-Open', 'Change_Roll_SD_Last_10',
           'Change_Roll_SDSD_Last_10', 'Change_Roll_SD_Volume_10',
           'Change_Roll_SDSD_Volume_10']

# Moving averages: '<var>_maNN' is the NN-row rolling mean of <var> within each ticker.
# All columns are computed first and attached with a single concat, because inserting
# them one at a time fragments the DataFrame's internal blocks.
by_ticker = df_main.groupby('ticker')
ma_cols = {
    f'{var}_ma{window:02d}': by_ticker[var].transform(lambda x: x.rolling(window=window).mean())
    for window in windows
    for var in ma_vars
}
df_main = pd.concat(
    # Dropping same-named columns first keeps the cell safe to re-run (no duplicates).
    [df_main.drop(columns=list(ma_cols), errors='ignore'), pd.DataFrame(ma_cols)],
    axis=1,
)

In [None]:
# Calculate lags (all shifts are done within each ticker).
# Series that only get a one-row lag ('<var>_lag01').
single_lag_vars = [
    'Change_High-Low', 'Change_High-Low_ma04', 'Change_High-Low_ma08', 'Change_High-Low_ma16', 'Change_High-Low_ma26',
    'Change_Last-Open', 'Change_Last-Open_ma04', 'Change_Last-Open_ma08', 'Change_Last-Open_ma16', 'Change_Last-Open_ma26',
    'Change_Last_ma04', 'Change_Last_ma08', 'Change_Last_ma16', 'Change_Last_ma26',
    'Change_Roll_SDSD_Last_10', 'Change_Roll_SDSD_Last_10_ma04', 'Change_Roll_SDSD_Last_10_ma08', 'Change_Roll_SDSD_Last_10_ma16', 'Change_Roll_SDSD_Last_10_ma26',
    'Change_Roll_SDSD_Volume_10', 'Change_Roll_SDSD_Volume_10_ma04', 'Change_Roll_SDSD_Volume_10_ma08', 'Change_Roll_SDSD_Volume_10_ma16', 'Change_Roll_SDSD_Volume_10_ma26',
    'Change_Roll_SD_Last_10', 'Change_Roll_SD_Last_10_ma04', 'Change_Roll_SD_Last_10_ma08', 'Change_Roll_SD_Last_10_ma16', 'Change_Roll_SD_Last_10_ma26',
    'Change_Roll_SD_Volume_10', 'Change_Roll_SD_Volume_10_ma04', 'Change_Roll_SD_Volume_10_ma08', 'Change_Roll_SD_Volume_10_ma16', 'Change_Roll_SD_Volume_10_ma26',
    'Change_Volume_ma04', 'Change_Volume_ma08', 'Change_Volume_ma16', 'Change_Volume_ma26',
    'Last_ma04', 'Last_ma08', 'Last_ma16', 'Last_ma26',
    'Volume_ma04', 'Volume_ma08', 'Volume_ma16', 'Volume_ma26',
]
# Series that get every lag from 1 to 26 rows ('<var>_lag01' ... '<var>_lag26').
multi_lag_vars = ['Last', 'Log_Volume', 'Volume', 'Change_Last', 'Change_Volume', 'Log_Dollar_Volume']

by_ticker = df_main.groupby('ticker')

# Collect every lagged column first, then attach them with a single concat; inserting
# ~200 columns one at a time fragments the DataFrame's internal blocks.
lag_cols = {f'{var}_lag01': by_ticker[var].shift(1) for var in single_lag_vars}
for lag in range(1, 27):
    for var in multi_lag_vars:
        lag_cols[f'{var}_lag{lag:02d}'] = by_ticker[var].shift(lag)

df_main = pd.concat(
    # Dropping same-named columns first keeps the cell safe to re-run (no duplicates).
    [df_main.drop(columns=list(lag_cols), errors='ignore'), pd.DataFrame(lag_cols)],
    axis=1,
)

In [36]:
# Pivot from long to wide: one row per 'Time', with a (variable, ticker) column
# MultiIndex built from every column except the two pivot keys. When a (Time, ticker)
# pair has several rows the first value is taken. Remaining gaps are then filled with
# the most recent earlier value in the same column.
to_reshape = df_main.columns.difference(['Time', 'ticker'])
df_wide = df_main.pivot_table(
    values=to_reshape,
    index='Time',
    columns='ticker',
    aggfunc='first',
).ffill()

In [37]:
# Collapse the (variable, ticker) column MultiIndex into flat 'TICKER_variable' labels.
flat_names = [f'{ticker}_{variable}' for variable, ticker in df_wide.columns]
df_wide.columns = flat_names

# Keep only rows dated 2018-01-01 or later.
start_date = datetime.datetime(2018, 1, 1, 0, 0)
df_wide = df_wide.loc[df_wide.index >= start_date]

In [38]:
# Put the columns in alphabetical order so each ticker's features sit next to each other.
ordered_columns = sorted(df_wide.columns)
df_wide = df_wide.loc[:, ordered_columns]

In [39]:
# Calendar features taken from the date index: day of week and month of year.
calendar_cols = {
    'day_of_week': df_wide.index.dayofweek,
    'month_of_year': df_wide.index.month,
}

# Expand both into indicator columns, one per distinct value, keeping every level.
df_wide = pd.get_dummies(
    df_wide.assign(**calendar_cols),
    columns=['day_of_week', 'month_of_year'],
    drop_first=False,
)

In [40]:
# Show the final wide feature table (indexed by date) as this cell's output.
df_wide

Unnamed: 0_level_0,AAL_Change_High-Low,AAL_Change_High-Low_lag01,AAL_Change_High-Low_ma04,AAL_Change_High-Low_ma04_lag01,AAL_Change_High-Low_ma08,AAL_Change_High-Low_ma08_lag01,AAL_Change_High-Low_ma16,AAL_Change_High-Low_ma16_lag01,AAL_Change_High-Low_ma26,AAL_Change_High-Low_ma26_lag01,...,month_of_year_3,month_of_year_4,month_of_year_5,month_of_year_6,month_of_year_7,month_of_year_8,month_of_year_9,month_of_year_10,month_of_year_11,month_of_year_12
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-01-02,0.3900,0.4000,0.078750,0.087500,0.083750,0.050000,-0.026250,0.011875,0.019231,-0.021923,...,False,False,False,False,False,False,False,False,False,False
2018-01-03,-0.3950,0.3900,-0.003750,0.078750,-0.015625,0.083750,-0.003437,-0.026250,0.015192,0.019231,...,False,False,False,False,False,False,False,False,False,False
2018-01-04,1.3300,-0.3950,0.431250,-0.003750,0.091875,-0.015625,0.081562,-0.003437,0.061731,0.015192,...,False,False,False,False,False,False,False,False,False,False
2018-01-05,-1.7250,1.3300,-0.100000,0.431250,-0.006250,0.091875,-0.046250,0.081562,-0.031154,0.061731,...,False,False,False,False,False,False,False,False,False,False
2018-01-08,0.3000,-1.7250,-0.122500,-0.100000,-0.021875,-0.006250,-0.011875,-0.046250,-0.068846,-0.031154,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-05-23,-0.1167,0.0067,-0.041250,0.016675,-0.048125,-0.029162,0.001250,-0.008331,-0.016538,-0.000319,...,False,False,True,False,False,False,False,False,False,False
2025-05-27,0.1700,-0.1167,0.008750,-0.041250,-0.024375,-0.048125,-0.000625,0.001250,0.006154,-0.016538,...,False,False,True,False,False,False,False,False,False,False
2025-05-28,-0.0800,0.1700,-0.005000,0.008750,-0.006875,-0.024375,-0.014375,-0.000625,-0.000385,0.006154,...,False,False,True,False,False,False,False,False,False,False
2025-05-29,0.0100,-0.0800,-0.004175,-0.005000,0.006250,-0.006875,0.009375,-0.014375,0.002692,-0.000385,...,False,False,True,False,False,False,False,False,False,False


In [41]:
# Save the processed DataFrame to a pickle file
output_path = r"../../data/processed/stock_data_daily.pkl"
# Create the target folder if it is missing so the write does not fail with
# FileNotFoundError on a checkout where data/processed does not exist yet.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as f:
    pickle.dump(df_wide, f)